daa3662da0ee809422db12f0ff58c7975b0b8be7
[linux-2.6-microblaze.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Result of a neighbour reachability check used when scoring routes:
 * negative values are failures of decreasing severity; only
 * RT6_NUD_SUCCEED allows the route to be selected unconditionally.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* never use this route */
	RT6_NUD_FAIL_PROBE = -2,	/* unusable, but worth probing */
	RT6_NUD_FAIL_DO_RR = -1,	/* unusable; trigger round-robin */
	RT6_NUD_SUCCEED = 1
};
80
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void             ip6_dst_destroy(struct dst_entry *);
86 static void             ip6_dst_ifdown(struct dst_entry *,
87                                        struct net_device *dev, int how);
88 static int               ip6_dst_gc(struct dst_ops *ops);
89
90 static int              ip6_pkt_discard(struct sk_buff *skb);
91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96                                            struct sk_buff *skb, u32 mtu);
97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98                                         struct sk_buff *skb);
99 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
100 static size_t rt6_nlmsg_size(struct fib6_info *rt);
101 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
102                          struct fib6_info *rt, struct dst_entry *dst,
103                          struct in6_addr *dest, struct in6_addr *src,
104                          int iif, int type, u32 portid, u32 seq,
105                          unsigned int flags);
106 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
107                                            struct in6_addr *daddr,
108                                            struct in6_addr *saddr);
109
110 #ifdef CONFIG_IPV6_ROUTE_INFO
111 static struct fib6_info *rt6_add_route_info(struct net *net,
112                                            const struct in6_addr *prefix, int prefixlen,
113                                            const struct in6_addr *gwaddr,
114                                            struct net_device *dev,
115                                            unsigned int pref);
116 static struct fib6_info *rt6_get_route_info(struct net *net,
117                                            const struct in6_addr *prefix, int prefixlen,
118                                            const struct in6_addr *gwaddr,
119                                            struct net_device *dev);
120 #endif
121
/* Per-cpu list of rt6_info dsts that are not linked into the FIB tree
 * (e.g. per-flow cached clones); tracked so they can be retargeted when
 * their device disappears.  See rt6_uncached_list_add/_del/_flush_dev.
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
128
/* Add @rt to the current CPU's uncached list.  The list pointer is
 * stashed in the route so rt6_uncached_list_del() can find the right
 * list even if it runs on a different CPU.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	/* BH-safe: the lists are also manipulated from softirq context */
	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
139
/* Remove @rt from the uncached list it was added to (if any) and drop
 * it from the per-netns uncached-route statistics.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	/* an empty list_head means the route was never added (it is
	 * initialized in rt6_info_init()) or was already removed
	 */
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
152
/* @dev in @net is going away: walk every CPU's uncached list and
 * retarget routes that still reference it (via rt6i_idev or dst.dev)
 * to the loopback device, so the dsts stay valid until their refcounts
 * drop to zero.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* loopback itself never goes away while the netns is alive */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				/* take the new ref before dropping the old */
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
184
185 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
186                                              struct sk_buff *skb,
187                                              const void *daddr)
188 {
189         if (!ipv6_addr_any(p))
190                 return (const void *) p;
191         else if (skb)
192                 return &ipv6_hdr(skb)->daddr;
193         return daddr;
194 }
195
196 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
197                                    struct net_device *dev,
198                                    struct sk_buff *skb,
199                                    const void *daddr)
200 {
201         struct neighbour *n;
202
203         daddr = choose_neigh_daddr(gw, skb, daddr);
204         n = __ipv6_neigh_lookup(dev, daddr);
205         if (n)
206                 return n;
207         return neigh_create(&nd_tbl, daddr, dev);
208 }
209
/* dst_ops.neigh_lookup: recover the rt6_info embedding @dst and resolve
 * the neighbour via its cached gateway address.
 */
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}
218
219 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
220 {
221         struct net_device *dev = dst->dev;
222         struct rt6_info *rt = (struct rt6_info *)dst;
223
224         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
225         if (!daddr)
226                 return;
227         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
228                 return;
229         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
230                 return;
231         __ipv6_confirm_neigh(dev, daddr);
232 }
233
/* Template for the per-netns IPv6 dst_ops; copied into each struct net
 * at namespace init time.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
252
253 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
254 {
255         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
256
257         return mtu ? : dst->dev->mtu;
258 }
259
/* Blackhole dsts deliberately ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
264
/* Blackhole dsts deliberately ignore ICMPv6 redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
269
/* dst_ops for blackholed routes (e.g. created by xfrm for sockets whose
 * route was invalidated): no GC, no PMTU/redirect processing.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
281
/* Metrics shared by the special route templates below; hop limit 0
 * means "use the per-device default".
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
285
/* Template for the per-netns fib6_null_entry: the "no route" sentinel
 * returned by FIB lookups that match nothing (worst possible metric,
 * RTN_UNREACHABLE).
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
294
/* Template for the per-netns ip6_null_entry dst: packets hitting it are
 * discarded with ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
306
307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
308
/* Policy-routing "prohibit" sentinel: packets are rejected with EACCES
 * (ICMPv6 administratively prohibited).
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
320
/* Policy-routing "blackhole" sentinel: packets are silently dropped
 * (no ICMP error is generated).
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
332
333 #endif
334
/* Initialize the rt6_info-specific part of a freshly allocated dst. */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* zero everything AFTER the embedded dst_entry; dst itself was
	 * already set up by dst_alloc()
	 */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
342
343 /* allocate dst with ip6_dst_ops */
344 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
345                                int flags)
346 {
347         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
348                                         1, DST_OBSOLETE_FORCE_CHK, flags);
349
350         if (rt) {
351                 rt6_info_init(rt);
352                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
353         }
354
355         return rt;
356 }
357 EXPORT_SYMBOL(ip6_dst_alloc);
358
/* dst_ops.destroy: release everything a cached rt6_info holds — the
 * metrics, its slot on the uncached list, the inet6_dev reference, and
 * the reference on the fib6_info it was cloned from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* rt->from is RCU-protected: clear it before dropping the ref so
	 * concurrent readers either see the old fib6_info or NULL
	 */
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
380
/* dst_ops.ifdown: @dev is going down; move the route's inet6_dev
 * reference over to the loopback device so the dst remains usable.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
397
398 static bool __rt6_check_expired(const struct rt6_info *rt)
399 {
400         if (rt->rt6i_flags & RTF_EXPIRES)
401                 return time_after(jiffies, rt->dst.expires);
402         else
403                 return false;
404 }
405
/* Like __rt6_check_expired(), but for routes without their own expiry
 * also considers the fib6_info the dst was cloned from: an obsolete dst
 * or an expired parent counts as expired.
 * Caller must hold rcu_read_lock (rt->from is RCU-protected).
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
421
/* Pick one nexthop of an ECMP route set based on the L4 flow hash.
 * @match is the first route of the set; returns it or the sibling whose
 * hash range covers fl6->mp_hash.  Called under rcu_read_lock.
 */
static struct fib6_info *rt6_multipath_select(const struct net *net,
					      struct fib6_info *match,
					      struct flowi6 *fl6, int oif,
					      const struct sk_buff *skb,
					      int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	/* hash space is split into disjoint ranges; the first nexthop
	 * whose upper bound is >= the hash owns the flow
	 */
	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* keep @match if the owning sibling is not usable */
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
454
455 /*
456  *      Route lookup. rcu_read_lock() should be held.
457  */
458
/* Walk the fib6 node's route list starting at @rt and return the first
 * route usable for (@saddr, @oif).  Returns fib6_null_entry when a
 * strict interface match (RT6_LOOKUP_F_IFACE) was requested but none
 * was found.  Called under rcu_read_lock.
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	/* nothing to match on: the first live route wins */
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			/* no oif: match on the source address instead */
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
492
493 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work payload for sending a neighbour solicitation probe;
 * holds a device reference that rt6_probe_deferred() drops.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
499
/* Work handler: send the neighbour solicitation queued by rt6_probe()
 * from process context, then drop the device ref and free the payload.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
511
/* Schedule a router reachability probe (RFC 4191-style) for @rt's
 * gateway when its neighbour entry is absent or no longer valid.
 * The NS itself is sent from a workqueue (we are in BH context here).
 */
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* NOTE(review): __in6_dev_get() can return NULL; idev is
		 * dereferenced below without a check — confirm this cannot
		 * race with device teardown.
		 */
		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		/* rate-limit: at most one probe per rtr_probe_interval */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* no neighbour entry yet: always worth probing */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		/* ref dropped by rt6_probe_deferred() */
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
566 #else
static inline void rt6_probe(struct fib6_info *rt)
{
	/* router reachability probing needs CONFIG_IPV6_ROUTER_PREF */
}
570 #endif
571
572 /*
573  * Default Router Selection (RFC 2461 6.3.6)
574  */
575 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
576 {
577         const struct net_device *dev = rt->fib6_nh.nh_dev;
578
579         if (!oif || dev->ifindex == oif)
580                 return 2;
581         return 0;
582 }
583
/* Classify the reachability of @rt's next hop for route scoring.
 * Non-gatewayed routes always succeed; otherwise the result depends on
 * the neighbour entry's NUD state (and on CONFIG_IPV6_ROUTER_PREF).
 */
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		/* nud_state is protected by the neighbour lock */
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		/* no entry: optimistic when probing is available,
		 * otherwise fall back to round-robin
		 */
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
615
/* Compute a comparable score for @rt: interface match plus (with
 * CONFIG_IPV6_ROUTER_PREF) the decoded router preference.  Returns a
 * negative RT6_NUD_* value when the route must not be used under
 * @strict.
 */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* preference bits sit above the device-match score */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
633
634 /* called with rc_read_lock held */
635 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
636 {
637         const struct net_device *dev = fib6_info_nh_dev(f6i);
638         bool rc = false;
639
640         if (dev) {
641                 const struct inet6_dev *idev = __in6_dev_get(dev);
642
643                 rc = !!idev->cnf.ignore_routes_with_linkdown;
644         }
645
646         return rc;
647 }
648
/* Compare @rt against the current best candidate @match; returns the
 * better of the two and updates *mpri (best score so far) and *do_rr
 * (whether round-robin should rotate).  Dead, expired, or link-down
 * routes (unless link state is ignored) never win.
 */
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
687
/* Scan the node's route list for the best route with fib6_metric ==
 * @metric, starting at the round-robin head @rr_head and wrapping
 * around to @leaf.  If nothing in that metric group matched, continue
 * into the next (higher) metric group (@cont).
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* first pass: from the round-robin head to the end of the group */
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* second pass: wrap around from the list head up to rr_head */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* nothing usable in the preferred metric group: try the rest */
	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
726
/* Select the best route from fib6 node @fn, implementing default
 * router round-robin (fn->rr_ptr) among equally scored candidates.
 * Returns fib6_null_entry when nothing is usable.
 * Called under rcu_read_lock.
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
776
777 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
778 {
779         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
780 }
781
782 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option received in a Router Advertisement
 * (RFC 4191): validate it, then add, refresh, or delete the
 * corresponding route.  Returns 0 on success, -EINVAL on a malformed
 * option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* prefixes longer than 64 bits need two option units */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* prefix_len 0 advertises a default router, not a prefix route */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		/* drop the ref taken by the get/add helpers above */
		fib6_info_release(rt);
	}
	return 0;
}
857
858 /*
859  *      Misc support functions
860  */
861
862 /* called with rcu_lock held */
863 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
864 {
865         struct net_device *dev = rt->fib6_nh.nh_dev;
866
867         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
868                 /* for copies of local routes, dst->dev needs to be the
869                  * device if it is a master device, the master device if
870                  * device is enslaved, and the loopback as the default
871                  */
872                 if (netif_is_l3_slave(dev) &&
873                     !rt6_need_strict(&rt->fib6_dst.addr))
874                         dev = l3mdev_master_dev_rcu(dev);
875                 else if (!netif_is_l3_master(dev))
876                         dev = dev_net(dev)->loopback_dev;
877                 /* last case is netif_is_l3_master(dev) is true in which
878                  * case we want dev returned to be dev
879                  */
880         }
881
882         return dev;
883 }
884
/* Map each RTN_* route type to the dst->error value used when the route
 * rejects traffic; 0 means the type delivers/forwards normally.
 * Indexed by fib6_type via ip6_rt_type_to_error().
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
899
/* Return the dst error code for a fib6_type (see fib6_prop table above);
 * 0 for deliverable route types.
 */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
904
905 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
906 {
907         unsigned short flags = 0;
908
909         if (rt->dst_nocount)
910                 flags |= DST_NOCOUNT;
911         if (rt->dst_nopolicy)
912                 flags |= DST_NOPOLICY;
913         if (rt->dst_host)
914                 flags |= DST_HOST;
915
916         return flags;
917 }
918
919 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
920 {
921         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
922
923         switch (ort->fib6_type) {
924         case RTN_BLACKHOLE:
925                 rt->dst.output = dst_discard_out;
926                 rt->dst.input = dst_discard;
927                 break;
928         case RTN_PROHIBIT:
929                 rt->dst.output = ip6_pkt_prohibit_out;
930                 rt->dst.input = ip6_pkt_prohibit;
931                 break;
932         case RTN_THROW:
933         case RTN_UNREACHABLE:
934         default:
935                 rt->dst.output = ip6_pkt_discard_out;
936                 rt->dst.input = ip6_pkt_discard;
937                 break;
938         }
939 }
940
941 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
942 {
943         rt->dst.flags |= fib6_info_dst_flags(ort);
944
945         if (ort->fib6_flags & RTF_REJECT) {
946                 ip6_rt_init_dst_reject(rt, ort);
947                 return;
948         }
949
950         rt->dst.error = 0;
951         rt->dst.output = ip6_output;
952
953         if (ort->fib6_type == RTN_LOCAL) {
954                 rt->dst.input = ip6_input;
955         } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
956                 rt->dst.input = ip6_mc_input;
957         } else {
958                 rt->dst.input = ip6_forward;
959         }
960
961         if (ort->fib6_nh.nh_lwtstate) {
962                 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
963                 lwtunnel_set_redirect(&rt->dst);
964         }
965
966         rt->dst.lastuse = jiffies;
967 }
968
/* Bind a rt6_info clone to the fib6_info it was created from: take a
 * reference on 'from', publish the back-pointer, and share its metrics.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	fib6_info_hold(from);
	rcu_assign_pointer(rt->from, from);
	/* metrics are shared (read-only) with the origin route */
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		/* mark the dst so destruction drops the metrics refcount */
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}
980
/* Copy routing state from a fib6_info into a freshly allocated rt6_info:
 * dst handlers, destination/source prefixes, gateway, flags and lwtstate.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	/* dev may be NULL; only take an idev reference when it exists */
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}
998
/* Walk back up the fib6 tree from 'fn' looking for the next node that
 * carries route info (RTN_RTINFO), descending into a parent's subtree
 * (source-based lookup) when one exists.  Returns NULL once the table
 * root is reached.  Caller must hold rcu_read_lock().
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		/* hit the top of the table: nothing more to try */
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		/* prefer the parent's subtree unless we just came from it */
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1016
1017 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1018                           bool null_fallback)
1019 {
1020         struct rt6_info *rt = *prt;
1021
1022         if (dst_hold_safe(&rt->dst))
1023                 return true;
1024         if (null_fallback) {
1025                 rt = net->ipv6.ip6_null_entry;
1026                 dst_hold(&rt->dst);
1027         } else {
1028                 rt = NULL;
1029         }
1030         *prt = rt;
1031         return false;
1032 }
1033
1034 /* called with rcu_lock held */
1035 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1036 {
1037         unsigned short flags = fib6_info_dst_flags(rt);
1038         struct net_device *dev = rt->fib6_nh.nh_dev;
1039         struct rt6_info *nrt;
1040
1041         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1042         if (nrt)
1043                 ip6_rt_copy_init(nrt, rt);
1044
1045         return nrt;
1046 }
1047
/* Policy-rule lookup backend for one fib6 table: find the best matching
 * fib6_info for the flow, then return a held rt6_info for it - a cached
 * exception route if one exists, a new RCU clone otherwise, or the
 * (held) null entry on no match / allocation failure.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	/* caller asked to ignore the nexthop's output interface */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		/* spread flows over ECMP siblings when no oif was forced */
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = rt6_multipath_select(net, f6i, fl6,
						   fl6->flowi6_oif, skb, flags);
	}
	/* no usable route at this node: back up the tree and retry */
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			/* fall back to the null entry on OOM */
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}
1102
/* Public entry point: run the policy-rule framework with the plain
 * table-lookup backend above and return the resulting dst.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1109
1110 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1111                             const struct in6_addr *saddr, int oif,
1112                             const struct sk_buff *skb, int strict)
1113 {
1114         struct flowi6 fl6 = {
1115                 .flowi6_oif = oif,
1116                 .daddr = *daddr,
1117         };
1118         struct dst_entry *dst;
1119         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1120
1121         if (saddr) {
1122                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1123                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1124         }
1125
1126         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1127         if (dst->error == 0)
1128                 return (struct rt6_info *) dst;
1129
1130         dst_release(dst);
1131
1132         return NULL;
1133 }
1134 EXPORT_SYMBOL(rt6_lookup);
1135
1136 /* ip6_ins_rt is called with FREE table->tb6_lock.
1137  * It takes new route entry, the addition fails by any reason the
1138  * route is released.
1139  * Caller must hold dst before calling it.
1140  */
1141
1142 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1143                         struct netlink_ext_ack *extack)
1144 {
1145         int err;
1146         struct fib6_table *table;
1147
1148         table = rt->fib6_table;
1149         spin_lock_bh(&table->tb6_lock);
1150         err = fib6_add(&table->tb6_root, rt, info, extack);
1151         spin_unlock_bh(&table->tb6_lock);
1152
1153         return err;
1154 }
1155
/* Insert a route with default netlink info (no portid/seq, just the
 * owning netns).  See __ip6_ins_rt() for locking requirements.
 */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
1162
/* Allocate a host-route (/128) RTF_CACHE clone of 'ort' for the given
 * destination (and, with subtrees, source) address.  Returns NULL on
 * allocation failure.  Caller must hold rcu_read_lock() (uses
 * ip6_rt_get_dev_rcu()).
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	/* clone is always a full host route for this destination */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* daddr matching a non-host origin prefix: treat as anycast */
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1199
/* Allocate a per-cpu RTF_PCPU clone of 'rt'.  Returns NULL on
 * allocation failure.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	/* rcu only needed around the dev lookup used for allocation */
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1216
/* It should be called with rcu_read_lock() acquired.
 * Returns this cpu's cached clone of 'rt' with a reference taken, or
 * NULL if there is none yet or its refcount could not be raised.
 */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	/* no null fallback: caller distinguishes "none" from "dead" */
	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}
1230
/* Create this cpu's cached clone of 'rt' and publish it in the per-cpu
 * slot.  Returns the new (held) clone, or the held null entry on
 * allocation failure.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	/* slot must still be empty: only this cpu fills its own slot */
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
1249
1250 /* exception hash table implementation
1251  */
1252 static DEFINE_SPINLOCK(rt6_exception_lock);
1253
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	/* drop the reference the exception table held on the cached rt */
	dst_release(&rt6_ex->rt6i->dst);
	/* defer freeing until RCU readers traversing the chain are done */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}
1273
/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	/* "oldest" = entry with the earliest last-use stamp */
	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}
1290
/* Hash a (dst, src) address pair into an exception-bucket index.
 * src is only mixed in when subtrees are configured; the seed is
 * randomized once at first use to make bucket placement unpredictable.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1306
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	/* side effect: *bucket is advanced to the hashed bucket, so the
	 * caller can insert/remove in the right chain afterwards
	 */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1339
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 * (lockless twin of __rt6_find_exception_spinlock(); traverses the
 * chain with the _rcu iterator instead of under the spinlock)
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	/* side effect: *bucket is advanced to the hashed bucket */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1374
/* Effective MTU for a fib6_info: the route's own PMTU if set, otherwise
 * the nexthop device's IPv6 MTU, clamped to IP6_MAX_MTU and reduced by
 * any lwtunnel encapsulation headroom.
 */
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		/* NOTE(review): __in6_dev_get() can return NULL for a device
		 * without IPv6 state - presumably callers guarantee the
		 * nexthop dev is IPv6-enabled here; confirm.
		 */
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
1395
/* Insert cached route 'nrt' into the exception table of its origin
 * route 'ort'.  Allocates the bucket array on first use, replaces any
 * existing exception for the same (daddr[, saddr]), evicts the oldest
 * entry when the bucket overflows, and bumps the table sernum so stale
 * cached dsts get invalidated.  Returns 0 or a negative errno.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* ort is being torn down; don't resurrect its bucket list */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any existing exception for this address pair */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1480
/* Drop every cached exception route hanging off 'rt' and mark the
 * route so rt6_insert_exception() cannot recreate the bucket list.
 * Called when the origin route is going away.
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		/* _safe iteration: rt6_remove_exception() unlinks entries */
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1507
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 * Returns the cached clone for (daddr[, saddr]) if present and not
 * expired, NULL otherwise.  No reference is taken.
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	/* expired entries are skipped; GC will reap them later */
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1539
/* Remove the passed in cached rt from the hash table that contains it.
 * Returns 0 on success, -EINVAL if rt is not a cached clone, -ENOENT
 * if no matching exception exists.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	/* NOTE(review): rt->from is read with rcu_dereference() but no
	 * rcu_read_lock() is visible here - presumably callers pin 'from'
	 * some other way; confirm against callers.
	 */
	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1583
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 * (keeps the entry "young" so GC's aging pass does not evict it)
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	/* only cached clones live in an exception table */
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}
1619
1620 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1621 {
1622         struct rt6_exception_bucket *bucket;
1623         struct rt6_exception *rt6_ex;
1624         int i;
1625
1626         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1627                                         lockdep_is_held(&rt6_exception_lock));
1628
1629         if (bucket) {
1630                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1631                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1632                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1633                         }
1634                         bucket++;
1635                 }
1636         }
1637 }
1638
1639 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1640                                          struct rt6_info *rt, int mtu)
1641 {
1642         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1643          * lowest MTU in the path: always allow updating the route PMTU to
1644          * reflect PMTU decreases.
1645          *
1646          * If the new MTU is higher, and the route PMTU is equal to the local
1647          * MTU, this means the old MTU is the lowest in the path, so allow
1648          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1649          * handle this.
1650          */
1651
1652         if (dst_mtu(&rt->dst) >= mtu)
1653                 return true;
1654
1655         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1656                 return true;
1657
1658         return false;
1659 }
1660
/* Propagate a device MTU change to the cached exceptions of 'rt',
 * subject to the rules in rt6_mtu_change_route_allowed().
 * Caller must hold rt6_exception_lock.
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1689
1690 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1691
/* Remove all cached gateway exceptions of 'rt' whose gateway matches
 * the given address (e.g. when the router became unreachable or was
 * replaced by a host route).
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the spinlock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe iteration: entries are unlinked in the loop */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1726
/* GC helper: decide whether one exception entry should be reaped now.
 * Removes it when aged out, expired, or pointing at a gateway whose
 * neighbour is no longer a router; otherwise counts it as remaining
 * work in gc_args->more.  Caller holds rt6_exception_lock and
 * rcu_read_lock_bh (for the neighbour lookup).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		/* gateway's neighbour no longer advertises itself a router:
		 * the cached route via it is stale
		 */
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	/* entry survives this pass: tell GC there is still work left */
	gc_args->more++;
}
1770
/* Walk every exception bucket hanging off @rt and age each cached
 * entry via rt6_age_examine_exception().
 *
 * rcu_read_lock_bh() keeps the neighbour lookups in the examine step
 * valid, while rt6_exception_lock serializes against concurrent
 * insert/remove of exception entries.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Cheap lockless check: nothing to do if no bucket was allocated */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe variant: the examine step may unlink entries */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1801
/* Core IPv6 policy-routing lookup.
 *
 * Under RCU, walk @table for fl6->daddr/saddr, select a route with
 * rt6_select() (considering multipath siblings), and return a dst with
 * a reference held.  The returned dst is, in order of preference:
 * a cached exception route, an uncached RTF_CACHE clone (for the
 * FLOWI_FLAG_KNOWN_NH case), or a per-cpu copy of the fib entry.
 * Falls back to ip6_null_entry when nothing matches.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* When forwarding is disabled we act as a host: prefer routers
	 * that are (probably) reachable.
	 */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;	/* remembered for the REACHABLE-relaxation retry */

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i->fib6_nsiblings)
		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		/* No match at this node: backtrack up the tree, then as a
		 * last resort retry from the start without the REACHABLE
		 * restriction.
		 */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	}

	/*Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* per-cpu route lookup/creation runs with BHs disabled */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1902
1903 static struct rt6_info *ip6_pol_route_input(struct net *net,
1904                                             struct fib6_table *table,
1905                                             struct flowi6 *fl6,
1906                                             const struct sk_buff *skb,
1907                                             int flags)
1908 {
1909         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1910 }
1911
1912 struct dst_entry *ip6_route_input_lookup(struct net *net,
1913                                          struct net_device *dev,
1914                                          struct flowi6 *fl6,
1915                                          const struct sk_buff *skb,
1916                                          int flags)
1917 {
1918         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1919                 flags |= RT6_LOOKUP_F_IFACE;
1920
1921         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1922 }
1923 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1924
1925 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1926                                   struct flow_keys *keys,
1927                                   struct flow_keys *flkeys)
1928 {
1929         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1930         const struct ipv6hdr *key_iph = outer_iph;
1931         struct flow_keys *_flkeys = flkeys;
1932         const struct ipv6hdr *inner_iph;
1933         const struct icmp6hdr *icmph;
1934         struct ipv6hdr _inner_iph;
1935         struct icmp6hdr _icmph;
1936
1937         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1938                 goto out;
1939
1940         icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1941                                    sizeof(_icmph), &_icmph);
1942         if (!icmph)
1943                 goto out;
1944
1945         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1946             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1947             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1948             icmph->icmp6_type != ICMPV6_PARAMPROB)
1949                 goto out;
1950
1951         inner_iph = skb_header_pointer(skb,
1952                                        skb_transport_offset(skb) + sizeof(*icmph),
1953                                        sizeof(_inner_iph), &_inner_iph);
1954         if (!inner_iph)
1955                 goto out;
1956
1957         key_iph = inner_iph;
1958         _flkeys = NULL;
1959 out:
1960         if (_flkeys) {
1961                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1962                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1963                 keys->tags.flow_label = _flkeys->tags.flow_label;
1964                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1965         } else {
1966                 keys->addrs.v6addrs.src = key_iph->saddr;
1967                 keys->addrs.v6addrs.dst = key_iph->daddr;
1968                 keys->tags.flow_label = ip6_flowinfo(key_iph);
1969                 keys->basic.ip_proto = key_iph->nexthdr;
1970         }
1971 }
1972
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	/* Hash policy is a per-netns sysctl:
	 *   0 - L3 only: addresses, flow label and protocol
	 *   1 - L4 five-tuple, reusing pre-dissected @flkeys if supplied
	 */
	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	/* callers consume the hash as a 31-bit value */
	return mhash >> 1;
}
2029
/* Route an incoming packet: build a flowi6 from its IPv6 header and
 * attach the looked-up dst to @skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* collect-metadata tunnels: key the lookup on the tunnel id */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* precompute multipath hash for ICMPv6 so errors follow the
	 * same path as the flow that triggered them
	 */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2059
2060 static struct rt6_info *ip6_pol_route_output(struct net *net,
2061                                              struct fib6_table *table,
2062                                              struct flowi6 *fl6,
2063                                              const struct sk_buff *skb,
2064                                              int flags)
2065 {
2066         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2067 }
2068
2069 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2070                                          struct flowi6 *fl6, int flags)
2071 {
2072         bool any_src;
2073
2074         if (rt6_need_strict(&fl6->daddr)) {
2075                 struct dst_entry *dst;
2076
2077                 dst = l3mdev_link_scope_lookup(net, fl6);
2078                 if (dst)
2079                         return dst;
2080         }
2081
2082         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2083
2084         any_src = ipv6_addr_any(&fl6->saddr);
2085         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2086             (fl6->flowi6_oif && any_src))
2087                 flags |= RT6_LOOKUP_F_IFACE;
2088
2089         if (!any_src)
2090                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2091         else if (sk)
2092                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2093
2094         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2095 }
2096 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2097
/* Build a blackhole clone of @dst_orig: a dst bound to the loopback
 * device whose input/output handlers discard all packets, while the
 * original's metrics, gateway and keys are preserved.  Consumes the
 * caller's reference on @dst_orig; returns the new dst or -ENOMEM.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* swallow everything sent through this dst */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2130
2131 /*
2132  *      Destination cache support functions
2133  */
2134
2135 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2136 {
2137         u32 rt_cookie = 0;
2138
2139         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2140                 return false;
2141
2142         if (fib6_check_expired(f6i))
2143                 return false;
2144
2145         return true;
2146 }
2147
2148 static struct dst_entry *rt6_check(struct rt6_info *rt,
2149                                    struct fib6_info *from,
2150                                    u32 cookie)
2151 {
2152         u32 rt_cookie = 0;
2153
2154         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2155             rt_cookie != cookie)
2156                 return NULL;
2157
2158         if (rt6_check_expired(rt))
2159                 return NULL;
2160
2161         return &rt->dst;
2162 }
2163
2164 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2165                                             struct fib6_info *from,
2166                                             u32 cookie)
2167 {
2168         if (!__rt6_check_expired(rt) &&
2169             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2170             fib6_check(from, cookie))
2171                 return &rt->dst;
2172         else
2173                 return NULL;
2174 }
2175
/* dst_ops->check: revalidate a cached dst on use.  Returns the dst if
 * still valid, NULL if the caller must redo the route lookup.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	/* per-cpu copies and uncached clones are validated against the
	 * fib entry they were made from; everything else goes through
	 * the plain cookie/expiry check
	 */
	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2203
/* dst_ops->negative_advice: the caller reports this route performed
 * badly.  Expired cached exceptions are unlinked from the exception
 * table; non-cache dsts are released so the next use triggers a fresh
 * lookup.  Returns the (possibly NULLed) dst for the caller to store.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2223
/* dst_ops->link_failure: the packet could not be delivered.  Send an
 * ICMPv6 address-unreachable error back and invalidate the route that
 * failed.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			/* cached exception: unlink it, holding the dst
			 * across the removal
			 */
			if (dst_hold_safe(&rt->dst))
				rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			/* for a failing default route, invalidate the fib
			 * node's serial number so derived dsts fail their
			 * next dst_check()
			 */
			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
2250
/* (Re)arm expiry on @rt0.  If the dst did not yet carry its own
 * RTF_EXPIRES state, first inherit the current expiry from the fib
 * entry it was cloned from, then apply @timeout (in jiffies from now).
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2266
2267 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2268 {
2269         struct net *net = dev_net(rt->dst.dev);
2270
2271         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2272         rt->rt6i_flags |= RTF_MODIFIED;
2273         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2274 }
2275
2276 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2277 {
2278         bool from_set;
2279
2280         rcu_read_lock();
2281         from_set = !!rcu_dereference(rt->from);
2282         rcu_read_unlock();
2283
2284         return !(rt->rt6i_flags & RTF_CACHE) &&
2285                 (rt->rt6i_flags & RTF_PCPU || from_set);
2286 }
2287
2288 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2289                                  const struct ipv6hdr *iph, u32 mtu)
2290 {
2291         const struct in6_addr *daddr, *saddr;
2292         struct rt6_info *rt6 = (struct rt6_info *)dst;
2293
2294         if (rt6->rt6i_flags & RTF_LOCAL)
2295                 return;
2296
2297         if (dst_metric_locked(dst, RTAX_MTU))
2298                 return;
2299
2300         if (iph) {
2301                 daddr = &iph->daddr;
2302                 saddr = &iph->saddr;
2303         } else if (sk) {
2304                 daddr = &sk->sk_v6_daddr;
2305                 saddr = &inet6_sk(sk)->saddr;
2306         } else {
2307                 daddr = NULL;
2308                 saddr = NULL;
2309         }
2310         dst_confirm_neigh(dst, daddr);
2311         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2312         if (mtu >= dst_mtu(dst))
2313                 return;
2314
2315         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2316                 rt6_do_update_pmtu(rt6, mtu);
2317                 /* update rt6_ex->stamp for cache */
2318                 if (rt6->rt6i_flags & RTF_CACHE)
2319                         rt6_update_exception_stamp_rt(rt6);
2320         } else if (daddr) {
2321                 struct fib6_info *from;
2322                 struct rt6_info *nrt6;
2323
2324                 rcu_read_lock();
2325                 from = rcu_dereference(rt6->from);
2326                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2327                 if (nrt6) {
2328                         rt6_do_update_pmtu(nrt6, mtu);
2329                         if (rt6_insert_exception(nrt6, from))
2330                                 dst_release_immediate(&nrt6->dst);
2331                 }
2332                 rcu_read_unlock();
2333         }
2334 }
2335
2336 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2337                                struct sk_buff *skb, u32 mtu)
2338 {
2339         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2340 }
2341
2342 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2343                      int oif, u32 mark, kuid_t uid)
2344 {
2345         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2346         struct dst_entry *dst;
2347         struct flowi6 fl6;
2348
2349         memset(&fl6, 0, sizeof(fl6));
2350         fl6.flowi6_oif = oif;
2351         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2352         fl6.daddr = iph->daddr;
2353         fl6.saddr = iph->saddr;
2354         fl6.flowlabel = ip6_flowinfo(iph);
2355         fl6.flowi6_uid = uid;
2356
2357         dst = ip6_route_output(net, NULL, &fl6);
2358         if (!dst->error)
2359                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2360         dst_release(dst);
2361 }
2362 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2363
/* Socket variant of ip6_update_pmtu(): apply the PMTU update using the
 * socket's binding, mark and uid, then refresh the socket's cached dst
 * if the update invalidated it (skipped for v4-mapped destinations and
 * when the socket is owned by user context).
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	/* nothing more to do if the cached dst is absent or still valid */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2382
/* Cache @dst on @sk for the flow @fl6.  The daddr (and, with subtree
 * support, the saddr) pointer is passed to ip6_dst_store() only when
 * the flow address matches the socket's own, so the cache is keyed
 * correctly.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2399
/* Handle redirects */

/* Flow wrapper used for redirect validation: the flow plus the gateway
 * address the redirect message claims to originate from.  Passed as a
 * flowi6 through fib6_rule_lookup() and upcast in __ip6_route_redirect().
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2405
/* Table lookup used when processing an ICMPv6 redirect: find the route
 * currently used for the destination and accept the redirect only if
 * it came from that route's next hop.  @fl6 is really an ip6rd_flowi
 * carrying the claimed gateway.  Returns a held dst.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* no usable route at this node: walk back up the tree */
	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		dst_hold(&ret->dst);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, ret, table, fl6);
	return ret;
};
2485
2486 static struct dst_entry *ip6_route_redirect(struct net *net,
2487                                             const struct flowi6 *fl6,
2488                                             const struct sk_buff *skb,
2489                                             const struct in6_addr *gateway)
2490 {
2491         int flags = RT6_LOOKUP_F_HAS_SADDR;
2492         struct ip6rd_flowi rdfl;
2493
2494         rdfl.fl6 = *fl6;
2495         rdfl.gateway = *gateway;
2496
2497         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2498                                 flags, __ip6_route_redirect);
2499 }
2500
2501 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2502                   kuid_t uid)
2503 {
2504         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2505         struct dst_entry *dst;
2506         struct flowi6 fl6;
2507
2508         memset(&fl6, 0, sizeof(fl6));
2509         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2510         fl6.flowi6_oif = oif;
2511         fl6.flowi6_mark = mark;
2512         fl6.daddr = iph->daddr;
2513         fl6.saddr = iph->saddr;
2514         fl6.flowlabel = ip6_flowinfo(iph);
2515         fl6.flowi6_uid = uid;
2516
2517         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2518         rt6_do_redirect(dst, NULL, skb);
2519         dst_release(dst);
2520 }
2521 EXPORT_SYMBOL_GPL(ip6_redirect);
2522
/* Variant of ip6_redirect() used when the redirect message does not carry
 * a usable copy of the offending packet's header: the destination comes
 * from the redirect (rd_msg) itself and the flow source from the outer
 * IPv6 header.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	/* memset also clears padding so the flow key hashes consistently */
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	/* NOTE(review): saddr is taken from the outer header's daddr, i.e.
	 * the address the redirect was sent to — presumably ours; confirm
	 * against the ndisc caller.
	 */
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
2543
/* Socket-context wrapper for ip6_redirect(): derive netns, bound device,
 * mark and UID from @sk.
 */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2550
/* Compute the default advertised MSS for a route: the path MTU minus the
 * fixed IPv6 + TCP header overhead, clamped below by the
 * ip6_rt_min_advmss sysctl and capped at IPV6_MAXPLEN (see comment below).
 */
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}
2572
2573 static unsigned int ip6_mtu(const struct dst_entry *dst)
2574 {
2575         struct inet6_dev *idev;
2576         unsigned int mtu;
2577
2578         mtu = dst_metric_raw(dst, RTAX_MTU);
2579         if (mtu)
2580                 goto out;
2581
2582         mtu = IPV6_MIN_MTU;
2583
2584         rcu_read_lock();
2585         idev = __in6_dev_get(dst->dev);
2586         if (idev)
2587                 mtu = idev->cnf.mtu6;
2588         rcu_read_unlock();
2589
2590 out:
2591         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2592
2593         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2594 }
2595
/* Allocate a standalone (uncached) host route used for sending ICMPv6
 * packets toward fl6->daddr via @dev.
 *
 * Returns the (possibly xfrm-transformed) dst on success or an ERR_PTR.
 * The idev reference taken here is transferred to rt->rt6i_idev on
 * success; the route is placed on the uncached list so rt6_disable_ip()
 * can release the net_device properly.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;	/* ownership of the idev ref moves here */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2634
/* dst_ops->gc callback: garbage-collect IPv6 dst entries.
 *
 * Skips collection entirely if the minimum GC interval has not elapsed
 * and we are under the entry limit.  The adaptive ip6_rt_gc_expire value
 * grows each time GC runs and decays by the elasticity factor, making
 * collection more aggressive under sustained pressure.
 *
 * Returns nonzero if the table is still over rt_max_size (signals the
 * dst layer to refuse new entries).
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2659
/* Convert netlink-supplied metrics (cfg->fc_mx) into a dst_metrics block
 * attached to @rt.  No-op if the config carries no metrics.
 *
 * Returns 0 on success or a negative errno.  On conversion failure the
 * allocated block is already attached to rt->fib6_metrics; presumably the
 * caller's fib6_info_release() path frees it — TODO confirm.
 */
static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
			       struct fib6_config *cfg)
{
	struct dst_metrics *p;

	if (!cfg->fc_mx)
		return 0;

	p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
	if (unlikely(!p))
		return -ENOMEM;

	refcount_set(&p->refcnt, 1);
	rt->fib6_metrics = p;

	return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
}
2677
/* Resolve a nexthop gateway address in a specific FIB table.
 *
 * Builds a flow toward @gw_addr and performs a policy route lookup in
 * table @tbid, ignoring link state.  Returns a held rt6_info on success,
 * or NULL if the table does not exist or the lookup only produced the
 * null entry (caller then falls back to a full lookup).
 */
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}
2709
/* Validate an RTNH_F_ONLINK nexthop: the gateway must not resolve to a
 * local/anycast/reject route, and must be reachable through @dev.
 *
 * Returns 0 if acceptable, -EINVAL (with extack message) otherwise.
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		if (!grt->dst.error &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}

		ip6_rt_put(grt);
	}

	return err;
}
2736
/* Validate a (non-onlink) gateway nexthop by looking it up in the FIB.
 *
 * If the caller did not pin a device, the device and idev resolved from
 * the gateway route are returned through @_dev/@idev with references
 * held (ownership passes to the caller).  A gateway that itself needs a
 * gateway (RTF_GATEWAY) is rejected.
 *
 * Returns 0 if the nexthop is directly reachable, -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	/* try the route's own table first, if one was specified */
	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* discard results that need another hop or that
			 * disagree with the requested device
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* hand the resolved device/idev back to the caller */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2787
/* Validate the gateway of a new route and resolve its egress device.
 *
 * On success *_dev/*idev hold the egress device (resolved via
 * ip6_route_check_nh() if the caller did not supply one).  Rejects
 * gateways that are local addresses, non-link-local non-unicast targets
 * (except IPv4-mapped, see below), and loopback egress devices.
 *
 * Returns 0 on success or a negative errno with an extack message.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* only restrict the "is local" check to other devices for
	 * link-local gateways, which are scoped per device
	 */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2860
/* Build (but do not insert) a fib6_info from a netlink/ioctl route config.
 *
 * Validates the config, resolves the FIB table, the nexthop device/idev
 * and (for RTF_GATEWAY routes) the gateway, and fills in the new entry.
 * Loopback-bound "true" routes are promoted to reject routes (see comment
 * below).  Returns the new fib6_info, or an ERR_PTR on failure; on error
 * all device/idev references taken here are dropped.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* onlink routes promise the gateway is directly reachable, so a
	 * nexthop device is mandatory and must be up
	 */
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	/* addrconf-originated routes are not counted against route limits */
	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	err = ip6_convert_metrics(net, rt, cfg);
	if (err < 0)
		goto out;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif

	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* may replace dev/idev with the gateway's egress device */
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	rt->fib6_flags = cfg->fc_flags;

install_route:
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	rt->fib6_nh.nh_dev = dev;
	rt->fib6_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	/* the dev reference stays with rt->fib6_nh.nh_dev; only idev
	 * is dropped on success
	 */
	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	fib6_info_release(rt);
	return ERR_PTR(err);
}
3083
3084 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3085                   struct netlink_ext_ack *extack)
3086 {
3087         struct fib6_info *rt;
3088         int err;
3089
3090         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3091         if (IS_ERR(rt))
3092                 return PTR_ERR(rt);
3093
3094         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3095         fib6_info_release(rt);
3096
3097         return err;
3098 }
3099
/* Delete a single fib6_info from its table under the table lock.
 *
 * Consumes the caller's reference on @rt in all cases.  Deleting the
 * null entry is refused with -ENOENT.
 */
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_table *table;
	int err;

	if (rt == net->ipv6.fib6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	fib6_info_release(rt);
	return err;
}
3120
/* Public single-route delete: __ip6_del_rt() with default netlink info.
 * Consumes the caller's reference on @rt.
 */
int ip6_del_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net };

	return __ip6_del_rt(rt, &info);
}
3127
/* Delete @rt and, when fc_delete_all_nh is set, all its multipath
 * siblings in one pass under the table lock.
 *
 * A single RTM_DELROUTE notification covering every hop is built first
 * and sent after the lock is dropped; if that skb cannot be built,
 * fib6_del() falls back to per-route notifications.  Consumes the
 * caller's reference on @rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3179
/* Remove a cached (exception) route if it matches the delete request's
 * device and gateway constraints.
 *
 * Returns -ESRCH if the entry does not match (caller keeps searching),
 * otherwise the result of rt6_remove_exception_rt().  dst_hold_safe()
 * guards against racing with the dst being freed.
 */
static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
	int rc = -ESRCH;

	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
		goto out;

	if (cfg->fc_flags & RTF_GATEWAY &&
	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
		goto out;
	if (dst_hold_safe(&rt->dst))
		rc = rt6_remove_exception_rt(rt);
out:
	return rc;
}
3195
3196 static int ip6_route_del(struct fib6_config *cfg,
3197                          struct netlink_ext_ack *extack)
3198 {
3199         struct rt6_info *rt_cache;
3200         struct fib6_table *table;
3201         struct fib6_info *rt;
3202         struct fib6_node *fn;
3203         int err = -ESRCH;
3204
3205         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3206         if (!table) {
3207                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3208                 return err;
3209         }
3210
3211         rcu_read_lock();
3212
3213         fn = fib6_locate(&table->tb6_root,
3214                          &cfg->fc_dst, cfg->fc_dst_len,
3215                          &cfg->fc_src, cfg->fc_src_len,
3216                          !(cfg->fc_flags & RTF_CACHE));
3217
3218         if (fn) {
3219                 for_each_fib6_node_rt_rcu(fn) {
3220                         if (cfg->fc_flags & RTF_CACHE) {
3221                                 int rc;
3222
3223                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3224                                                               &cfg->fc_src);
3225                                 if (rt_cache) {
3226                                         rc = ip6_del_cached_rt(rt_cache, cfg);
3227                                         if (rc != -ESRCH)
3228                                                 return rc;
3229                                 }
3230                                 continue;
3231                         }
3232                         if (cfg->fc_ifindex &&
3233                             (!rt->fib6_nh.nh_dev ||
3234                              rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3235                                 continue;
3236                         if (cfg->fc_flags & RTF_GATEWAY &&
3237                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3238                                 continue;
3239                         if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3240                                 continue;
3241                         if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3242                                 continue;
3243                         fib6_info_hold(rt);
3244                         rcu_read_unlock();
3245
3246                         /* if gateway was specified only delete the one hop */
3247                         if (cfg->fc_flags & RTF_GATEWAY)
3248                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3249
3250                         return __ip6_del_rt_siblings(rt, cfg);
3251                 }
3252         }
3253         rcu_read_unlock();
3254
3255         return err;
3256 }
3257
3258 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3259 {
3260         struct netevent_redirect netevent;
3261         struct rt6_info *rt, *nrt = NULL;
3262         struct ndisc_options ndopts;
3263         struct inet6_dev *in6_dev;
3264         struct neighbour *neigh;
3265         struct fib6_info *from;
3266         struct rd_msg *msg;
3267         int optlen, on_link;
3268         u8 *lladdr;
3269
3270         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3271         optlen -= sizeof(*msg);
3272
3273         if (optlen < 0) {
3274                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3275                 return;
3276         }
3277
3278         msg = (struct rd_msg *)icmp6_hdr(skb);
3279
3280         if (ipv6_addr_is_multicast(&msg->dest)) {
3281                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3282                 return;
3283         }
3284
3285         on_link = 0;
3286         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3287                 on_link = 1;
3288         } else if (ipv6_addr_type(&msg->target) !=
3289                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3290                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3291                 return;
3292         }
3293
3294         in6_dev = __in6_dev_get(skb->dev);
3295         if (!in6_dev)
3296                 return;
3297         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3298                 return;
3299
3300         /* RFC2461 8.1:
3301          *      The IP source address of the Redirect MUST be the same as the current
3302          *      first-hop router for the specified ICMP Destination Address.
3303          */
3304
3305         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3306                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3307                 return;
3308         }
3309
3310         lladdr = NULL;
3311         if (ndopts.nd_opts_tgt_lladdr) {
3312                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3313                                              skb->dev);
3314                 if (!lladdr) {
3315                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3316                         return;
3317                 }
3318         }
3319
3320         rt = (struct rt6_info *) dst;
3321         if (rt->rt6i_flags & RTF_REJECT) {
3322                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3323                 return;
3324         }
3325
3326         /* Redirect received -> path was valid.
3327          * Look, redirects are sent only in response to data packets,
3328          * so that this nexthop apparently is reachable. --ANK
3329          */
3330         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3331
3332         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3333         if (!neigh)
3334                 return;
3335
3336         /*
3337          *      We have finally decided to accept it.
3338          */
3339
3340         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3341                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3342                      NEIGH_UPDATE_F_OVERRIDE|
3343                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3344                                      NEIGH_UPDATE_F_ISROUTER)),
3345                      NDISC_REDIRECT, &ndopts);
3346
3347         rcu_read_lock();
3348         from = rcu_dereference(rt->from);
3349         fib6_info_hold(from);
3350         rcu_read_unlock();
3351
3352         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3353         if (!nrt)
3354                 goto out;
3355
3356         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3357         if (on_link)
3358                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3359
3360         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3361
3362         /* No need to remove rt from the exception table if rt is
3363          * a cached route because rt6_insert_exception() will
3364          * takes care of it
3365          */
3366         if (rt6_insert_exception(nrt, from)) {
3367                 dst_release_immediate(&nrt->dst);
3368                 goto out;
3369         }
3370
3371         netevent.old = &rt->dst;
3372         netevent.new = &nrt->dst;
3373         netevent.daddr = &msg->dest;
3374         netevent.neigh = neigh;
3375         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3376
3377 out:
3378         fib6_info_release(from);
3379         neigh_release(neigh);
3380 }
3381
3382 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up a Route Information (RFC 4191) route for @prefix/@prefixlen
 * learned via @gwaddr on @dev.
 *
 * Returns the matching fib6_info with a reference held, or NULL when the
 * table does not exist, the prefix node is absent, or no entry matches.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	/* Enslaved devices use their l3mdev table, otherwise RT6_TABLE_INFO */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* Exact-match lookup of the prefix node (last arg requests exact) */
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* for_each_fib6_node_rt_rcu() iterates @rt over the node's routes */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
			continue;
		/* Only routes installed from Route Information options */
		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
			continue;
		/* NOTE(review): reference taken under rcu_read_lock() with a
		 * plain fib6_info_hold(); assumes the refcount cannot have
		 * dropped to zero while on the RCU-protected list -- confirm.
		 */
		fib6_info_hold(rt);
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3417
3418 static struct fib6_info *rt6_add_route_info(struct net *net,
3419                                            const struct in6_addr *prefix, int prefixlen,
3420                                            const struct in6_addr *gwaddr,
3421                                            struct net_device *dev,
3422                                            unsigned int pref)
3423 {
3424         struct fib6_config cfg = {
3425                 .fc_metric      = IP6_RT_PRIO_USER,
3426                 .fc_ifindex     = dev->ifindex,
3427                 .fc_dst_len     = prefixlen,
3428                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3429                                   RTF_UP | RTF_PREF(pref),
3430                 .fc_protocol = RTPROT_RA,
3431                 .fc_type = RTN_UNICAST,
3432                 .fc_nlinfo.portid = 0,
3433                 .fc_nlinfo.nlh = NULL,
3434                 .fc_nlinfo.nl_net = net,
3435         };
3436
3437         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3438         cfg.fc_dst = *prefix;
3439         cfg.fc_gateway = *gwaddr;
3440
3441         /* We should treat it as a default route if prefix length is 0. */
3442         if (!prefixlen)
3443                 cfg.fc_flags |= RTF_DEFAULT;
3444
3445         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3446
3447         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3448 }
3449 #endif
3450
/* Look up the RA-learned default router entry for gateway @addr on @dev.
 *
 * Returns the matching fib6_info with a reference held, or NULL when the
 * table does not exist or no entry matches.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	/* Enslaved devices use their l3mdev table, otherwise RT6_TABLE_DFLT */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* Scan the table's root node; iterator leaves @rt NULL when the
	 * list is exhausted without a match.
	 */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		/* ADDRCONF|DEFAULT together mark RA-installed default routes */
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	/* NOTE(review): plain fib6_info_hold() under rcu_read_lock();
	 * assumes the refcount cannot already be zero here -- confirm.
	 */
	if (rt)
		fib6_info_hold(rt);
	rcu_read_unlock();
	return rt;
}
3475
3476 struct fib6_info *rt6_add_dflt_router(struct net *net,
3477                                      const struct in6_addr *gwaddr,
3478                                      struct net_device *dev,
3479                                      unsigned int pref)
3480 {
3481         struct fib6_config cfg = {
3482                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3483                 .fc_metric      = IP6_RT_PRIO_USER,
3484                 .fc_ifindex     = dev->ifindex,
3485                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3486                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3487                 .fc_protocol = RTPROT_RA,
3488                 .fc_type = RTN_UNICAST,
3489                 .fc_nlinfo.portid = 0,
3490                 .fc_nlinfo.nlh = NULL,
3491                 .fc_nlinfo.nl_net = net,
3492         };
3493
3494         cfg.fc_gateway = *gwaddr;
3495
3496         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3497                 struct fib6_table *table;
3498
3499                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3500                 if (table)
3501                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3502         }
3503
3504         return rt6_get_dflt_router(net, gwaddr, dev);
3505 }
3506
/* Delete every RA-added default router from @table, except on interfaces
 * whose accept_ra mode is 2 (accept RA even with forwarding enabled).
 *
 * ip6_del_rt() cannot run under rcu_read_lock(), so each deletion drops
 * the lock (after pinning the route with fib6_info_hold()) and restarts
 * the walk from the top, since the list may have changed meanwhile.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2)) {
			/* Pin the entry so it survives until ip6_del_rt() */
			fib6_info_hold(rt);
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3530
3531 void rt6_purge_dflt_routers(struct net *net)
3532 {
3533         struct fib6_table *table;
3534         struct hlist_head *head;
3535         unsigned int h;
3536
3537         rcu_read_lock();
3538
3539         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3540                 head = &net->ipv6.fib_table_hash[h];
3541                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3542                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3543                                 __rt6_purge_dflt_routers(net, table);
3544                 }
3545         }
3546
3547         rcu_read_unlock();
3548 }
3549
3550 static void rtmsg_to_fib6_config(struct net *net,
3551                                  struct in6_rtmsg *rtmsg,
3552                                  struct fib6_config *cfg)
3553 {
3554         memset(cfg, 0, sizeof(*cfg));
3555
3556         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3557                          : RT6_TABLE_MAIN;
3558         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3559         cfg->fc_metric = rtmsg->rtmsg_metric;
3560         cfg->fc_expires = rtmsg->rtmsg_info;
3561         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3562         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3563         cfg->fc_flags = rtmsg->rtmsg_flags;
3564         cfg->fc_type = rtmsg->rtmsg_type;
3565
3566         cfg->fc_nlinfo.nl_net = net;
3567
3568         cfg->fc_dst = rtmsg->rtmsg_dst;
3569         cfg->fc_src = rtmsg->rtmsg_src;
3570         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3571 }
3572
3573 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3574 {
3575         struct fib6_config cfg;
3576         struct in6_rtmsg rtmsg;
3577         int err;
3578
3579         switch (cmd) {
3580         case SIOCADDRT:         /* Add a route */
3581         case SIOCDELRT:         /* Delete a route */
3582                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3583                         return -EPERM;
3584                 err = copy_from_user(&rtmsg, arg,
3585                                      sizeof(struct in6_rtmsg));
3586                 if (err)
3587                         return -EFAULT;
3588
3589                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3590
3591                 rtnl_lock();
3592                 switch (cmd) {
3593                 case SIOCADDRT:
3594                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3595                         break;
3596                 case SIOCDELRT:
3597                         err = ip6_route_del(&cfg, NULL);
3598                         break;
3599                 default:
3600                         err = -EINVAL;
3601                 }
3602                 rtnl_unlock();
3603
3604                 return err;
3605         }
3606
3607         return -EINVAL;
3608 }
3609
3610 /*
3611  *      Drop the packet on the floor
3612  */
3613
/* Common drop path for discard/prohibit dst entries: account the failure
 * in the proper SNMP counter, send an ICMPv6 Destination Unreachable with
 * @code, and free the packet. Always returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* Unspecified destination: count as address error,
			 * not as a no-route event.
			 */
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3637
/* dst input handler for blackhole-style routes: drop with "no route" */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3642
3643 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3644 {
3645         skb->dev = skb_dst(skb)->dev;
3646         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3647 }
3648
/* dst input handler for prohibit routes: drop, administratively prohibited */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3653
3654 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3655 {
3656         skb->dev = skb_dst(skb)->dev;
3657         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3658 }
3659
3660 /*
3661  *      Allocate a dst for local (unicast / anycast) address.
3662  */
3663
/* Allocate a host (/128) fib6_info for a local unicast or anycast address
 * on @idev.
 *
 * Returns the new entry, or ERR_PTR(-ENOMEM). Takes a reference on the
 * device via dev_hold(); the entry is not linked into a table here, only
 * f6i->fib6_table is resolved for the caller.
 */
struct fib6_info *addrconf_f6i_alloc(struct net *net,
				     struct inet6_dev *idev,
				     const struct in6_addr *addr,
				     bool anycast, gfp_t gfp_flags)
{
	u32 tb_id;
	struct net_device *dev = idev->dev;
	struct fib6_info *f6i;

	f6i = fib6_info_alloc(gfp_flags);
	if (!f6i)
		return ERR_PTR(-ENOMEM);

	/* Local routes are not counted against the routing cache/gc */
	f6i->dst_nocount = true;
	f6i->dst_host = true;
	f6i->fib6_protocol = RTPROT_KERNEL;
	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast) {
		f6i->fib6_type = RTN_ANYCAST;
		f6i->fib6_flags |= RTF_ANYCAST;
	} else {
		f6i->fib6_type = RTN_LOCAL;
		f6i->fib6_flags |= RTF_LOCAL;
	}

	/* NOTE(review): nh_gw is set to the address itself even though the
	 * route is RTF_NONEXTHOP -- presumably only used as a key; confirm.
	 */
	f6i->fib6_nh.nh_gw = *addr;
	dev_hold(dev);
	f6i->fib6_nh.nh_dev = dev;
	f6i->fib6_dst.addr = *addr;
	f6i->fib6_dst.plen = 128;
	/* Enslaved devices use their l3mdev table, otherwise RT6_TABLE_LOCAL */
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	f6i->fib6_table = fib6_get_table(net, tb_id);

	return f6i;
}
3699
3700 /* remove deleted ip from prefsrc entries */
/* Argument bundle for the fib6_remove_prefsrc() tree walker */
struct arg_dev_net_ip {
	struct net_device *dev;	/* device being cleaned, or NULL for all */
	struct net *net;	/* owning namespace */
	struct in6_addr *addr;	/* the deleted preferred source address */
};
3706
3707 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3708 {
3709         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3710         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3711         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3712
3713         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3714             rt != net->ipv6.fib6_null_entry &&
3715             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3716                 spin_lock_bh(&rt6_exception_lock);
3717                 /* remove prefsrc entry */
3718                 rt->fib6_prefsrc.plen = 0;
3719                 /* need to update cache as well */
3720                 rt6_exceptions_remove_prefsrc(rt);
3721                 spin_unlock_bh(&rt6_exception_lock);
3722         }
3723         return 0;
3724 }
3725
3726 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3727 {
3728         struct net *net = dev_net(ifp->idev->dev);
3729         struct arg_dev_net_ip adni = {
3730                 .dev = ifp->idev->dev,
3731                 .net = net,
3732                 .addr = &ifp->addr,
3733         };
3734         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3735 }
3736
/* Flag combination identifying a default route learned from an RA router */
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	/* Returning -1 asks the fib6_clean_all() walker to delete this
	 * route: the former router @gateway is now a plain host.
	 */
	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}
3757
/* Drop RA-learned routes via @gateway across all tables (it is now a host) */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3762
/* Argument bundle for the per-netdev-event tree walkers; the anonymous
 * union carries either the nexthop flags to set/clear (rt6_sync_up) or
 * the netdev notifier event (rt6_sync_down_dev).
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};
3770
/* Find the first ECMP-eligible route of @rt's fib6 node that shares its
 * metric -- i.e. the head of @rt's multipath group in lookup order.
 *
 * Caller must hold the table's tb6_lock (asserted via lockdep on every
 * dereference). Returns NULL if no qualifying sibling exists.
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
3790
3791 static bool rt6_is_dead(const struct fib6_info *rt)
3792 {
3793         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3794             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3795              fib6_ignore_linkdown(rt)))
3796                 return true;
3797
3798         return false;
3799 }
3800
3801 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3802 {
3803         struct fib6_info *iter;
3804         int total = 0;
3805
3806         if (!rt6_is_dead(rt))
3807                 total += rt->fib6_nh.nh_weight;
3808
3809         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3810                 if (!rt6_is_dead(iter))
3811                         total += iter->fib6_nh.nh_weight;
3812         }
3813
3814         return total;
3815 }
3816
/* Assign @rt's hash-threshold upper bound from the running cumulative
 * weight: bound = round(cumulative/total * 2^31) - 1, so live nexthops
 * partition the 31-bit hash space proportionally to their weights.
 * Dead nexthops get -1 and are never selected. @weight accumulates
 * across successive calls for the group.
 */
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	/* atomic_set: readers sample nh_upper_bound locklessly */
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}
3828
3829 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3830 {
3831         struct fib6_info *iter;
3832         int weight = 0;
3833
3834         rt6_upper_bound_set(rt, &weight, total);
3835
3836         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3837                 rt6_upper_bound_set(iter, &weight, total);
3838 }
3839
/* Redistribute the multipath hash space of @rt's group after a nexthop
 * changed liveness. No-op for non-multipath routes or groups marked for
 * flushing.
 */
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
3863
3864 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3865 {
3866         const struct arg_netdev_event *arg = p_arg;
3867         struct net *net = dev_net(arg->dev);
3868
3869         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3870                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3871                 fib6_update_sernum_upto_root(net, rt);
3872                 rt6_multipath_rebalance(rt);
3873         }
3874
3875         return 0;
3876 }
3877
/* Clear @nh_flags from every route's nexthop on @dev (fib6_ifup() does
 * nh_flags &= ~arg). When reviving a DEAD nexthop while the carrier is
 * up, also clear LINKDOWN in the same pass. (& binds tighter than &&,
 * so the condition reads (nh_flags & RTNH_F_DEAD) && netif_carrier_ok().)
 */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
3892
3893 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3894                                    const struct net_device *dev)
3895 {
3896         struct fib6_info *iter;
3897
3898         if (rt->fib6_nh.nh_dev == dev)
3899                 return true;
3900         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3901                 if (iter->fib6_nh.nh_dev == dev)
3902                         return true;
3903
3904         return false;
3905 }
3906
3907 static void rt6_multipath_flush(struct fib6_info *rt)
3908 {
3909         struct fib6_info *iter;
3910
3911         rt->should_flush = 1;
3912         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3913                 iter->should_flush = 1;
3914 }
3915
3916 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3917                                              const struct net_device *down_dev)
3918 {
3919         struct fib6_info *iter;
3920         unsigned int dead = 0;
3921
3922         if (rt->fib6_nh.nh_dev == down_dev ||
3923             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3924                 dead++;
3925         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3926                 if (iter->fib6_nh.nh_dev == down_dev ||
3927                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3928                         dead++;
3929
3930         return dead;
3931 }
3932
3933 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3934                                        const struct net_device *dev,
3935                                        unsigned int nh_flags)
3936 {
3937         struct fib6_info *iter;
3938
3939         if (rt->fib6_nh.nh_dev == dev)
3940                 rt->fib6_nh.nh_flags |= nh_flags;
3941         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3942                 if (iter->fib6_nh.nh_dev == dev)
3943                         iter->fib6_nh.nh_flags |= nh_flags;
3944 }
3945
/* called with write lock held for table with rt */
/* fib6_clean_all() callback for device-down/unregister events.
 *
 * Return value convention: 0 keeps the route, -1 asks the walker to
 * delete it; -2 also requests deletion (NOTE(review): with different
 * notification handling in fib6_clean_node for multipath members --
 * confirm against fib6_clean_node()).
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	/* Never touch the null entry */
	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* Device is going away entirely: delete its routes */
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		/* Non-multipath route: delete iff it uses this device */
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			/* If every nexthop of the group is now dead, flush
			 * the whole group; otherwise mark only the dead
			 * nexthops and rebalance.
			 */
			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* Carrier change: mark link-down, except for local and
		 * anycast routes which have no real nexthop device state.
		 */
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
3989
3990 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3991 {
3992         struct arg_netdev_event arg = {
3993                 .dev = dev,
3994                 {
3995                         .event = event,
3996                 },
3997         };
3998
3999         fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4000 }
4001
/* Tear down IPv6 routing state for @dev: sync routes for @event, flush
 * its uncached dst entries, and drop its neighbour cache entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4008
/* Argument bundle for the rt6_mtu_change_route() tree walker */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
4013
/* fib6_clean_all() callback: propagate a device MTU change into route
 * metrics and cached exception routes. Always returns 0 (keep route).
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* Update when shrinking below the stored PMTU, or when the
		 * stored PMTU simply tracked the old device MTU.
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		/* Cached exception routes carry their own PMTU */
		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4048
4049 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4050 {
4051         struct rt6_mtu_change_arg arg = {
4052                 .dev = dev,
4053                 .mtu = mtu,
4054         };
4055
4056         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4057 }
4058
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
};
4075
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a fib6_config.
 *
 * Returns 0 on success or a negative errno (attribute parse failures,
 * truncated address payloads, invalid encap types).
 *
 * Note: cfg->fc_mx / fc_mp / fc_encap keep pointers into the netlink
 * message, so @nlh must outlive @cfg.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	/* Defaults from the fixed rtmsg header; attributes below override */
	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* All the non-forwarding route types map to RTF_REJECT */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* Address payload only needs to cover the prefix length */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		/* Points into the nlmsg; parsed later by metrics code */
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* RFC 4191 router preference; unknown values -> MEDIUM */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* Only finite timeouts turn into RTF_EXPIRES */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4201
/* One pending nexthop while building a multipath route from RTA_MULTIPATH */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route built for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop copy of the config */
	struct list_head next;		/* link in the rt6_nh_list */
};
4207
4208 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4209 {
4210         struct rt6_nh *nh;
4211
4212         list_for_each_entry(nh, rt6_nh_list, next) {
4213                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4214                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4215                         nh->r_cfg.fc_ifindex);
4216         }
4217 }
4218
4219 static int ip6_route_info_append(struct net *net,
4220                                  struct list_head *rt6_nh_list,
4221                                  struct fib6_info *rt,
4222                                  struct fib6_config *r_cfg)
4223 {
4224         struct rt6_nh *nh;
4225         int err = -EEXIST;
4226
4227         list_for_each_entry(nh, rt6_nh_list, next) {
4228                 /* check if fib6_info already exists */
4229                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4230                         return err;
4231         }
4232
4233         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4234         if (!nh)
4235                 return -ENOMEM;
4236         nh->fib6_info = rt;
4237         err = ip6_convert_metrics(net, rt, r_cfg);
4238         if (err) {
4239                 kfree(nh);
4240                 return err;
4241         }
4242         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4243         list_add_tail(&nh->next, rt6_nh_list);
4244
4245         return 0;
4246 }
4247
4248 static void ip6_route_mpath_notify(struct fib6_info *rt,
4249                                    struct fib6_info *rt_last,
4250                                    struct nl_info *info,
4251                                    __u16 nlflags)
4252 {
4253         /* if this is an APPEND route, then rt points to the first route
4254          * inserted and rt_last points to last route inserted. Userspace
4255          * wants a consistent dump of the route which starts at the first
4256          * nexthop. Since sibling routes are always added at the end of
4257          * the list, find the first sibling of the last route appended
4258          */
4259         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4260                 rt = list_first_entry(&rt_last->fib6_siblings,
4261                                       struct fib6_info,
4262                                       fib6_siblings);
4263         }
4264
4265         if (rt)
4266                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4267 }
4268
/* RTM_NEWROUTE handler for requests carrying RTA_MULTIPATH: build one
 * fib6_info per nexthop, insert them all (they become siblings), then
 * send a single notification covering the whole route.
 *
 * If an insertion fails midway, the nexthops already installed are
 * deleted again so the FIB is left coherent; for NLM_F_REPLACE this is
 * best effort (see ip6_print_replace_route_err()).
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	/* flags echoed back to userspace in the final notification */
	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* start from the route-level config, then apply
		 * per-nexthop overrides (ifindex, gateway, encap)
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		/* on-wire rtnh_hops is weight - 1 */
		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->fib6_info;
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		/* drop our list reference; on success the FIB holds its
		 * own reference so rt_last/rt_notif stay valid
		 */
		fib6_info_release(nh->fib6_info);

		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->fib6_info;

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* free list entries; fib6_info refs remain only for nexthops
	 * that were never handed to __ip6_ins_rt()
	 */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4404
4405 static int ip6_route_multipath_del(struct fib6_config *cfg,
4406                                    struct netlink_ext_ack *extack)
4407 {
4408         struct fib6_config r_cfg;
4409         struct rtnexthop *rtnh;
4410         int remaining;
4411         int attrlen;
4412         int err = 1, last_err = 0;
4413
4414         remaining = cfg->fc_mp_len;
4415         rtnh = (struct rtnexthop *)cfg->fc_mp;
4416
4417         /* Parse a Multipath Entry */
4418         while (rtnh_ok(rtnh, remaining)) {
4419                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4420                 if (rtnh->rtnh_ifindex)
4421                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4422
4423                 attrlen = rtnh_attrlen(rtnh);
4424                 if (attrlen > 0) {
4425                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4426
4427                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4428                         if (nla) {
4429                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4430                                 r_cfg.fc_flags |= RTF_GATEWAY;
4431                         }
4432                 }
4433                 err = ip6_route_del(&r_cfg, extack);
4434                 if (err)
4435                         last_err = err;
4436
4437                 rtnh = rtnh_next(rtnh, &remaining);
4438         }
4439
4440         return last_err;
4441 }
4442
4443 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4444                               struct netlink_ext_ack *extack)
4445 {
4446         struct fib6_config cfg;
4447         int err;
4448
4449         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4450         if (err < 0)
4451                 return err;
4452
4453         if (cfg.fc_mp)
4454                 return ip6_route_multipath_del(&cfg, extack);
4455         else {
4456                 cfg.fc_delete_all_nh = 1;
4457                 return ip6_route_del(&cfg, extack);
4458         }
4459 }
4460
4461 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4462                               struct netlink_ext_ack *extack)
4463 {
4464         struct fib6_config cfg;
4465         int err;
4466
4467         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4468         if (err < 0)
4469                 return err;
4470
4471         if (cfg.fc_mp)
4472                 return ip6_route_multipath_add(&cfg, extack);
4473         else
4474                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4475 }
4476
4477 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4478 {
4479         int nexthop_len = 0;
4480
4481         if (rt->fib6_nsiblings) {
4482                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4483                             + NLA_ALIGN(sizeof(struct rtnexthop))
4484                             + nla_total_size(16) /* RTA_GATEWAY */
4485                             + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4486
4487                 nexthop_len *= rt->fib6_nsiblings;
4488         }
4489
4490         return NLMSG_ALIGN(sizeof(struct rtmsg))
4491                + nla_total_size(16) /* RTA_SRC */
4492                + nla_total_size(16) /* RTA_DST */
4493                + nla_total_size(16) /* RTA_GATEWAY */
4494                + nla_total_size(16) /* RTA_PREFSRC */
4495                + nla_total_size(4) /* RTA_TABLE */
4496                + nla_total_size(4) /* RTA_IIF */
4497                + nla_total_size(4) /* RTA_OIF */
4498                + nla_total_size(4) /* RTA_PRIORITY */
4499                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4500                + nla_total_size(sizeof(struct rta_cacheinfo))
4501                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4502                + nla_total_size(1) /* RTA_PREF */
4503                + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4504                + nexthop_len;
4505 }
4506
/* Fill the nexthop attributes of @rt (gateway, oif, lwtunnel encap)
 * into @skb and accumulate RTNH_F_* status bits in *@flags.
 * @skip_oif suppresses RTA_OIF for multipath encoding, where the
 * rtnexthop struct already carries the ifindex.
 * Returns 0 on success, -EMSGSIZE if the skb ran out of room.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		/* also report DEAD when the device is configured to
		 * ignore routes on linkdown
		 */
		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4545
/* add multipath next hop: emit one rtnexthop header for @rt inside an
 * RTA_MULTIPATH nest, followed by its per-nexthop attributes.
 * Returns 0 on success, -EMSGSIZE if the skb ran out of room.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	/* on-wire hops is weight - 1 (inverse of the parse side) */
	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	/* skip_oif: the ifindex above replaces RTA_OIF */
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4573
/* Build one RTM_NEWROUTE/RTM_DELROUTE message for @rt into @skb.
 *
 * @dst/@dest/@src: when non-NULL, the message describes a specific
 * lookup result (getroute) rather than the bare FIB entry: @dest/@src
 * are reported as full /128 addresses and cache info comes from @dst.
 * @iif: input interface for input-route replies (0 for none).
 * Returns 0 on success, -EMSGSIZE if the message did not fit (the
 * partial message is cancelled).
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires = 0;
	u32 *pmetrics;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->fib6_dst.plen;
	rtm->rtm_src_len = rt->fib6_src.plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt->fib6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* specific destination (getroute reply) overrides the prefix */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are resolved via the mroute
		 * tables; ip6mr_get_route() fills the message itself
		 */
		if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		/* output route: report the source address selection */
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* metrics come from the dst for lookup results, else the FIB entry */
	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	/* remaining lifetime, relative to now */
	if (rt->fib6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4706
4707 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4708 {
4709         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4710         struct net *net = arg->net;
4711
4712         if (rt == net->ipv6.fib6_null_entry)
4713                 return 0;
4714
4715         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4716                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4717
4718                 /* user wants prefix routes only */
4719                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4720                     !(rt->fib6_flags & RTF_PREFIX_RT)) {
4721                         /* success since this is not a prefix route */
4722                         return 1;
4723                 }
4724         }
4725
4726         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4727                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4728                              arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4729 }
4730
/* RTM_GETROUTE handler: resolve the flow described by the request
 * (saddr/daddr, iif or oif, fwmark, uid) and answer with a single
 * RTM_NEWROUTE message.  With RTM_F_FIB_MATCH set, report the FIB
 * entry that matched instead of the full lookup result.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		/* input path: look the route up as if the packet had
		 * arrived on device iif
		 */
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		/* output path lookup */
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	/* lookup fell through to the null entry: no route */
	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* skb now owns the dst reference */
	skb_dst_set(skb, &rt->dst);

	/* rt->from is RCU-protected; hold the read lock while filling */
	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
4858
/* Broadcast an RTM_NEWROUTE/RTM_DELROUTE (@event) notification for @rt
 * to the RTNLGRP_IPV6_ROUTE multicast group.  On failure, listeners are
 * informed via rtnl_set_sk_err() so they can resync.
 */
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	/* gfp_any(): this can be called from softirq context */
	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
4889
/* netdev notifier: bind the per-netns special routes (null entry, and
 * with CONFIG_IPV6_MULTIPLE_TABLES also prohibit/blackhole) to the
 * loopback device when it registers, and drop their idev references
 * when it unregisters.  Other devices are ignored.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	/* only the namespace's loopback device matters here */
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
4923
4924 /*
4925  *      /proc
4926  */
4927
4928 #ifdef CONFIG_PROC_FS
4929
4930 static const struct file_operations ipv6_route_proc_fops = {
4931         .open           = ipv6_route_open,
4932         .read           = seq_read,
4933         .llseek         = seq_lseek,
4934         .release        = seq_release_net,
4935 };
4936
/* /proc/net/rt6_stats: seven hex fields - fib nodes, route nodes,
 * allocated routes, route entries, cached routes, dst cache entries,
 * discarded routes.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
4951
/* open handler for /proc/net/rt6_stats (per-netns single_open) */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
4956
/* /proc/net/rt6_stats file operations */
static const struct file_operations rt6_stats_seq_fops = {
	.open    = rt6_stats_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release_net,
};
4963 #endif  /* CONFIG_PROC_FS */
4964
4965 #ifdef CONFIG_SYSCTL
4966
4967 static
4968 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4969                               void __user *buffer, size_t *lenp, loff_t *ppos)
4970 {
4971         struct net *net;
4972         int delay;
4973         if (!write)
4974                 return -EINVAL;
4975
4976         net = (struct net *)ctl->extra1;
4977         delay = net->ipv6.sysctl.flush_delay;
4978         proc_dointvec(ctl, write, buffer, lenp, ppos);
4979         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4980         return 0;
4981 }
4982
/* Template for the per-netns net.ipv6.route.* sysctl table.  The .data
 * pointers reference init_net here and are rewritten for each namespace
 * by ipv6_route_sysctl_init(); keep the entry order in sync with the
 * table[N].data fixups there.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,	/* write-only trigger */
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* same variable as gc_min_interval, in milliseconds */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
5056
5057 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5058 {
5059         struct ctl_table *table;
5060
5061         table = kmemdup(ipv6_route_table_template,
5062                         sizeof(ipv6_route_table_template),
5063                         GFP_KERNEL);
5064
5065         if (table) {
5066                 table[0].data = &net->ipv6.sysctl.flush_delay;
5067                 table[0].extra1 = net;
5068                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5069                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5070                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5071                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5072                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5073                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5074                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5075                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5076                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5077
5078                 /* Don't export sysctls to unprivileged users */
5079                 if (net->user_ns != &init_user_ns)
5080                         table[0].procname = NULL;
5081         }
5082
5083         return table;
5084 }
5085 #endif
5086
/* Per-netns init for the IPv6 routing core: copy the dst ops template,
 * set up dst-entry accounting, allocate the always-present dummy route
 * entries (fib6/ip6 null, plus prohibit/blackhole when policy routing
 * is configured) and seed the GC/sysctl defaults.
 *
 * Returns 0 on success or -ENOMEM, unwinding any partial allocations
 * via the goto ladder at the bottom (freed in reverse allocation order).
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	/* Each namespace works on a private copy of the dst ops. */
	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* Sentinel fib6 entry returned when a FIB lookup matches nothing. */
	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	/* rt6_info counterpart of the sentinel; drops packets with no route. */
	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	/* Template carries no ops/metrics; wire them to this netns. */
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	/* Dummy entry backing RTN_PROHIBIT routes (admin-prohibited). */
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	/* Dummy entry backing RTN_BLACKHOLE routes (silent discard). */
	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Defaults for the route GC sysctls (tunable via /proc/sys). */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	/* IPv6 min MTU minus TCP (20) and IPv6 (40) header overhead. */
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwind: free in reverse order of allocation. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5161
5162 static void __net_exit ip6_route_net_exit(struct net *net)
5163 {
5164         kfree(net->ipv6.fib6_null_entry);
5165         kfree(net->ipv6.ip6_null_entry);
5166 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5167         kfree(net->ipv6.ip6_prohibit_entry);
5168         kfree(net->ipv6.ip6_blk_hole_entry);
5169 #endif
5170         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5171 }
5172
5173 static int __net_init ip6_route_net_init_late(struct net *net)
5174 {
5175 #ifdef CONFIG_PROC_FS
5176         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5177         proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
5178 #endif
5179         return 0;
5180 }
5181
5182 static void __net_exit ip6_route_net_exit_late(struct net *net)
5183 {
5184 #ifdef CONFIG_PROC_FS
5185         remove_proc_entry("ipv6_route", net->proc_net);
5186         remove_proc_entry("rt6_stats", net->proc_net);
5187 #endif
5188 }
5189
/* Core per-network-namespace setup/teardown for IPv6 routing. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5194
5195 static int __net_init ipv6_inetpeer_init(struct net *net)
5196 {
5197         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5198
5199         if (!bp)
5200                 return -ENOMEM;
5201         inet_peer_base_init(bp);
5202         net->ipv6.peers = bp;
5203         return 0;
5204 }
5205
5206 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5207 {
5208         struct inet_peer_base *bp = net->ipv6.peers;
5209
5210         net->ipv6.peers = NULL;
5211         inetpeer_invalidate_tree(bp);
5212         kfree(bp);
5213 }
5214
/* Per-namespace lifecycle of the IPv6 inet_peer storage. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5219
/* Late-stage per-namespace hooks (procfs entries); registered after the
 * core routing state so the entries never observe a half-built netns. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5224
/* Netdevice event notifier; priority is set below addrconf's so that
 * addrconf processes device events before the routing code does. */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5229
/* Boot-time fixup for init_net only: bind the dummy route entries to the
 * loopback device.  For namespaces created later this wiring happens via
 * the device notifier; init_net's loopback was registered before that
 * notifier existed, so it is done by hand here.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
5245
/* Module init for the IPv6 routing subsystem.  Registers, in order: the
 * rt6_info slab cache, blackhole dst accounting, the inetpeer and core
 * routing pernet ops, fib6, xfrm6, fib6 rules, the late pernet ops, the
 * RTM_{NEW,DEL,GET}ROUTE rtnetlink handlers, the netdevice notifier, and
 * finally the per-CPU uncached-route lists.  On any failure the goto
 * ladder unwinds everything registered so far, in reverse order.
 * Returns 0 on success or a negative errno.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts come from the same slab as regular rt6_infos. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	/* GETROUTE doit can run without the RTNL lock held. */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Per-CPU lists of uncached (DST_NOCACHE) routes. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwind: reverse order of the registrations above.
	 * rtnl_unregister_all() covers however many of the three rtnetlink
	 * handlers were registered before the failure.
	 */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5337
/* Module teardown: undo everything ip6_route_init() registered, in
 * reverse order.  The ordering matters — e.g. the netdevice notifier
 * must go before the pernet ops, and the slab cache is destroyed last
 * once no dst entries can reference it.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}