net/ipv6: fix lock imbalance in ip6_route_del()
[linux-2.6-microblaze.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Outcome classes for the neighbour reachability check used by route
 * scoring (rt6_check_neigh()/rt6_score_route()).  Negative values are
 * failures; find_match() treats RT6_NUD_FAIL_DO_RR as a request to
 * round-robin to the next candidate route.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* do not use this route */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour NUD_FAILED; probe it */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour entry; round-robin */
	RT6_NUD_SUCCEED = 1		/* neighbour (probably) reachable */
};
80
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void             ip6_dst_destroy(struct dst_entry *);
86 static void             ip6_dst_ifdown(struct dst_entry *,
87                                        struct net_device *dev, int how);
88 static int               ip6_dst_gc(struct dst_ops *ops);
89
90 static int              ip6_pkt_discard(struct sk_buff *skb);
91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96                                            struct sk_buff *skb, u32 mtu);
97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98                                         struct sk_buff *skb);
99 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
100 static size_t rt6_nlmsg_size(struct fib6_info *rt);
101 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
102                          struct fib6_info *rt, struct dst_entry *dst,
103                          struct in6_addr *dest, struct in6_addr *src,
104                          int iif, int type, u32 portid, u32 seq,
105                          unsigned int flags);
106 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
107                                            struct in6_addr *daddr,
108                                            struct in6_addr *saddr);
109
110 #ifdef CONFIG_IPV6_ROUTE_INFO
111 static struct fib6_info *rt6_add_route_info(struct net *net,
112                                            const struct in6_addr *prefix, int prefixlen,
113                                            const struct in6_addr *gwaddr,
114                                            struct net_device *dev,
115                                            unsigned int pref);
116 static struct fib6_info *rt6_get_route_info(struct net *net,
117                                            const struct in6_addr *prefix, int prefixlen,
118                                            const struct in6_addr *gwaddr,
119                                            struct net_device *dev);
120 #endif
121
/* Per-CPU list of "uncached" rt6_info entries (dsts not attached to the
 * FIB tree), each protected by its own spinlock.
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
128
/* Link @rt onto this CPU's uncached list, remembering which list it was
 * put on so rt6_uncached_list_del() can take the matching lock later.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
139
/* Unlink @rt from the uncached list it was added to (if any) and drop
 * the per-netns uncached-route counter.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
152
/* When @dev goes away, repoint every uncached route that still
 * references it (device pointer and/or inet6_dev) at the loopback
 * device, so @dev's reference counts can reach zero.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* nothing to redirect to if loopback itself is going away */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* swap the idev reference for a loopback one */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* hold loopback before releasing the old device */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
184
185 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
186                                              struct sk_buff *skb,
187                                              const void *daddr)
188 {
189         if (!ipv6_addr_any(p))
190                 return (const void *) p;
191         else if (skb)
192                 return &ipv6_hdr(skb)->daddr;
193         return daddr;
194 }
195
196 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
197                                    struct net_device *dev,
198                                    struct sk_buff *skb,
199                                    const void *daddr)
200 {
201         struct neighbour *n;
202
203         daddr = choose_neigh_daddr(gw, skb, daddr);
204         n = __ipv6_neigh_lookup(dev, daddr);
205         if (n)
206                 return n;
207         return neigh_create(&nd_tbl, daddr, dev);
208 }
209
210 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
211                                               struct sk_buff *skb,
212                                               const void *daddr)
213 {
214         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
215
216         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
217 }
218
219 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
220 {
221         struct net_device *dev = dst->dev;
222         struct rt6_info *rt = (struct rt6_info *)dst;
223
224         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
225         if (!daddr)
226                 return;
227         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
228                 return;
229         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
230                 return;
231         __ipv6_confirm_neigh(dev, daddr);
232 }
233
/* dst_ops template for ordinary IPv6 routes; copied per netns. */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
252
253 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
254 {
255         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
256
257         return mtu ? : dst->dev->mtu;
258 }
259
/* PMTU updates are deliberately ignored on blackhole dsts. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
264
/* Redirects are deliberately ignored on blackhole dsts. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
269
/* dst_ops for blackhole dst copies: no gc, and PMTU/redirect handlers
 * are empty stubs.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
281
/* Metrics for the template routes below; hop limit 0 here means "not
 * explicitly set" (presumably falling back to the namespace default —
 * TODO confirm against ip6_default_advmss/ip6_dst_hoplimit users).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
285
/* Template for the per-netns fib6_null_entry: a lowest-priority
 * RTN_UNREACHABLE entry used when no real route matches.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
294
/* Template for ip6_null_entry: discards packets with -ENETUNREACH. */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
306
307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
308
/* Template for ip6_prohibit_entry: rejects packets with -EACCES. */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
320
/* Template for ip6_blk_hole_entry: silently drops packets (dst_discard),
 * error code -EINVAL.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
332
333 #endif
334
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* Zero everything after the embedded dst_entry, which dst_alloc()
	 * already initialized.
	 */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
342
343 /* allocate dst with ip6_dst_ops */
344 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
345                                int flags)
346 {
347         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
348                                         1, DST_OBSOLETE_FORCE_CHK, flags);
349
350         if (rt) {
351                 rt6_info_init(rt);
352                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
353         }
354
355         return rt;
356 }
357 EXPORT_SYMBOL(ip6_dst_alloc);
358
/* dst_ops->destroy: release everything an rt6_info holds — metrics,
 * uncached-list membership, the inet6_dev reference and the fib6_info
 * it was created from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* detach from the parent fib6_info under RCU and drop the
	 * reference that kept it alive
	 */
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
380
381 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
382                            int how)
383 {
384         struct rt6_info *rt = (struct rt6_info *)dst;
385         struct inet6_dev *idev = rt->rt6i_idev;
386         struct net_device *loopback_dev =
387                 dev_net(dev)->loopback_dev;
388
389         if (idev && idev->dev != loopback_dev) {
390                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
391                 if (loopback_idev) {
392                         rt->rt6i_idev = loopback_idev;
393                         in6_dev_put(idev);
394                 }
395         }
396 }
397
398 static bool __rt6_check_expired(const struct rt6_info *rt)
399 {
400         if (rt->rt6i_flags & RTF_EXPIRES)
401                 return time_after(jiffies, rt->dst.expires);
402         else
403                 return false;
404 }
405
/* An rt6_info is expired when its own RTF_EXPIRES deadline has passed,
 * or — lacking one — when its dst has been obsoleted or the parent
 * fib6_info it was cloned from has expired.  Caller must be in an RCU
 * read-side section (rt->from is rcu-dereferenced).
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
421
/* Pick one nexthop of a multipath route: hash the flow, then select the
 * first sibling whose upper bound covers the hash and whose score is
 * acceptable; fall back to @match otherwise.
 */
static struct fib6_info *rt6_multipath_select(const struct net *net,
					      struct fib6_info *match,
					     struct flowi6 *fl6, int oif,
					     const struct sk_buff *skb,
					     int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	/* the first nexthop owns the hash range up to its upper bound */
	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* hash range matched but the sibling scored unusable:
		 * keep the original match
		 */
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
454
455 /*
456  *      Route lookup. rcu_read_lock() should be held.
457  */
458
/* Find the first route starting at @rt whose nexthop device matches
 * @oif, or (without an @oif) whose device owns @saddr.  Returns
 * fib6_null_entry when a strict interface match (RT6_LOOKUP_F_IFACE)
 * was requested but none found.  rcu_read_lock must be held for the
 * fib6_next walk.
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct fib6_info *sprt;

	/* nothing to match against: take @rt unless its nexthop is dead */
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
492
493 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred neighbour-solicitation work item; holds a reference on @dev
 * until rt6_probe_deferred() releases it.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
499
500 static void rt6_probe_deferred(struct work_struct *w)
501 {
502         struct in6_addr mcaddr;
503         struct __rt6_probe_work *work =
504                 container_of(w, struct __rt6_probe_work, work);
505
506         addrconf_addr_solict_mult(&work->target, &mcaddr);
507         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
508         dev_put(work->dev);
509         kfree(work);
510 }
511
/* Router Reachability Probing: queue a Neighbor Solicitation toward the
 * gateway of @rt when its neighbour entry is absent or no longer
 * NUD_VALID.  Transmission is deferred to a workqueue
 * (rt6_probe_deferred).
 */
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		/* re-check under the neigh lock; rate-limit by
		 * rtr_probe_interval since the last neighbour update
		 */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);	/* released by rt6_probe_deferred() */
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
566 #else
/* Router reachability probing is a no-op without CONFIG_IPV6_ROUTER_PREF */
static inline void rt6_probe(struct fib6_info *rt)
{
}
570 #endif
571
572 /*
573  * Default Router Selection (RFC 2461 6.3.6)
574  */
575 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
576 {
577         const struct net_device *dev = rt->fib6_nh.nh_dev;
578
579         if (!oif || dev->ifindex == oif)
580                 return 2;
581         return 0;
582 }
583
/* Classify the reachability of @rt's gateway neighbour for route
 * scoring.  Non-gateway and RTF_NONEXTHOP routes always succeed.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		/* with router preferences, tolerate any state short of
		 * NUD_FAILED; a failed neighbour asks for a probe
		 */
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		/* no neighbour entry: succeed optimistically with router
		 * preferences, otherwise request round-robin
		 */
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
615
/* Compute a comparable score for @rt: interface match contributes 2,
 * and with CONFIG_IPV6_ROUTER_PREF the decoded router preference is
 * or-ed in shifted left by 2.  Returns a negative rt6_nud_state value
 * when the route is unusable.
 */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
633
634 /* called with rc_read_lock held */
635 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
636 {
637         const struct net_device *dev = fib6_info_nh_dev(f6i);
638         bool rc = false;
639
640         if (dev) {
641                 const struct inet6_dev *idev = __in6_dev_get(dev);
642
643                 rc = !!idev->cnf.ignore_routes_with_linkdown;
644         }
645
646         return rc;
647 }
648
/* Score @rt and return whichever of @rt / @match scores higher,
 * updating *mpri accordingly.  Sets *do_rr when the winning score came
 * from RT6_NUD_FAIL_DO_RR.  Dead, link-down (unless ignored by policy)
 * and expired routes are skipped.
 */
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
687
/* Scan the routes of @fn sharing @metric, starting at @rr_head and
 * wrapping around via @leaf, returning the best-scoring one.  Routes
 * with a different metric (recorded in "cont") are only considered
 * when no same-metric route matched.
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* first pass: from the round-robin head to the end of the list */
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* second pass: wrap around from the leaf up to rr_head */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* fall back to the routes with a different metric */
	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
726
/* Select the best route in node @fn, honouring the round-robin pointer
 * fn->rr_ptr; advances rr_ptr (under tb6_lock) when find_rr_leaf asked
 * for round-robin.  Must run under rcu_read_lock (leaf/rr_ptr/fib6_next
 * are rcu-dereferenced).
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
776
777 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
778 {
779         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
780 }
781
782 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Handle a Route Information option received in a Router Advertisement:
 * validate the option, then add, refresh or delete the matching
 * RTF_ROUTEINFO route depending on the advertised lifetime.  Returns 0
 * on success, -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* prefixes longer than 64 bits need at least 2 units */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* a zero-length prefix designates a default route */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		/* drop the reference taken by the get/add helpers */
		fib6_info_release(rt);
	}
	return 0;
}
856 #endif
857
858 /*
859  *      Misc support functions
860  */
861
/* called with rcu_lock held */
/* Resolve the net_device a dst copied from @rt should carry.  For plain
 * routes that is the nexthop device; local/anycast copies are redirected
 * to the l3mdev master or the loopback device as described below.
 */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
884
/* Per route-type property table: maps each RTN_* type to the dst.error
 * value reported for it (0 for deliverable/forwardable types, a negative
 * errno for reject-style routes).  Indexed by ip6_rt_type_to_error().
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
899
/* Translate a route type (RTN_*) into the dst.error value to report.
 * NOTE(review): no bounds check — assumes fib6_type <= RTN_MAX, which is
 * presumably enforced where routes are configured; confirm call sites
 * before feeding raw/unvalidated values here.
 */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
904
905 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
906 {
907         unsigned short flags = 0;
908
909         if (rt->dst_nocount)
910                 flags |= DST_NOCOUNT;
911         if (rt->dst_nopolicy)
912                 flags |= DST_NOPOLICY;
913         if (rt->dst_host)
914                 flags |= DST_HOST;
915
916         return flags;
917 }
918
919 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
920 {
921         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
922
923         switch (ort->fib6_type) {
924         case RTN_BLACKHOLE:
925                 rt->dst.output = dst_discard_out;
926                 rt->dst.input = dst_discard;
927                 break;
928         case RTN_PROHIBIT:
929                 rt->dst.output = ip6_pkt_prohibit_out;
930                 rt->dst.input = ip6_pkt_prohibit;
931                 break;
932         case RTN_THROW:
933         case RTN_UNREACHABLE:
934         default:
935                 rt->dst.output = ip6_pkt_discard_out;
936                 rt->dst.input = ip6_pkt_discard;
937                 break;
938         }
939 }
940
/* Initialize the dst portion of @rt from fib6 entry @ort: allocation
 * flags, error code, input/output handlers (reject, local delivery,
 * multicast or forwarding) and lightweight-tunnel state.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	/* reject routes get error + discard handlers and nothing else */
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
968
/* Bind copy/cache route @rt to its parent fib6 entry @from: take a
 * reference on @from, publish it via rt->from, and share @from's metrics.
 * Non-default metrics get an extra refcount so they outlive @from while
 * the dst still points at them (DST_METRICS_REFCOUNTED marks that).
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	fib6_info_hold(from);
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}
980
/* Copy routing state from fib6 entry @ort into dst-based @rt: dst
 * handlers, destination/source prefixes, gateway, flags, parent linkage
 * (rt6_set_from) and lwtunnel state.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	/* idev reference is dropped when the dst is destroyed */
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}
998
/* Walk back up the fib6 tree from @fn to the next node carrying route
 * info (RTN_RTINFO), descending into a parent's source-routing subtree
 * when one exists.  Returns NULL once the tree root is reached.
 * Runs under rcu_read_lock (fn->parent is rcu_dereference'd).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1016
1017 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1018                           bool null_fallback)
1019 {
1020         struct rt6_info *rt = *prt;
1021
1022         if (dst_hold_safe(&rt->dst))
1023                 return true;
1024         if (null_fallback) {
1025                 rt = net->ipv6.ip6_null_entry;
1026                 dst_hold(&rt->dst);
1027         } else {
1028                 rt = NULL;
1029         }
1030         *prt = rt;
1031         return false;
1032 }
1033
1034 /* called with rcu_lock held */
1035 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1036 {
1037         unsigned short flags = fib6_info_dst_flags(rt);
1038         struct net_device *dev = rt->fib6_nh.nh_dev;
1039         struct rt6_info *nrt;
1040
1041         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1042         if (nrt)
1043                 ip6_rt_copy_init(nrt, rt);
1044
1045         return nrt;
1046 }
1047
/* Table lookup for the non-cached path: walk @table under RCU, pick a
 * matching fib6_info (device match + multipath selection), then return
 * a refcounted rt6_info — a cached exception route when one exists,
 * otherwise a freshly created dst copy, falling back to the null entry.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	/* caller asked to ignore the nexthop OIF restriction */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = rt6_multipath_select(net, f6i, fl6,
						   fl6->flowi6_oif, skb, flags);
	}
	/* nothing usable at this node: back up the tree and retry */
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		/* create a dst copy; fall back to null entry on OOM */
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}
1102
/* Public lookup entry point: dispatch through the fib6 policy rules,
 * resolving each matching table via ip6_pol_route_lookup().
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1109
1110 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1111                             const struct in6_addr *saddr, int oif,
1112                             const struct sk_buff *skb, int strict)
1113 {
1114         struct flowi6 fl6 = {
1115                 .flowi6_oif = oif,
1116                 .daddr = *daddr,
1117         };
1118         struct dst_entry *dst;
1119         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1120
1121         if (saddr) {
1122                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1123                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1124         }
1125
1126         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1127         if (dst->error == 0)
1128                 return (struct rt6_info *) dst;
1129
1130         dst_release(dst);
1131
1132         return NULL;
1133 }
1134 EXPORT_SYMBOL(rt6_lookup);
1135
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes new route entry, the addition fails by any reason the
 * route is released.
 * Caller must hold dst before calling it.
 */

/* Insert @rt into its table's fib6 tree under tb6_lock, which
 * serializes all writers of that tree.  Returns fib6_add()'s result.
 */
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}
1155
/* Convenience wrapper around __ip6_ins_rt() with only the netns filled
 * in the netlink info (no portid/seq, no extack).
 */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
1162
/* Clone fib6 entry @ort into an RTF_CACHE host route pinned to the
 * exact (daddr[, saddr]) pair — used for PMTU/redirect exceptions.
 * Caller must hold rcu_read_lock() (required by ip6_rt_get_dev_rcu()).
 * Returns NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	/* the clone is a /128 host route even if @ort covered a prefix */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1199
/* Allocate a per-cpu dst copy of fib6 entry @rt, marked RTF_PCPU.
 * Returns NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	/* rcu protects the nexthop device resolution */
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1216
/* It should be called with rcu_read_lock() acquired */
/* Fetch this CPU's cached dst copy of @rt, taking a reference on it.
 * Returns NULL when the slot is empty or the hold fails (no null-entry
 * fallback), so the caller can allocate a fresh copy.
 */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}
1230
/* Allocate a pcpu copy of @rt and install it into this CPU's slot,
 * returning it with a reference held.  On allocation failure the
 * (held) null entry is returned instead.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	/* the slot is expected to be empty here; a non-NULL prev would
	 * indicate a racing writer for this CPU's slot — treat as a bug
	 */
	BUG_ON(prev);

	return pcpu_rt;
}
1249
/* exception hash table implementation
 *
 * Cached (RTF_CACHE) exception routes hang off their parent fib6_info
 * in per-entry hash buckets; this single lock serializes all writers
 * of those buckets, while readers traverse them under RCU.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1253
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	/* drop the table's reference on the cached route */
	dst_release(&rt6_ex->rt6i->dst);
	/* RCU readers may still traverse the entry: free after grace period */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}
1273
/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	/* oldest == entry with the earliest (least recently updated) stamp */
	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}
1290
/* Hash (dst[, src]) into a bucket index in
 * [0, 2^FIB6_EXCEPTION_BUCKET_SIZE_SHIFT).  The seed is initialized
 * lazily, once per boot, so bucket placement is not predictable.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	/* src is folded in only for source-routed (subtree) entries */
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1306
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	/* advance *bucket to the hashed slot so the caller can operate on it */
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1339
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	/* advance *bucket to the hashed slot so the caller can operate on it */
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1374
1375 static unsigned int fib6_mtu(const struct fib6_info *rt)
1376 {
1377         unsigned int mtu;
1378
1379         if (rt->fib6_pmtu) {
1380                 mtu = rt->fib6_pmtu;
1381         } else {
1382                 struct net_device *dev = fib6_info_nh_dev(rt);
1383                 struct inet6_dev *idev;
1384
1385                 rcu_read_lock();
1386                 idev = __in6_dev_get(dev);
1387                 mtu = idev->cnf.mtu6;
1388                 rcu_read_unlock();
1389         }
1390
1391         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1392
1393         return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1394 }
1395
/* Insert cached route @nrt into the exception table of its parent
 * fib6 entry @ort, replacing any existing entry for the same
 * (daddr, saddr).  On success the table sernum is bumped so stale
 * cached dsts are invalidated, and GC is kicked.
 * Returns 0 or a negative errno (-EINVAL, -ENOMEM).
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* @ort is on its way out: rt6_flush_exceptions() already ran */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		/* first exception for @ort: create the bucket array */
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any existing exception for the same (daddr, saddr) */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* cap the chain length by evicting the least recently stamped entry */
	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1480
/* Drop every cached exception route attached to @rt and mark the entry
 * flushed so rt6_insert_exception() cannot re-create the bucket list
 * afterwards.
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		/* every bucket must be empty once its chain is drained */
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1507
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	/* expired entries are skipped here; GC reaps them later */
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1539
/* Remove the passed in cached rt from the hash table that contains it */
/* Returns 0 on success, -EINVAL when @rt is not a cached route or has
 * no parent, -ENOENT when no matching exception exists.
 * NOTE(review): rcu_dereference(rt->from) relies on the caller holding
 * rcu_read_lock() — confirm at the call sites.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1583
1584 /* Find rt6_ex which contains the passed in rt cache and
1585  * refresh its stamp
1586  */
1587 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1588 {
1589         struct rt6_exception_bucket *bucket;
1590         struct fib6_info *from = rt->from;
1591         struct in6_addr *src_key = NULL;
1592         struct rt6_exception *rt6_ex;
1593
1594         if (!from ||
1595             !(rt->rt6i_flags & RTF_CACHE))
1596                 return;
1597
1598         rcu_read_lock();
1599         bucket = rcu_dereference(from->rt6i_exception_bucket);
1600
1601 #ifdef CONFIG_IPV6_SUBTREES
1602         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1603          * and exception table is indexed by a hash of
1604          * both rt6i_dst and rt6i_src.
1605          * Otherwise, the exception table is indexed by
1606          * a hash of only rt6i_dst.
1607          */
1608         if (from->fib6_src.plen)
1609                 src_key = &rt->rt6i_src.addr;
1610 #endif
1611         rt6_ex = __rt6_find_exception_rcu(&bucket,
1612                                           &rt->rt6i_dst.addr,
1613                                           src_key);
1614         if (rt6_ex)
1615                 rt6_ex->stamp = jiffies;
1616
1617         rcu_read_unlock();
1618 }
1619
/* Clear the saved preferred source address on every cached exception
 * of @rt (companion of rt6_remove_prefsrc() for the parent entry).
 * Caller must hold rt6_exception_lock (per rcu_dereference_protected).
 */
static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				/* plen == 0 marks "no prefsrc" */
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}
1638
1639 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1640                                          struct rt6_info *rt, int mtu)
1641 {
1642         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1643          * lowest MTU in the path: always allow updating the route PMTU to
1644          * reflect PMTU decreases.
1645          *
1646          * If the new MTU is higher, and the route PMTU is equal to the local
1647          * MTU, this means the old MTU is the lowest in the path, so allow
1648          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1649          * handle this.
1650          */
1651
1652         if (dst_mtu(&rt->dst) >= mtu)
1653                 return true;
1654
1655         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1656                 return true;
1657
1658         return false;
1659 }
1660
/* Propagate a device MTU change to @rt's cached exceptions, where
 * permitted by rt6_mtu_change_route_allowed().
 * Caller must hold rt6_exception_lock (per rcu_dereference_protected).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1689
/* flag combination marking a cached exception that goes via a gateway */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Drop every cached gateway exception of @rt whose gateway equals
 * @gateway.  Takes rt6_exception_lock for the duration of the sweep.
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* lock-free fast path: nothing to clean */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1726
/* GC helper: remove exception @rt6_ex when it has aged out, expired,
 * or points at a gateway no longer advertising itself as a router;
 * otherwise count it in gc_args->more so GC keeps running.
 * Called by rt6_age_exceptions() with rt6_exception_lock held and
 * inside rcu_read_lock_bh (needed for the noref neighbour lookup).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1770
/* Walk all exception buckets of @rt and garbage-collect each entry via
 * rt6_age_examine_exception().  rcu_read_lock_bh() is taken because the
 * examine step does a lockless neighbour lookup; rt6_exception_lock
 * serializes against concurrent insert/remove.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Lockless fast path: no exception table at all. */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe walk: examine may unlink the entry. */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1801
/* Core policy-routing lookup for one table.
 *
 * Finds the best fib6_info for @fl6 in @table (with multipath
 * selection and backtracking), then returns a dst-holding rt6_info:
 * either a cached exception entry, an uncached RTF_CACHE clone (for
 * the FLOWI_FLAG_KNOWN_NH special case), or a per-cpu copy.
 * Always returns a route with a reference held (possibly the null
 * entry); never returns NULL.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* Hosts (forwarding disabled) prefer reachable routers first. */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i->fib6_nsiblings)
		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		/* No match at this node: walk up the tree, and as a last
		 * resort retry from the original node without the
		 * REACHABLE restriction.
		 */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			/* Allocation failed: fall back to the null route. */
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* BH must be off: the percpu cache is shared with softirq. */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1902
/* fib6_rule_lookup() callback for the input path: resolve using the
 * ingress interface (flowi6_iif) as the oif hint.
 */
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}
1911
1912 struct dst_entry *ip6_route_input_lookup(struct net *net,
1913                                          struct net_device *dev,
1914                                          struct flowi6 *fl6,
1915                                          const struct sk_buff *skb,
1916                                          int flags)
1917 {
1918         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1919                 flags |= RT6_LOOKUP_F_IFACE;
1920
1921         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1922 }
1923 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1924
1925 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1926                                   struct flow_keys *keys,
1927                                   struct flow_keys *flkeys)
1928 {
1929         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1930         const struct ipv6hdr *key_iph = outer_iph;
1931         struct flow_keys *_flkeys = flkeys;
1932         const struct ipv6hdr *inner_iph;
1933         const struct icmp6hdr *icmph;
1934         struct ipv6hdr _inner_iph;
1935         struct icmp6hdr _icmph;
1936
1937         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1938                 goto out;
1939
1940         icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1941                                    sizeof(_icmph), &_icmph);
1942         if (!icmph)
1943                 goto out;
1944
1945         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1946             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1947             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1948             icmph->icmp6_type != ICMPV6_PARAMPROB)
1949                 goto out;
1950
1951         inner_iph = skb_header_pointer(skb,
1952                                        skb_transport_offset(skb) + sizeof(*icmph),
1953                                        sizeof(_inner_iph), &_inner_iph);
1954         if (!inner_iph)
1955                 goto out;
1956
1957         key_iph = inner_iph;
1958         _flkeys = NULL;
1959 out:
1960         if (_flkeys) {
1961                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1962                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1963                 keys->tags.flow_label = _flkeys->tags.flow_label;
1964                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1965         } else {
1966                 keys->addrs.v6addrs.src = key_iph->saddr;
1967                 keys->addrs.v6addrs.dst = key_iph->daddr;
1968                 keys->tags.flow_label = ip6_flowinfo(key_iph);
1969                 keys->basic.ip_proto = key_iph->nexthdr;
1970         }
1971 }
1972
/* Compute the multipath hash for a flow.
 * If @skb is set it is used as the source of the keys and @fl6 may be
 * NULL; otherwise the keys come from @fl6.  @flkeys, when non-NULL,
 * carries already-dissected keys to avoid a second dissection.
 * Policy 0 hashes L3 fields only; policy 1 hashes the L4 5-tuple.
 * The result is shifted right by one so the value is never used with
 * its top bit set (callers reserve it).
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		/* L3 hash: addresses, flow label, protocol. */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		/* L4 hash: addresses, ports, protocol. */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
2029
/* Input-path route resolution: build a flowi6 from the packet headers
 * (and tunnel metadata, if any), compute the multipath hash for ICMPv6
 * so errors follow their flow, then attach the looked-up dst to @skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* Collect-metadata tunnels key the lookup on the tunnel id. */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* ICMPv6 errors must hash like the flow they refer to. */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2059
/* fib6_rule_lookup() callback for the output path: resolve using the
 * egress interface (flowi6_oif) as the oif hint.
 */
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}
2068
/* Output-path route lookup.  Link-local/multicast destinations are
 * first offered to an L3 master device (VRF); otherwise the policy
 * tables are consulted with flags derived from the socket binding and
 * source-address state.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		/* Let an l3mdev (VRF) resolve link-scope destinations. */
		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	/* Locally generated traffic: input interface is loopback. */
	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		/* No source yet: honour the socket's address preferences. */
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2097
/* Build a blackhole copy of @dst_orig (all traffic discarded) on the
 * loopback device, preserving metrics, gateway and keys so the copy
 * still matches lookups.  Consumes the reference on @dst_orig and
 * returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* Discard in both directions: that is the blackhole. */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* Not a per-cpu entry even if the original was. */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	/* Ownership of dst_orig's reference was passed to us. */
	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2130
2131 /*
2132  *      Destination cache support functions
2133  */
2134
2135 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2136 {
2137         u32 rt_cookie = 0;
2138
2139         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2140                 return false;
2141
2142         if (fib6_check_expired(f6i))
2143                 return false;
2144
2145         return true;
2146 }
2147
2148 static struct dst_entry *rt6_check(struct rt6_info *rt,
2149                                    struct fib6_info *from,
2150                                    u32 cookie)
2151 {
2152         u32 rt_cookie = 0;
2153
2154         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2155             rt_cookie != cookie)
2156                 return NULL;
2157
2158         if (rt6_check_expired(rt))
2159                 return NULL;
2160
2161         return &rt->dst;
2162 }
2163
2164 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2165                                             struct fib6_info *from,
2166                                             u32 cookie)
2167 {
2168         if (!__rt6_check_expired(rt) &&
2169             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2170             fib6_check(from, cookie))
2171                 return &rt->dst;
2172         else
2173                 return NULL;
2174 }
2175
/* dst_ops->check() for IPv6: decide whether a cached dst may still be
 * used.  Returns the dst itself when valid, NULL when the caller must
 * relookup.  Clones (pcpu/uncached) are validated against their parent
 * fib entry, plain entries against their own cookie.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2203
/* dst_ops->negative_advice(): the caller suspects @dst is bad.
 * Expired cache exceptions are removed from the exception table;
 * non-cache entries are simply released.  Returns the dst to keep, or
 * NULL when the caller should drop its pointer.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			/* Non-cache dst: give up our reference. */
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2223
/* dst_ops->link_failure(): notify the sender with an ICMPv6 error and
 * invalidate the route that failed.  Cache exceptions are removed
 * outright; tree routes have their node's sernum poisoned so cached
 * dsts fail their next ip6_dst_check().
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			/* NOTE(review): this relies on rt6_remove_exception_rt()
			 * consuming the reference taken by dst_hold_safe() —
			 * verify, otherwise this path leaks a dst reference.
			 */
			if (dst_hold_safe(&rt->dst))
				rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				/* Force revalidation of cached copies. */
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
2250
/* Arm (or shorten) the expiry timer of @rt0 to @timeout from now.
 * If the route did not expire before, seed dst.expires from the parent
 * fib entry first so dst_set_expires() only ever shortens the deadline.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2266
/* Record a learned path MTU on @rt and let the override expire after
 * the per-netns ip6_rt_mtu_expires interval (RFC 8201 behavior).
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
2275
2276 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2277 {
2278         bool from_set;
2279
2280         rcu_read_lock();
2281         from_set = !!rcu_dereference(rt->from);
2282         rcu_read_unlock();
2283
2284         return !(rt->rt6i_flags & RTF_CACHE) &&
2285                 (rt->rt6i_flags & RTF_PCPU || from_set);
2286 }
2287
2288 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2289                                  const struct ipv6hdr *iph, u32 mtu)
2290 {
2291         const struct in6_addr *daddr, *saddr;
2292         struct rt6_info *rt6 = (struct rt6_info *)dst;
2293
2294         if (rt6->rt6i_flags & RTF_LOCAL)
2295                 return;
2296
2297         if (dst_metric_locked(dst, RTAX_MTU))
2298                 return;
2299
2300         if (iph) {
2301                 daddr = &iph->daddr;
2302                 saddr = &iph->saddr;
2303         } else if (sk) {
2304                 daddr = &sk->sk_v6_daddr;
2305                 saddr = &inet6_sk(sk)->saddr;
2306         } else {
2307                 daddr = NULL;
2308                 saddr = NULL;
2309         }
2310         dst_confirm_neigh(dst, daddr);
2311         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2312         if (mtu >= dst_mtu(dst))
2313                 return;
2314
2315         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2316                 rt6_do_update_pmtu(rt6, mtu);
2317                 /* update rt6_ex->stamp for cache */
2318                 if (rt6->rt6i_flags & RTF_CACHE)
2319                         rt6_update_exception_stamp_rt(rt6);
2320         } else if (daddr) {
2321                 struct fib6_info *from;
2322                 struct rt6_info *nrt6;
2323
2324                 rcu_read_lock();
2325                 from = rcu_dereference(rt6->from);
2326                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2327                 if (nrt6) {
2328                         rt6_do_update_pmtu(nrt6, mtu);
2329                         if (rt6_insert_exception(nrt6, from))
2330                                 dst_release_immediate(&nrt6->dst);
2331                 }
2332                 rcu_read_unlock();
2333         }
2334 }
2335
/* dst_ops->update_pmtu() entry point: forward to the worker, taking
 * the IPv6 header from @skb when one is available.
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
2341
2342 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2343                      int oif, u32 mark, kuid_t uid)
2344 {
2345         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2346         struct dst_entry *dst;
2347         struct flowi6 fl6;
2348
2349         memset(&fl6, 0, sizeof(fl6));
2350         fl6.flowi6_oif = oif;
2351         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2352         fl6.daddr = iph->daddr;
2353         fl6.saddr = iph->saddr;
2354         fl6.flowlabel = ip6_flowinfo(iph);
2355         fl6.flowi6_uid = uid;
2356
2357         dst = ip6_route_output(net, NULL, &fl6);
2358         if (!dst->error)
2359                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2360         dst_release(dst);
2361 }
2362 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2363
/* Socket-aware PMTU update: apply the new MTU to the flow's route,
 * then refresh the socket's cached dst if it has become stale.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	/* Nothing more to do if the cached dst is absent or still valid. */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2382
/* Cache @dst on @sk, remembering which flow addresses it was resolved
 * for: the daddr (and, with subtrees, the saddr) is recorded only when
 * it matches the socket's own address, so later lookups can tell
 * whether the cached dst still applies.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2399
/* Handle redirects: a flowi6 extended with the address of the router
 * that sent the redirect, so __ip6_route_redirect() can verify the
 * sender is the nexthop currently in use (RFC 4861).
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2405
2406 static struct rt6_info *__ip6_route_redirect(struct net *net,
2407                                              struct fib6_table *table,
2408                                              struct flowi6 *fl6,
2409                                              const struct sk_buff *skb,
2410                                              int flags)
2411 {
2412         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2413         struct rt6_info *ret = NULL, *rt_cache;
2414         struct fib6_info *rt;
2415         struct fib6_node *fn;
2416
2417         /* Get the "current" route for this destination and
2418          * check if the redirect has come from appropriate router.
2419          *
2420          * RFC 4861 specifies that redirects should only be
2421          * accepted if they come from the nexthop to the target.
2422          * Due to the way the routes are chosen, this notion
2423          * is a bit fuzzy and one might need to check all possible
2424          * routes.
2425          */
2426
2427         rcu_read_lock();
2428         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2429 restart:
2430         for_each_fib6_node_rt_rcu(fn) {
2431                 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2432                         continue;
2433                 if (fib6_check_expired(rt))
2434                         continue;
2435                 if (rt->fib6_flags & RTF_REJECT)
2436                         break;
2437                 if (!(rt->fib6_flags & RTF_GATEWAY))
2438                         continue;
2439                 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2440                         continue;
2441                 /* rt_cache's gateway might be different from its 'parent'
2442                  * in the case of an ip redirect.
2443                  * So we keep searching in the exception table if the gateway
2444                  * is different.
2445                  */
2446                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2447                         rt_cache = rt6_find_cached_rt(rt,
2448                                                       &fl6->daddr,
2449                                                       &fl6->saddr);
2450                         if (rt_cache &&
2451                             ipv6_addr_equal(&rdfl->gateway,
2452                                             &rt_cache->rt6i_gateway)) {
2453                                 ret = rt_cache;
2454                                 break;
2455                         }
2456                         continue;
2457                 }
2458                 break;
2459         }
2460
2461         if (!rt)
2462                 rt = net->ipv6.fib6_null_entry;
2463         else if (rt->fib6_flags & RTF_REJECT) {
2464                 ret = net->ipv6.ip6_null_entry;
2465                 goto out;
2466         }
2467
2468         if (rt == net->ipv6.fib6_null_entry) {
2469                 fn = fib6_backtrack(fn, &fl6->saddr);
2470                 if (fn)
2471                         goto restart;
2472         }
2473
2474 out:
2475         if (ret)
2476                 dst_hold(&ret->dst);
2477         else
2478                 ret = ip6_create_rt_rcu(rt);
2479
2480         rcu_read_unlock();
2481
2482         trace_fib6_table_lookup(net, ret, table, fl6);
2483         return ret;
2484 };
2485
2486 static struct dst_entry *ip6_route_redirect(struct net *net,
2487                                             const struct flowi6 *fl6,
2488                                             const struct sk_buff *skb,
2489                                             const struct in6_addr *gateway)
2490 {
2491         int flags = RT6_LOOKUP_F_HAS_SADDR;
2492         struct ip6rd_flowi rdfl;
2493
2494         rdfl.fl6 = *fl6;
2495         rdfl.gateway = *gateway;
2496
2497         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2498                                 flags, __ip6_route_redirect);
2499 }
2500
2501 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2502                   kuid_t uid)
2503 {
2504         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2505         struct dst_entry *dst;
2506         struct flowi6 fl6;
2507
2508         memset(&fl6, 0, sizeof(fl6));
2509         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2510         fl6.flowi6_oif = oif;
2511         fl6.flowi6_mark = mark;
2512         fl6.daddr = iph->daddr;
2513         fl6.saddr = iph->saddr;
2514         fl6.flowlabel = ip6_flowinfo(iph);
2515         fl6.flowi6_uid = uid;
2516
2517         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2518         rt6_do_redirect(dst, NULL, skb);
2519         dst_release(dst);
2520 }
2521 EXPORT_SYMBOL_GPL(ip6_redirect);
2522
2523 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2524                             u32 mark)
2525 {
2526         const struct ipv6hdr *iph = ipv6_hdr(skb);
2527         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2528         struct dst_entry *dst;
2529         struct flowi6 fl6;
2530
2531         memset(&fl6, 0, sizeof(fl6));
2532         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2533         fl6.flowi6_oif = oif;
2534         fl6.flowi6_mark = mark;
2535         fl6.daddr = msg->dest;
2536         fl6.saddr = iph->daddr;
2537         fl6.flowi6_uid = sock_net_uid(net, NULL);
2538
2539         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2540         rt6_do_redirect(dst, NULL, skb);
2541         dst_release(dst);
2542 }
2543
/* Socket-context redirect handler: apply the redirect in @skb using
 * the socket's namespace, bound device, mark and uid so the lookup
 * matches the route the socket's traffic actually takes.
 */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2550
2551 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2552 {
2553         struct net_device *dev = dst->dev;
2554         unsigned int mtu = dst_mtu(dst);
2555         struct net *net = dev_net(dev);
2556
2557         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2558
2559         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2560                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2561
2562         /*
2563          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2564          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2565          * IPV6_MAXPLEN is also valid and means: "any MSS,
2566          * rely only on pmtu discovery"
2567          */
2568         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2569                 mtu = IPV6_MAXPLEN;
2570         return mtu;
2571 }
2572
2573 static unsigned int ip6_mtu(const struct dst_entry *dst)
2574 {
2575         struct inet6_dev *idev;
2576         unsigned int mtu;
2577
2578         mtu = dst_metric_raw(dst, RTAX_MTU);
2579         if (mtu)
2580                 goto out;
2581
2582         mtu = IPV6_MIN_MTU;
2583
2584         rcu_read_lock();
2585         idev = __in6_dev_get(dst->dev);
2586         if (idev)
2587                 mtu = idev->cnf.mtu6;
2588         rcu_read_unlock();
2589
2590 out:
2591         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2592
2593         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2594 }
2595
/* Allocate an uncached host route to use as the dst for an outgoing
 * ICMPv6 message.  The route is never inserted into a FIB table; it is
 * put on the uncached list instead so device teardown can still find
 * and release it.  Returns the (possibly xfrm-transformed) dst, or an
 * ERR_PTR() on failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		/* allocation failed: drop the idev reference we took above */
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	/* ownership of the idev reference transfers to the route here */
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2634
/* dst_ops garbage collector for the IPv6 routing cache.
 *
 * The expensive fib6 walk is skipped when the previous GC ran less than
 * ip6_rt_gc_min_interval ago and the entry count is still within
 * ip6_rt_max_size.  ip6_rt_gc_expire is bumped on every forced run
 * (making expiry progressively more aggressive under pressure), reset
 * to half the gc timeout once the count drops below gc_thresh, and
 * decays by 1/2^elasticity on every call.
 *
 * Returns nonzero while the table is still over rt_max_size, i.e. the
 * triggering allocation should fail.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* exponential decay of the forced-expiry pressure */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2659
2660 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2661                                struct fib6_config *cfg)
2662 {
2663         struct dst_metrics *p;
2664
2665         if (!cfg->fc_mx)
2666                 return 0;
2667
2668         p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2669         if (unlikely(!p))
2670                 return -ENOMEM;
2671
2672         refcount_set(&p->refcnt, 1);
2673         rt->fib6_metrics = p;
2674
2675         return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2676 }
2677
2678 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2679                                             struct fib6_config *cfg,
2680                                             const struct in6_addr *gw_addr,
2681                                             u32 tbid, int flags)
2682 {
2683         struct flowi6 fl6 = {
2684                 .flowi6_oif = cfg->fc_ifindex,
2685                 .daddr = *gw_addr,
2686                 .saddr = cfg->fc_prefsrc,
2687         };
2688         struct fib6_table *table;
2689         struct rt6_info *rt;
2690
2691         table = fib6_get_table(net, tbid);
2692         if (!table)
2693                 return NULL;
2694
2695         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2696                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2697
2698         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2699         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2700
2701         /* if table lookup failed, fall back to full lookup */
2702         if (rt == net->ipv6.ip6_null_entry) {
2703                 ip6_rt_put(rt);
2704                 rt = NULL;
2705         }
2706
2707         return rt;
2708 }
2709
2710 static int ip6_route_check_nh_onlink(struct net *net,
2711                                      struct fib6_config *cfg,
2712                                      const struct net_device *dev,
2713                                      struct netlink_ext_ack *extack)
2714 {
2715         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2716         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2717         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2718         struct rt6_info *grt;
2719         int err;
2720
2721         err = 0;
2722         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2723         if (grt) {
2724                 if (!grt->dst.error &&
2725                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2726                         NL_SET_ERR_MSG(extack,
2727                                        "Nexthop has invalid gateway or device mismatch");
2728                         err = -EINVAL;
2729                 }
2730
2731                 ip6_rt_put(grt);
2732         }
2733
2734         return err;
2735 }
2736
/* Resolve and validate the nexthop of a new gateway route by looking
 * up the gateway address itself.  When the caller did not pin a device
 * (*_dev == NULL), the egress device and inet6_dev of the route found
 * are handed back through *_dev and *idev with references held.
 * Returns 0 when the gateway is directly reachable (resolves to a
 * non-RTF_GATEWAY route), -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		/* prefer a lookup confined to the table the new route
		 * is destined for
		 */
		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* a route via another gateway, or via a device
			 * other than the requested one, cannot validate
			 * the nexthop: discard and fall back below
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* adopt the device/idev the lookup produced, taking our
		 * own references since grt is released just below
		 */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2787
/* Validate the gateway of a route being added and, when no egress
 * device was specified, resolve one via the gateway (updating *_dev
 * and *idev with references held).  Returns 0 when the gateway is
 * usable; a negative errno with an extack message otherwise.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* for a link-local gateway, a local address on the same device
	 * is legitimate, so do not skip the device in the check below
	 */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		/* either path may replace *_dev/*idev with the device
		 * the gateway actually resolves through
		 */
		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2860
/* Build (but do not insert) a fib6_info from a netlink/ioctl route
 * request.  Validates the request, resolves the nexthop device and
 * table, and returns the new route with a reference held on its
 * nexthop device; the caller inserts it via __ip6_ins_rt() and is
 * responsible for releasing the route.  Returns ERR_PTR() on failure.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* onlink nexthops require an explicit, administratively up device */
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	/* addrconf-generated routes are not counted against gc limits */
	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	err = ip6_convert_metrics(net, rt, cfg);
	if (err < 0)
		goto out;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif

	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* may replace dev/idev with the gateway's egress device */
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	rt->fib6_flags = cfg->fc_flags;

install_route:
	/* mark the nexthop LINKDOWN when the carrier is off, except for
	 * local/anycast routes which stay usable
	 */
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	rt->fib6_nh.nh_dev = dev;
	rt->fib6_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	/* the dev reference moves into rt->fib6_nh.nh_dev; the idev
	 * reference is no longer needed
	 */
	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	fib6_info_release(rt);
	return ERR_PTR(err);
}
3083
3084 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3085                   struct netlink_ext_ack *extack)
3086 {
3087         struct fib6_info *rt;
3088         int err;
3089
3090         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3091         if (IS_ERR(rt))
3092                 return PTR_ERR(rt);
3093
3094         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3095         fib6_info_release(rt);
3096
3097         return err;
3098 }
3099
3100 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3101 {
3102         struct net *net = info->nl_net;
3103         struct fib6_table *table;
3104         int err;
3105
3106         if (rt == net->ipv6.fib6_null_entry) {
3107                 err = -ENOENT;
3108                 goto out;
3109         }
3110
3111         table = rt->fib6_table;
3112         spin_lock_bh(&table->tb6_lock);
3113         err = fib6_del(rt, info);
3114         spin_unlock_bh(&table->tb6_lock);
3115
3116 out:
3117         fib6_info_release(rt);
3118         return err;
3119 }
3120
3121 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3122 {
3123         struct nl_info info = { .nl_net = net };
3124
3125         return __ip6_del_rt(rt, &info);
3126 }
3127
/* Delete a multipath route: when fc_delete_all_nh is set, @rt and all
 * of its siblings are removed under a single table lock, and a single
 * RTM_DELROUTE notification describing every hop is sent (falling back
 * to per-hop notifications from fib6_del() if the skb cannot be
 * built).  Consumes the caller's reference on @rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				/* suppress fib6_del()'s own notifications */
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* notification is sent outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3179
3180 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3181 {
3182         int rc = -ESRCH;
3183
3184         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3185                 goto out;
3186
3187         if (cfg->fc_flags & RTF_GATEWAY &&
3188             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3189                 goto out;
3190         if (dst_hold_safe(&rt->dst))
3191                 rc = rt6_remove_exception_rt(rt);
3192 out:
3193         return rc;
3194 }
3195
/* Delete the route(s) matching @cfg from the table it names.
 *
 * The tree walk runs under rcu_read_lock(); every return path inside
 * the walk must drop it first.  The deletion helpers take the table
 * spinlock themselves, after we have pinned the chosen entry with a
 * reference so it survives leaving the RCU section.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				/* deleting an exception route, not a
				 * FIB entry
				 */
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH) {
						/* balance the lock before
						 * the early return
						 */
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}
			if (cfg->fc_ifindex &&
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* pin the entry so it survives dropping the RCU
			 * lock; the helpers below consume this reference
			 */
			fib6_info_hold(rt);
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3259
3260 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3261 {
3262         struct netevent_redirect netevent;
3263         struct rt6_info *rt, *nrt = NULL;
3264         struct ndisc_options ndopts;
3265         struct inet6_dev *in6_dev;
3266         struct neighbour *neigh;
3267         struct fib6_info *from;
3268         struct rd_msg *msg;
3269         int optlen, on_link;
3270         u8 *lladdr;
3271
3272         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3273         optlen -= sizeof(*msg);
3274
3275         if (optlen < 0) {
3276                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3277                 return;
3278         }
3279
3280         msg = (struct rd_msg *)icmp6_hdr(skb);
3281
3282         if (ipv6_addr_is_multicast(&msg->dest)) {
3283                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3284                 return;
3285         }
3286
3287         on_link = 0;
3288         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3289                 on_link = 1;
3290         } else if (ipv6_addr_type(&msg->target) !=
3291                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3292                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3293                 return;
3294         }
3295
3296         in6_dev = __in6_dev_get(skb->dev);
3297         if (!in6_dev)
3298                 return;
3299         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3300                 return;
3301
3302         /* RFC2461 8.1:
3303          *      The IP source address of the Redirect MUST be the same as the current
3304          *      first-hop router for the specified ICMP Destination Address.
3305          */
3306
3307         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3308                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3309                 return;
3310         }
3311
3312         lladdr = NULL;
3313         if (ndopts.nd_opts_tgt_lladdr) {
3314                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3315                                              skb->dev);
3316                 if (!lladdr) {
3317                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3318                         return;
3319                 }
3320         }
3321
3322         rt = (struct rt6_info *) dst;
3323         if (rt->rt6i_flags & RTF_REJECT) {
3324                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3325                 return;
3326         }
3327
3328         /* Redirect received -> path was valid.
3329          * Look, redirects are sent only in response to data packets,
3330          * so that this nexthop apparently is reachable. --ANK
3331          */
3332         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3333
3334         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3335         if (!neigh)
3336                 return;
3337
3338         /*
3339          *      We have finally decided to accept it.
3340          */
3341
3342         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3343                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3344                      NEIGH_UPDATE_F_OVERRIDE|
3345                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3346                                      NEIGH_UPDATE_F_ISROUTER)),
3347                      NDISC_REDIRECT, &ndopts);
3348
3349         rcu_read_lock();
3350         from = rcu_dereference(rt->from);
3351         fib6_info_hold(from);
3352         rcu_read_unlock();
3353
3354         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3355         if (!nrt)
3356                 goto out;
3357
3358         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3359         if (on_link)
3360                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3361
3362         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3363
3364         /* No need to remove rt from the exception table if rt is
3365          * a cached route because rt6_insert_exception() will
3366          * takes care of it
3367          */
3368         if (rt6_insert_exception(nrt, from)) {
3369                 dst_release_immediate(&nrt->dst);
3370                 goto out;
3371         }
3372
3373         netevent.old = &rt->dst;
3374         netevent.new = &nrt->dst;
3375         netevent.daddr = &msg->dest;
3376         netevent.neigh = neigh;
3377         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3378
3379 out:
3380         fib6_info_release(from);
3381         neigh_release(neigh);
3382 }
3383
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Find the RA "route information" entry (RTF_ROUTEINFO|RTF_GATEWAY) that
 * matches prefix/prefixlen, gateway address and outgoing device.
 *
 * Returns the matching fib6_info with a reference held (caller must
 * drop it with fib6_info_release()), or NULL if no match exists.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
			continue;
		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
			continue;
		/* take the reference while still inside the RCU section so
		 * the entry cannot be freed before the caller sees it
		 */
		fib6_info_hold(rt);
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3419
3420 static struct fib6_info *rt6_add_route_info(struct net *net,
3421                                            const struct in6_addr *prefix, int prefixlen,
3422                                            const struct in6_addr *gwaddr,
3423                                            struct net_device *dev,
3424                                            unsigned int pref)
3425 {
3426         struct fib6_config cfg = {
3427                 .fc_metric      = IP6_RT_PRIO_USER,
3428                 .fc_ifindex     = dev->ifindex,
3429                 .fc_dst_len     = prefixlen,
3430                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3431                                   RTF_UP | RTF_PREF(pref),
3432                 .fc_protocol = RTPROT_RA,
3433                 .fc_type = RTN_UNICAST,
3434                 .fc_nlinfo.portid = 0,
3435                 .fc_nlinfo.nlh = NULL,
3436                 .fc_nlinfo.nl_net = net,
3437         };
3438
3439         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3440         cfg.fc_dst = *prefix;
3441         cfg.fc_gateway = *gwaddr;
3442
3443         /* We should treat it as a default route if prefix length is 0. */
3444         if (!prefixlen)
3445                 cfg.fc_flags |= RTF_DEFAULT;
3446
3447         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3448
3449         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3450 }
3451 #endif
3452
/* Find the RA-learned default router (RTF_ADDRCONF|RTF_DEFAULT) whose
 * gateway is @addr and whose nexthop device is @dev.  Returns the entry
 * with a reference held (release with fib6_info_release()), or NULL.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	/* rt is NULL if the walk finished without a match; grab the
	 * reference before leaving the RCU section
	 */
	if (rt)
		fib6_info_hold(rt);
	rcu_read_unlock();
	return rt;
}
3477
3478 struct fib6_info *rt6_add_dflt_router(struct net *net,
3479                                      const struct in6_addr *gwaddr,
3480                                      struct net_device *dev,
3481                                      unsigned int pref)
3482 {
3483         struct fib6_config cfg = {
3484                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3485                 .fc_metric      = IP6_RT_PRIO_USER,
3486                 .fc_ifindex     = dev->ifindex,
3487                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3488                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3489                 .fc_protocol = RTPROT_RA,
3490                 .fc_type = RTN_UNICAST,
3491                 .fc_nlinfo.portid = 0,
3492                 .fc_nlinfo.nlh = NULL,
3493                 .fc_nlinfo.nl_net = net,
3494         };
3495
3496         cfg.fc_gateway = *gwaddr;
3497
3498         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3499                 struct fib6_table *table;
3500
3501                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3502                 if (table)
3503                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3504         }
3505
3506         return rt6_get_dflt_router(net, gwaddr, dev);
3507 }
3508
/* Delete all RA-learned (RTF_DEFAULT|RTF_ADDRCONF) routes from one table,
 * except on interfaces with accept_ra == 2 (always accept RA).
 *
 * ip6_del_rt() cannot be called under RCU, so each deletion takes a
 * reference, drops the read lock, deletes, and restarts the walk from
 * the top of the table.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2)) {
			/* hold rt so it survives the rcu_read_unlock();
			 * ip6_del_rt() consumes this reference
			 */
			fib6_info_hold(rt);
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3532
3533 void rt6_purge_dflt_routers(struct net *net)
3534 {
3535         struct fib6_table *table;
3536         struct hlist_head *head;
3537         unsigned int h;
3538
3539         rcu_read_lock();
3540
3541         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3542                 head = &net->ipv6.fib_table_hash[h];
3543                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3544                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3545                                 __rt6_purge_dflt_routers(net, table);
3546                 }
3547         }
3548
3549         rcu_read_unlock();
3550 }
3551
3552 static void rtmsg_to_fib6_config(struct net *net,
3553                                  struct in6_rtmsg *rtmsg,
3554                                  struct fib6_config *cfg)
3555 {
3556         memset(cfg, 0, sizeof(*cfg));
3557
3558         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3559                          : RT6_TABLE_MAIN;
3560         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3561         cfg->fc_metric = rtmsg->rtmsg_metric;
3562         cfg->fc_expires = rtmsg->rtmsg_info;
3563         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3564         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3565         cfg->fc_flags = rtmsg->rtmsg_flags;
3566         cfg->fc_type = rtmsg->rtmsg_type;
3567
3568         cfg->fc_nlinfo.nl_net = net;
3569
3570         cfg->fc_dst = rtmsg->rtmsg_dst;
3571         cfg->fc_src = rtmsg->rtmsg_src;
3572         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3573 }
3574
3575 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3576 {
3577         struct fib6_config cfg;
3578         struct in6_rtmsg rtmsg;
3579         int err;
3580
3581         switch (cmd) {
3582         case SIOCADDRT:         /* Add a route */
3583         case SIOCDELRT:         /* Delete a route */
3584                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3585                         return -EPERM;
3586                 err = copy_from_user(&rtmsg, arg,
3587                                      sizeof(struct in6_rtmsg));
3588                 if (err)
3589                         return -EFAULT;
3590
3591                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3592
3593                 rtnl_lock();
3594                 switch (cmd) {
3595                 case SIOCADDRT:
3596                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3597                         break;
3598                 case SIOCDELRT:
3599                         err = ip6_route_del(&cfg, NULL);
3600                         break;
3601                 default:
3602                         err = -EINVAL;
3603                 }
3604                 rtnl_unlock();
3605
3606                 return err;
3607         }
3608
3609         return -EINVAL;
3610 }
3611
3612 /*
3613  *      Drop the packet on the floor
3614  */
3615
/* Common handler for unroutable packets: bump the matching SNMP
 * counter, send an ICMPv6 destination-unreachable with @code, free the
 * skb.  Always returns 0 (packet consumed).
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* unspecified destination counts as an address
			 * error rather than a routing failure
			 */
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3639
/* dst input handler for blackhole/no-route entries on the input path. */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3644
3645 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3646 {
3647         skb->dev = skb_dst(skb)->dev;
3648         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3649 }
3650
/* dst input handler for prohibit entries on the input path. */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3655
3656 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3657 {
3658         skb->dev = skb_dst(skb)->dev;
3659         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3660 }
3661
3662 /*
3663  *      Allocate a dst for local (unicast / anycast) address.
3664  */
3665
3666 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3667                                      struct inet6_dev *idev,
3668                                      const struct in6_addr *addr,
3669                                      bool anycast, gfp_t gfp_flags)
3670 {
3671         u32 tb_id;
3672         struct net_device *dev = idev->dev;
3673         struct fib6_info *f6i;
3674
3675         f6i = fib6_info_alloc(gfp_flags);
3676         if (!f6i)
3677                 return ERR_PTR(-ENOMEM);
3678
3679         f6i->dst_nocount = true;
3680         f6i->dst_host = true;
3681         f6i->fib6_protocol = RTPROT_KERNEL;
3682         f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3683         if (anycast) {
3684                 f6i->fib6_type = RTN_ANYCAST;
3685                 f6i->fib6_flags |= RTF_ANYCAST;
3686         } else {
3687                 f6i->fib6_type = RTN_LOCAL;
3688                 f6i->fib6_flags |= RTF_LOCAL;
3689         }
3690
3691         f6i->fib6_nh.nh_gw = *addr;
3692         dev_hold(dev);
3693         f6i->fib6_nh.nh_dev = dev;
3694         f6i->fib6_dst.addr = *addr;
3695         f6i->fib6_dst.plen = 128;
3696         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3697         f6i->fib6_table = fib6_get_table(net, tb_id);
3698
3699         return f6i;
3700 }
3701
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL = any */
	struct net *net;	/* namespace being cleaned */
	struct in6_addr *addr;	/* address being removed */
};
3708
/* fib6_clean_all() callback: clear the preferred-source address from a
 * route (and its cached exceptions) when that address is being removed
 * from the device.  Always returns 0 (never asks for route deletion).
 */
static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
	    rt != net->ipv6.fib6_null_entry &&
	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
		spin_lock_bh(&rt6_exception_lock);
		/* remove prefsrc entry */
		rt->fib6_prefsrc.plen = 0;
		/* need to update cache as well */
		rt6_exceptions_remove_prefsrc(rt);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
3727
3728 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3729 {
3730         struct net *net = dev_net(ifp->idev->dev);
3731         struct arg_dev_net_ip adni = {
3732                 .dev = ifp->idev->dev,
3733                 .net = net,
3734                 .addr = &ifp->addr,
3735         };
3736         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3737 }
3738
/* flag combination identifying an RA-learned default router entry */
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	/* NOTE(review): the -1 return asks the fib6 walker to delete this
	 * route — confirm against fib6_clean_node()
	 */
	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}
3759
/* Purge router entries via @gateway when that node stops being a router. */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3764
/* Argument for the fib6_ifup/fib6_ifdown walker callbacks: the device
 * the event is about, plus either nexthop flags (sync up) or the
 * netdev event code (sync down) — never both at once.
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};
3772
/* Find the first route of @rt's multipath group: walk the leaf chain of
 * its fib6 node and return the first entry with the same metric that
 * qualifies for ECMP.  Caller must hold the table write lock (enforced
 * by lockdep in the rcu_dereference_protected() calls below).
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
3792
3793 static bool rt6_is_dead(const struct fib6_info *rt)
3794 {
3795         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3796             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3797              fib6_ignore_linkdown(rt)))
3798                 return true;
3799
3800         return false;
3801 }
3802
3803 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3804 {
3805         struct fib6_info *iter;
3806         int total = 0;
3807
3808         if (!rt6_is_dead(rt))
3809                 total += rt->fib6_nh.nh_weight;
3810
3811         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3812                 if (!rt6_is_dead(iter))
3813                         total += iter->fib6_nh.nh_weight;
3814         }
3815
3816         return total;
3817 }
3818
/* Assign the multipath hash upper bound for one nexthop.  @weight
 * accumulates the weight of live nexthops seen so far; the bound maps
 * that cumulative share onto the 31-bit hash space.  Dead nexthops get
 * bound -1 so they are never selected.
 */
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	/* atomic store: read concurrently by the datapath hash lookup */
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}
3830
3831 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3832 {
3833         struct fib6_info *iter;
3834         int weight = 0;
3835
3836         rt6_upper_bound_set(rt, &weight, total);
3837
3838         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3839                 rt6_upper_bound_set(iter, &weight, total);
3840 }
3841
/* Recompute the multipath hash upper bounds for the sibling group that
 * @rt belongs to.  No-op for single-path routes or groups marked for
 * flushing.
 */
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
3865
3866 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3867 {
3868         const struct arg_netdev_event *arg = p_arg;
3869         struct net *net = dev_net(arg->dev);
3870
3871         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3872                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3873                 fib6_update_sernum_upto_root(net, rt);
3874                 rt6_multipath_rebalance(rt);
3875         }
3876
3877         return 0;
3878 }
3879
/* Clear @nh_flags on every route using @dev (device/link came up).
 * When reviving a dead nexthop on a device whose carrier is up, clear
 * RTNH_F_LINKDOWN as well (fib6_ifup() removes all flags in
 * arg.nh_flags).
 */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			/* anonymous-union member: flags to clear */
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
3894
3895 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3896                                    const struct net_device *dev)
3897 {
3898         struct fib6_info *iter;
3899
3900         if (rt->fib6_nh.nh_dev == dev)
3901                 return true;
3902         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3903                 if (iter->fib6_nh.nh_dev == dev)
3904                         return true;
3905
3906         return false;
3907 }
3908
3909 static void rt6_multipath_flush(struct fib6_info *rt)
3910 {
3911         struct fib6_info *iter;
3912
3913         rt->should_flush = 1;
3914         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3915                 iter->should_flush = 1;
3916 }
3917
3918 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3919                                              const struct net_device *down_dev)
3920 {
3921         struct fib6_info *iter;
3922         unsigned int dead = 0;
3923
3924         if (rt->fib6_nh.nh_dev == down_dev ||
3925             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3926                 dead++;
3927         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3928                 if (iter->fib6_nh.nh_dev == down_dev ||
3929                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3930                         dead++;
3931
3932         return dead;
3933 }
3934
3935 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3936                                        const struct net_device *dev,
3937                                        unsigned int nh_flags)
3938 {
3939         struct fib6_info *iter;
3940
3941         if (rt->fib6_nh.nh_dev == dev)
3942                 rt->fib6_nh.nh_flags |= nh_flags;
3943         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3944                 if (iter->fib6_nh.nh_dev == dev)
3945                         iter->fib6_nh.nh_flags |= nh_flags;
3946 }
3947
/* called with write lock held for table with rt */
/* fib6_clean_all() callback for device down/unregister.  Returning 0
 * keeps the route; negative values ask the fib6 walker to remove it.
 * NOTE(review): -1 and -2 are distinct removal codes — confirm the
 * exact difference against fib6_clean_node() before changing them.
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	/* never touch the permanent null entry */
	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			/* drop the entire multipath route once every
			 * nexthop (self + siblings) is dead
			 */
			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* carrier change: mark the nexthop link-down, but leave
		 * local/anycast routes untouched
		 */
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
3991
/* Propagate a netdev down/unregister/change @event into the FIB by
 * walking all routes with fib6_ifdown().
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			/* anonymous-union member: the netdev event code */
			.event = event,
		},
	};

	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}
4003
/* Tear down IPv6 routing state for a device going away: first sync the
 * FIB, then flush uncached routes, then drop its neighbour entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4010
/* Argument for the rt6_mtu_change_route() walker callback. */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* new MTU value */
};
4015
/* fib6_clean_all() callback for rt6_mtu_change(): propagate a device
 * MTU change into each route (and its cached exceptions) using that
 * device, unless the route's RTAX_MTU metric is locked.
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* shrink the stored MTU, or grow it only when it was
		 * tracking the old device MTU
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4050
/* Entry point for device MTU changes: update every affected route. */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
4060
/* Netlink attribute policy for RTM_{NEW,DEL,GET}ROUTE requests. */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
};
4077
/* Parse an RTM_{NEW,DEL}ROUTE netlink message into a fib6_config.
 * Returns 0 on success or a negative errno (any failure after the
 * initial parse yields -EINVAL unless overwritten below).  cfg->fc_mx
 * and cfg->fc_mp point into the message — valid only while it lives.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		/* attribute must carry at least the prefix bytes */
		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* an explicit RTA_TABLE attribute overrides rtm->rtm_table */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		/* out-of-range router preferences fall back to medium */
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4203
/* One pending nexthop while building a multipath route add/replace. */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route entry for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop config snapshot */
	struct list_head next;		/* link in the rt6_nh_list */
};
4209
4210 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4211 {
4212         struct rt6_nh *nh;
4213
4214         list_for_each_entry(nh, rt6_nh_list, next) {
4215                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4216                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4217                         nh->r_cfg.fc_ifindex);
4218         }
4219 }
4220
4221 static int ip6_route_info_append(struct net *net,
4222                                  struct list_head *rt6_nh_list,
4223                                  struct fib6_info *rt,
4224                                  struct fib6_config *r_cfg)
4225 {
4226         struct rt6_nh *nh;
4227         int err = -EEXIST;
4228
4229         list_for_each_entry(nh, rt6_nh_list, next) {
4230                 /* check if fib6_info already exists */
4231                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4232                         return err;
4233         }
4234
4235         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4236         if (!nh)
4237                 return -ENOMEM;
4238         nh->fib6_info = rt;
4239         err = ip6_convert_metrics(net, rt, r_cfg);
4240         if (err) {
4241                 kfree(nh);
4242                 return err;
4243         }
4244         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4245         list_add_tail(&nh->next, rt6_nh_list);
4246
4247         return 0;
4248 }
4249
4250 static void ip6_route_mpath_notify(struct fib6_info *rt,
4251                                    struct fib6_info *rt_last,
4252                                    struct nl_info *info,
4253                                    __u16 nlflags)
4254 {
4255         /* if this is an APPEND route, then rt points to the first route
4256          * inserted and rt_last points to last route inserted. Userspace
4257          * wants a consistent dump of the route which starts at the first
4258          * nexthop. Since sibling routes are always added at the end of
4259          * the list, find the first sibling of the last route appended
4260          */
4261         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4262                 rt = list_first_entry(&rt_last->fib6_siblings,
4263                                       struct fib6_info,
4264                                       fib6_siblings);
4265         }
4266
4267         if (rt)
4268                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4269 }
4270
4271 static int ip6_route_multipath_add(struct fib6_config *cfg,
4272                                    struct netlink_ext_ack *extack)
4273 {
4274         struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4275         struct nl_info *info = &cfg->fc_nlinfo;
4276         struct fib6_config r_cfg;
4277         struct rtnexthop *rtnh;
4278         struct fib6_info *rt;
4279         struct rt6_nh *err_nh;
4280         struct rt6_nh *nh, *nh_safe;
4281         __u16 nlflags;
4282         int remaining;
4283         int attrlen;
4284         int err = 1;
4285         int nhn = 0;
4286         int replace = (cfg->fc_nlinfo.nlh &&
4287                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4288         LIST_HEAD(rt6_nh_list);
4289
4290         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4291         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4292                 nlflags |= NLM_F_APPEND;
4293
4294         remaining = cfg->fc_mp_len;
4295         rtnh = (struct rtnexthop *)cfg->fc_mp;
4296
4297         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4298          * fib6_info structs per nexthop
4299          */
4300         while (rtnh_ok(rtnh, remaining)) {
4301                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4302                 if (rtnh->rtnh_ifindex)
4303                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4304
4305                 attrlen = rtnh_attrlen(rtnh);
4306                 if (attrlen > 0) {
4307                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4308
4309                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4310                         if (nla) {
4311                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4312                                 r_cfg.fc_flags |= RTF_GATEWAY;
4313                         }
4314                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4315                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4316                         if (nla)
4317                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4318                 }
4319
4320                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4321                 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4322                 if (IS_ERR(rt)) {
4323                         err = PTR_ERR(rt);
4324                         rt = NULL;
4325                         goto cleanup;
4326                 }
4327
4328                 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4329
4330                 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4331                                             rt, &r_cfg);
4332                 if (err) {
4333                         fib6_info_release(rt);
4334                         goto cleanup;
4335                 }
4336
4337                 rtnh = rtnh_next(rtnh, &remaining);
4338         }
4339
4340         /* for add and replace send one notification with all nexthops.
4341          * Skip the notification in fib6_add_rt2node and send one with
4342          * the full route when done
4343          */
4344         info->skip_notify = 1;
4345
4346         err_nh = NULL;
4347         list_for_each_entry(nh, &rt6_nh_list, next) {
4348                 rt_last = nh->fib6_info;
4349                 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4350                 fib6_info_release(nh->fib6_info);
4351
4352                 /* save reference to first route for notification */
4353                 if (!rt_notif && !err)
4354                         rt_notif = nh->fib6_info;
4355
4356                 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4357                 nh->fib6_info = NULL;
4358                 if (err) {
4359                         if (replace && nhn)
4360                                 ip6_print_replace_route_err(&rt6_nh_list);
4361                         err_nh = nh;
4362                         goto add_errout;
4363                 }
4364
4365                 /* Because each route is added like a single route we remove
4366                  * these flags after the first nexthop: if there is a collision,
4367                  * we have already failed to add the first nexthop:
4368                  * fib6_add_rt2node() has rejected it; when replacing, old
4369                  * nexthops have been replaced by first new, the rest should
4370                  * be added to it.
4371                  */
4372                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4373                                                      NLM_F_REPLACE);
4374                 nhn++;
4375         }
4376
4377         /* success ... tell user about new route */
4378         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4379         goto cleanup;
4380
4381 add_errout:
4382         /* send notification for routes that were added so that
4383          * the delete notifications sent by ip6_route_del are
4384          * coherent
4385          */
4386         if (rt_notif)
4387                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4388
4389         /* Delete routes that were already added */
4390         list_for_each_entry(nh, &rt6_nh_list, next) {
4391                 if (err_nh == nh)
4392                         break;
4393                 ip6_route_del(&nh->r_cfg, extack);
4394         }
4395
4396 cleanup:
4397         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4398                 if (nh->fib6_info)
4399                         fib6_info_release(nh->fib6_info);
4400                 list_del(&nh->next);
4401                 kfree(nh);
4402         }
4403
4404         return err;
4405 }
4406
4407 static int ip6_route_multipath_del(struct fib6_config *cfg,
4408                                    struct netlink_ext_ack *extack)
4409 {
4410         struct fib6_config r_cfg;
4411         struct rtnexthop *rtnh;
4412         int remaining;
4413         int attrlen;
4414         int err = 1, last_err = 0;
4415
4416         remaining = cfg->fc_mp_len;
4417         rtnh = (struct rtnexthop *)cfg->fc_mp;
4418
4419         /* Parse a Multipath Entry */
4420         while (rtnh_ok(rtnh, remaining)) {
4421                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4422                 if (rtnh->rtnh_ifindex)
4423                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4424
4425                 attrlen = rtnh_attrlen(rtnh);
4426                 if (attrlen > 0) {
4427                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4428
4429                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4430                         if (nla) {
4431                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4432                                 r_cfg.fc_flags |= RTF_GATEWAY;
4433                         }
4434                 }
4435                 err = ip6_route_del(&r_cfg, extack);
4436                 if (err)
4437                         last_err = err;
4438
4439                 rtnh = rtnh_next(rtnh, &remaining);
4440         }
4441
4442         return last_err;
4443 }
4444
4445 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4446                               struct netlink_ext_ack *extack)
4447 {
4448         struct fib6_config cfg;
4449         int err;
4450
4451         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4452         if (err < 0)
4453                 return err;
4454
4455         if (cfg.fc_mp)
4456                 return ip6_route_multipath_del(&cfg, extack);
4457         else {
4458                 cfg.fc_delete_all_nh = 1;
4459                 return ip6_route_del(&cfg, extack);
4460         }
4461 }
4462
4463 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4464                               struct netlink_ext_ack *extack)
4465 {
4466         struct fib6_config cfg;
4467         int err;
4468
4469         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4470         if (err < 0)
4471                 return err;
4472
4473         if (cfg.fc_mp)
4474                 return ip6_route_multipath_add(&cfg, extack);
4475         else
4476                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4477 }
4478
4479 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4480 {
4481         int nexthop_len = 0;
4482
4483         if (rt->fib6_nsiblings) {
4484                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4485                             + NLA_ALIGN(sizeof(struct rtnexthop))
4486                             + nla_total_size(16) /* RTA_GATEWAY */
4487                             + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4488
4489                 nexthop_len *= rt->fib6_nsiblings;
4490         }
4491
4492         return NLMSG_ALIGN(sizeof(struct rtmsg))
4493                + nla_total_size(16) /* RTA_SRC */
4494                + nla_total_size(16) /* RTA_DST */
4495                + nla_total_size(16) /* RTA_GATEWAY */
4496                + nla_total_size(16) /* RTA_PREFSRC */
4497                + nla_total_size(4) /* RTA_TABLE */
4498                + nla_total_size(4) /* RTA_IIF */
4499                + nla_total_size(4) /* RTA_OIF */
4500                + nla_total_size(4) /* RTA_PRIORITY */
4501                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4502                + nla_total_size(sizeof(struct rta_cacheinfo))
4503                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4504                + nla_total_size(1) /* RTA_PREF */
4505                + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4506                + nexthop_len;
4507 }
4508
/* Emit the nexthop attributes (gateway, oif, lwtunnel encap) for @rt
 * into @skb and accumulate RTNH_F_* flags into *@flags.  @skip_oif is
 * set by the multipath encoder, whose struct rtnexthop already carries
 * the ifindex.  Returns 0 on success, -EMSGSIZE if @skb ran out of room.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		/* report the nexthop dead as well when link-down routes are
		 * to be ignored (see fib6_ignore_linkdown(), defined
		 * elsewhere) -- NOTE(review): exact policy lives there
		 */
		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4547
/* add multipath next hop: reserve a struct rtnexthop in @skb, fill it
 * for @rt (nested attributes included), and patch its length once the
 * attributes are in place.  Returns 0 or -EMSGSIZE on overflow.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	/* rtnh_hops is zero-based on the wire; nh_weight is one-based */
	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4575
/* Serialize route @rt as one rtnetlink message (type @type) into @skb.
 * @dst/@dest/@src are set for RTM_GETROUTE replies about a specific
 * lookup result and NULL for table dumps/notifications; @iif is the
 * input interface of the request, if any.  Returns 0 on success or
 * -EMSGSIZE, in which case the partial message is cancelled.
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires = 0;
	u32 *pmetrics;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->fib6_dst.plen;
	rtm->rtm_src_len = rt->fib6_src.plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt->fib6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* a specific destination (lookup reply) is reported as a /128 */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are resolved by the mroute code;
		 * err == 0 means the message was completed there
		 */
		if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* prefer the dst's live metrics over the FIB entry's when given */
	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt->fib6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4708
4709 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4710 {
4711         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4712         struct net *net = arg->net;
4713
4714         if (rt == net->ipv6.fib6_null_entry)
4715                 return 0;
4716
4717         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4718                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4719
4720                 /* user wants prefix routes only */
4721                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4722                     !(rt->fib6_flags & RTF_PREFIX_RT)) {
4723                         /* success since this is not a prefix route */
4724                         return 1;
4725                 }
4726         }
4727
4728         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4729                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4730                              arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4731 }
4732
/* RTM_GETROUTE handler: build a flow from the request attributes, do a
 * route lookup (input path if RTA_IIF was given, output path otherwise)
 * and unicast the serialized result back to the requester.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	/* RTM_F_FIB_MATCH: report the matching FIB entry rather than the
	 * full lookup result
	 */
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		/* simulate reception on @iif: resolve the device under RCU
		 * and run the input-path lookup
		 */
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* skb takes over the dst reference from the lookup */
	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
4860
4861 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4862                      unsigned int nlm_flags)
4863 {
4864         struct sk_buff *skb;
4865         struct net *net = info->nl_net;
4866         u32 seq;
4867         int err;
4868
4869         err = -ENOBUFS;
4870         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4871
4872         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4873         if (!skb)
4874                 goto errout;
4875
4876         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4877                             event, info->portid, seq, nlm_flags);
4878         if (err < 0) {
4879                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4880                 WARN_ON(err == -EMSGSIZE);
4881                 kfree_skb(skb);
4882                 goto errout;
4883         }
4884         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4885                     info->nlh, gfp_any());
4886         return;
4887 errout:
4888         if (err < 0)
4889                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4890 }
4891
/* Loopback netdevice notifier: attach the per-netns special route
 * entries (null and, with multiple tables, prohibit/blackhole) to the
 * loopback device on register, and drop their idev references on
 * unregister.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	/* only the loopback device is of interest here */
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
4925
4926 /*
4927  *      /proc
4928  */
4929
4930 #ifdef CONFIG_PROC_FS
4931
/* proc file ops for the IPv6 route table dump (seq_file based;
 * ipv6_route_open is defined elsewhere in this file)
 */
static const struct file_operations ipv6_route_proc_fops = {
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
4938
/* seq_file show handler: print the per-netns rt6 statistics as one
 * line of seven hex counters.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	/* fib nodes, route nodes, rt alloc, rt entries, rt cache,
	 * dst entries, discarded routes
	 */
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
4953
/* open handler: single-record seq_file bound to the owning netns */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
4958
/* proc file ops for the rt6 statistics entry */
static const struct file_operations rt6_stats_seq_fops = {
	.open    = rt6_stats_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release_net,
};
4965 #endif  /* CONFIG_PROC_FS */
4966
4967 #ifdef CONFIG_SYSCTL
4968
4969 static
4970 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4971                               void __user *buffer, size_t *lenp, loff_t *ppos)
4972 {
4973         struct net *net;
4974         int delay;
4975         if (!write)
4976                 return -EINVAL;
4977
4978         net = (struct net *)ctl->extra1;
4979         delay = net->ipv6.sysctl.flush_delay;
4980         proc_dointvec(ctl, write, buffer, lenp, ppos);
4981         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4982         return 0;
4983 }
4984
/* Template for the per-netns net.ipv6.route.* sysctl table; the .data
 * pointers (which here reference init_net) are rewritten for each
 * namespace in ipv6_route_sysctl_init() below.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,	/* write-only trigger */
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* same variable as gc_min_interval, exposed in milliseconds */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
5058
5059 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5060 {
5061         struct ctl_table *table;
5062
5063         table = kmemdup(ipv6_route_table_template,
5064                         sizeof(ipv6_route_table_template),
5065                         GFP_KERNEL);
5066
5067         if (table) {
5068                 table[0].data = &net->ipv6.sysctl.flush_delay;
5069                 table[0].extra1 = net;
5070                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5071                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5072                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5073                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5074                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5075                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5076                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5077                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5078                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5079
5080                 /* Don't export sysctls to unprivileged users */
5081                 if (net->user_ns != &init_user_ns)
5082                         table[0].procname = NULL;
5083         }
5084
5085         return table;
5086 }
5087 #endif
5088
/* Per-netns constructor for the IPv6 routing subsystem: clones the
 * dst_ops template, allocates this namespace's sentinel route entries
 * (null, and prohibit/blackhole when policy routing is built in) and
 * seeds the per-netns GC/sysctl defaults.
 * Returns 0 on success or -ENOMEM, unwinding partial allocations via
 * the goto chain below (labels run in reverse allocation order).
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* FIB-level "no route" sentinel, shared by lookups that fail */
	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	/* dst-level null route returned to the stack on lookup failure */
	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	/* Policy routing needs two more sentinels: administratively
	 * prohibited and silently discarded (blackhole) routes.
	 */
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default tunables; overridable per-netns via the sysctls above */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwind: free in reverse order of allocation */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5163
/* Per-netns destructor: releases everything ip6_route_net_init()
 * allocated.  Must stay in sync with that function, including the
 * CONFIG_IPV6_MULTIPLE_TABLES-only sentinel entries.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
5174
/* Late per-netns init: creates the /proc/net/ipv6_route and
 * /proc/net/rt6_stats entries.  Runs after the main routing state is
 * set up (registered via ip6_route_net_late_ops).
 * proc_create() failures are deliberately ignored: the proc files are
 * informational only.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
#endif
	return 0;
}
5183
/* Late per-netns teardown: removes the proc entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5191
/* Core per-netns routing state: allocated/freed on namespace create/destroy */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5196
5197 static int __net_init ipv6_inetpeer_init(struct net *net)
5198 {
5199         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5200
5201         if (!bp)
5202                 return -ENOMEM;
5203         inet_peer_base_init(bp);
5204         net->ipv6.peers = bp;
5205         return 0;
5206 }
5207
5208 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5209 {
5210         struct inet_peer_base *bp = net->ipv6.peers;
5211
5212         net->ipv6.peers = NULL;
5213         inetpeer_invalidate_tree(bp);
5214         kfree(bp);
5215 }
5216
/* Per-netns inet_peer storage used by the IPv6 routing code */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5221
/* Registered last so the proc entries only appear once the per-netns
 * routing state above is fully initialised.
 */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5226
/* Netdevice event handler for the routing code; priority is set
 * relative to ADDRCONF_NOTIFY_PRIORITY so it is ordered against the
 * addrconf notifier (lower value — presumably meant to run after
 * addrconf has processed the event).
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5231
/* Bind init_net's sentinel route entries to the loopback device.
 * Called from boot-time init after the loopback device exists; cannot
 * be folded into ip6_route_net_init() because loopback registration
 * for init_net happens before that runs (see comment below).
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
5247
/* Boot-time initialisation of the IPv6 routing subsystem: creates the
 * rt6_info slab cache, registers the pernet subsystems, FIB, xfrm and
 * policy-rule layers, the rtnetlink route handlers and the netdevice
 * notifier, and initialises the per-CPU uncached-route lists.
 * Returns 0 on success or a negative errno; on failure everything
 * already registered is torn down via the label chain at the bottom
 * (which unwinds in reverse registration order).
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts come from the same slab as regular rt6_infos */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* All three rtnetlink failures share one label:
	 * rtnl_unregister_all() below removes whatever did register.
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwind, in reverse order of the registrations above */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5339
/* Module teardown: undoes ip6_route_init() in reverse registration
 * order (notifier first, slab cache last).  Must be kept in sync with
 * the registration sequence and error unwind in ip6_route_init().
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}