Merge tag 'drm-fixes-2018-09-21' of git://anongit.freedesktop.org/drm/drm
[linux-2.6-microblaze.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/* Neighbour-reachability verdicts used when scoring a route's next hop.
 * Negative values are failures; see rt6_check_neigh()/rt6_score_route().
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route unusable for this lookup */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour in NUD_FAILED state */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neigh entry: round-robin siblings */
	RT6_NUD_SUCCEED = 1		/* next hop (probably) reachable */
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108                          struct fib6_info *rt, struct dst_entry *dst,
109                          struct in6_addr *dest, struct in6_addr *src,
110                          int iif, int type, u32 portid, u32 seq,
111                          unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113                                            struct in6_addr *daddr,
114                                            struct in6_addr *saddr);
115
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118                                            const struct in6_addr *prefix, int prefixlen,
119                                            const struct in6_addr *gwaddr,
120                                            struct net_device *dev,
121                                            unsigned int pref);
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123                                            const struct in6_addr *prefix, int prefixlen,
124                                            const struct in6_addr *gwaddr,
125                                            struct net_device *dev);
126 #endif
127
/* Per-CPU list of rt6_info dsts that are not attached to a fib6 table;
 * each list has its own lock so entries can be migrated off a dying
 * device by rt6_uncached_list_flush_dev().
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134
135 void rt6_uncached_list_add(struct rt6_info *rt)
136 {
137         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
138
139         rt->rt6i_uncached_list = ul;
140
141         spin_lock_bh(&ul->lock);
142         list_add_tail(&rt->rt6i_uncached, &ul->head);
143         spin_unlock_bh(&ul->lock);
144 }
145
146 void rt6_uncached_list_del(struct rt6_info *rt)
147 {
148         if (!list_empty(&rt->rt6i_uncached)) {
149                 struct uncached_list *ul = rt->rt6i_uncached_list;
150                 struct net *net = dev_net(rt->dst.dev);
151
152                 spin_lock_bh(&ul->lock);
153                 list_del(&rt->rt6i_uncached);
154                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155                 spin_unlock_bh(&ul->lock);
156         }
157 }
158
/* A device is going away: walk every CPU's uncached list and repoint any
 * route still referencing @dev at the netns loopback device, migrating
 * both the inet6_dev and the net_device references.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* Routes already on loopback never need migrating. */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* Swap the idev reference to loopback's. */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* Take the new device ref before dropping the old. */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
190
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         if (!ipv6_addr_any(p))
196                 return (const void *) p;
197         else if (skb)
198                 return &ipv6_hdr(skb)->daddr;
199         return daddr;
200 }
201
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203                                    struct net_device *dev,
204                                    struct sk_buff *skb,
205                                    const void *daddr)
206 {
207         struct neighbour *n;
208
209         daddr = choose_neigh_daddr(gw, skb, daddr);
210         n = __ipv6_neigh_lookup(dev, daddr);
211         if (n)
212                 return n;
213         return neigh_create(&nd_tbl, daddr, dev);
214 }
215
/* dst_ops ->neigh_lookup: resolve the neighbour via the rt6_info gateway. */
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}
224
/* dst_ops ->confirm_neigh: mark the next hop used for @daddr as confirmed,
 * skipping cases with no neighbour state (NOARP/loopback, multicast).
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	/* choose_neigh_daddr() may return the caller's daddr, which can be
	 * NULL here since no skb is supplied.
	 */
	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
239
/* dst_ops for ordinary IPv6 routes; copied per-netns at init time. */
static struct dst_ops ip6_dst_ops_template = {
	.family                 =       AF_INET6,
	.gc                     =       ip6_dst_gc,
	.gc_thresh              =       1024,
	.check                  =       ip6_dst_check,
	.default_advmss         =       ip6_default_advmss,
	.mtu                    =       ip6_mtu,
	.cow_metrics            =       dst_cow_metrics_generic,
	.destroy                =       ip6_dst_destroy,
	.ifdown                 =       ip6_dst_ifdown,
	.negative_advice        =       ip6_negative_advice,
	.link_failure           =       ip6_link_failure,
	.update_pmtu            =       ip6_rt_update_pmtu,
	.redirect               =       rt6_do_redirect,
	.local_out              =       __ip6_local_out,
	.neigh_lookup           =       ip6_dst_neigh_lookup,
	.confirm_neigh          =       ip6_confirm_neigh,
};
258
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
260 {
261         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
262
263         return mtu ? : dst->dev->mtu;
264 }
265
/* Blackholed dsts intentionally ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

/* Blackholed dsts intentionally ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
275
/* dst_ops for blackholed copies of routes (see update_pmtu/redirect no-ops
 * above); no gc, ifdown or link_failure handling is needed for these.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family                 =       AF_INET6,
	.destroy                =       ip6_dst_destroy,
	.check                  =       ip6_dst_check,
	.mtu                    =       ip6_blackhole_mtu,
	.default_advmss         =       ip6_default_advmss,
	.update_pmtu            =       ip6_rt_blackhole_update_pmtu,
	.redirect               =       ip6_rt_blackhole_redirect,
	.cow_metrics            =       dst_cow_metrics_generic,
	.neigh_lookup           =       ip6_dst_neigh_lookup,
};
287
/* Metrics template for the special route entries below (hop limit unset). */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
291
/* Template for the per-netns fib6_null_entry: an always-present reject
 * route returned when no real route matches.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags     = (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric    = ~(u32)0,
	.fib6_ref       = ATOMIC_INIT(1),
	.fib6_type      = RTN_UNREACHABLE,
	.fib6_metrics   = (struct dst_metrics *)&dst_default_metrics,
};
300
/* Template for the per-netns null route: drops packets with ENETUNREACH. */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt       = ATOMIC_INIT(1),
		.__use          = 1,
		.obsolete       = DST_OBSOLETE_FORCE_CHK,
		.error          = -ENETUNREACH,
		.input          = ip6_pkt_discard,
		.output         = ip6_pkt_discard_out,
	},
	.rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
};
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
/* Template for the prohibit route: drops packets with EACCES (policy
 * routing, CONFIG_IPV6_MULTIPLE_TABLES only).
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt       = ATOMIC_INIT(1),
		.__use          = 1,
		.obsolete       = DST_OBSOLETE_FORCE_CHK,
		.error          = -EACCES,
		.input          = ip6_pkt_prohibit,
		.output         = ip6_pkt_prohibit_out,
	},
	.rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
};
326
/* Template for the blackhole route: silently discards packets (EINVAL). */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt       = ATOMIC_INIT(1),
		.__use          = 1,
		.obsolete       = DST_OBSOLETE_FORCE_CHK,
		.error          = -EINVAL,
		.input          = dst_discard,
		.output         = dst_discard_out,
	},
	.rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
};
338
339 #endif
340
/* Zero every rt6_info field that follows the embedded dst_entry (dst_alloc()
 * has already initialized the dst itself) and init the uncached-list hook so
 * rt6_uncached_list_del() is safe even if the rt is never added to a list.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* dst + 1 points just past the dst member, i.e. the rt6-specific part */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
348
/* allocate dst with ip6_dst_ops; returns NULL on allocation failure */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	/* initial refcount 1; DST_OBSOLETE_FORCE_CHK forces ip6_dst_check()
	 * to be consulted on every use of the cached dst
	 */
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
364
/* dst_ops ->destroy: tear down a rt6_info — release metrics, unlink from
 * the uncached list, and drop the inet6_dev and originating fib6_info refs.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* Detach from the fib6_info under RCU before dropping its ref. */
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
386
387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
388                            int how)
389 {
390         struct rt6_info *rt = (struct rt6_info *)dst;
391         struct inet6_dev *idev = rt->rt6i_idev;
392         struct net_device *loopback_dev =
393                 dev_net(dev)->loopback_dev;
394
395         if (idev && idev->dev != loopback_dev) {
396                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
397                 if (loopback_idev) {
398                         rt->rt6i_idev = loopback_idev;
399                         in6_dev_put(idev);
400                 }
401         }
402 }
403
404 static bool __rt6_check_expired(const struct rt6_info *rt)
405 {
406         if (rt->rt6i_flags & RTF_EXPIRES)
407                 return time_after(jiffies, rt->dst.expires);
408         else
409                 return false;
410 }
411
/* True when the cached @rt, or the fib6_info it was derived from, has
 * expired.  NOTE(review): rt->from is read with rcu_dereference(), so the
 * caller presumably holds rcu_read_lock — confirm at call sites.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		/* A dst whose obsolete state is no longer FORCE_CHK is stale
		 * regardless of the fib entry's own expiry.
		 */
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
427
/* Pick one route out of a set of ECMP siblings based on the flow hash.
 * Each sibling owns a slice of the hash space bounded by nh_upper_bound;
 * the first sibling whose bound covers fl6->mp_hash wins, provided it
 * scores as usable.  Falls back to @match otherwise.
 */
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	/* The first hop already covers the hash value: keep it. */
	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* Sibling covers the hash but is unusable: keep @match. */
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
460
461 /*
462  *      Route lookup. rcu_read_lock() should be held.
463  */
464
/* Walk the leaf chain starting at @rt and return the first live entry that
 * matches the requested output interface (@oif) or, failing that, owns
 * @saddr.  Returns fib6_null_entry when a strict interface match was
 * required but none was found, or when @rt itself is dead.
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct fib6_info *sprt;

	/* Nothing to match against: accept @rt as long as it is alive. */
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	/* Strict interface binding with no match: reject. */
	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
498
499 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred router-reachability probe: holds the target address and a
 * device reference until rt6_probe_deferred() runs.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
505
506 static void rt6_probe_deferred(struct work_struct *w)
507 {
508         struct in6_addr mcaddr;
509         struct __rt6_probe_work *work =
510                 container_of(w, struct __rt6_probe_work, work);
511
512         addrconf_addr_solict_mult(&work->target, &mcaddr);
513         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
514         dev_put(work->dev);
515         kfree(work);
516 }
517
/* Schedule a reachability probe of @rt's gateway when its neighbour entry
 * is missing or stale.  The actual NS transmit is deferred to a workqueue
 * (rt6_probe_deferred()); a device reference is taken for the work item.
 */
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		/* Already reachable: nothing to probe. */
		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* NOTE(review): __in6_dev_get() can return NULL; idev is
		 * dereferenced below without a check — confirm dev always
		 * has an inet6_dev here.
		 */
		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		/* Rate-limit: only probe once per rtr_probe_interval, and
		 * re-check nud_state under the neigh lock.
		 */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* No neighbour entry yet: always worth probing. */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);	/* released in rt6_probe_deferred() */
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
572 #else
/* Router reachability probing compiled out (!CONFIG_IPV6_ROUTER_PREF). */
static inline void rt6_probe(struct fib6_info *rt)
{
}
576 #endif
577
578 /*
579  * Default Router Selection (RFC 2461 6.3.6)
580  */
581 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
582 {
583         const struct net_device *dev = rt->fib6_nh.nh_dev;
584
585         if (!oif || dev->ifindex == oif)
586                 return 2;
587         return 0;
588 }
589
/* Classify next-hop reachability for route scoring.  Routes without a
 * gateway (or flagged RTF_NONEXTHOP) always count as reachable; otherwise
 * the neighbour cache is consulted under rcu_read_lock_bh().
 */
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		/* With router preference, only an outright NUD_FAILED entry
		 * demotes the route (to trigger a probe).
		 */
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		/* No cache entry: optimistic with router preference (probe
		 * later), otherwise ask for round-robin of siblings.
		 */
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
621
/* Score @rt for default-router selection: interface match (rt6_check_dev),
 * router preference bits, and — under RT6_LOOKUP_F_REACHABLE — neighbour
 * reachability.  Returns a non-negative score or a negative RT6_NUD_FAIL_*
 * value.
 */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* Router preference occupies bits above the device-match score. */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
639
/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		/* NOTE(review): __in6_dev_get() may return NULL but idev is
		 * dereferenced unconditionally — confirm dev always has an
		 * inet6_dev when reached from the fib lookup path.
		 */
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}
654
/* Compare @rt against the best candidate so far (@match with score *mpri)
 * and return whichever wins.  Dead, link-down (when not ignored) and
 * expired routes are skipped.  *do_rr is set when the winning route asked
 * for round-robin (RT6_NUD_FAIL_DO_RR).
 */
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
693
/* Round-robin scan of a fib6 node's leaf chain: score all routes with the
 * given @metric, starting at @rr_head and wrapping around via @leaf.
 * Routes with a different metric ("cont") are only considered when no
 * same-metric route matched at all.
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* First half of the wrap-around: rr_head to end of same metric. */
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* Second half: start of leaf up to (excluding) rr_head. */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* No same-metric match: fall back to the higher-metric tail. */
	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
732
/* Select the best route from fib6 node @fn for the given @oif and @strict
 * flags, advancing the node's round-robin pointer when requested by the
 * scoring pass.  Returns fib6_null_entry when nothing usable is found.
 * Caller holds rcu_read_lock.
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
782
783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
784 {
785         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
786 }
787
788 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option (RFC 4191) received in a Router
 * Advertisement on @dev from @gwaddr: validate the option, then add,
 * refresh or delete the corresponding RTF_ROUTEINFO route.  Returns 0 on
 * success or -EINVAL on a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* prefixes longer than 64 bits need at least 2 units of data */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* A zero-length prefix refers to the default router itself. */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		/* Existing route: refresh its preference bits. */
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
862 #endif
863
864 /*
865  *      Misc support functions
866  */
867
868 /* called with rcu_lock held */
869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
870 {
871         struct net_device *dev = rt->fib6_nh.nh_dev;
872
873         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
874                 /* for copies of local routes, dst->dev needs to be the
875                  * device if it is a master device, the master device if
876                  * device is enslaved, and the loopback as the default
877                  */
878                 if (netif_is_l3_slave(dev) &&
879                     !rt6_need_strict(&rt->fib6_dst.addr))
880                         dev = l3mdev_master_dev_rcu(dev);
881                 else if (!netif_is_l3_master(dev))
882                         dev = dev_net(dev)->loopback_dev;
883                 /* last case is netif_is_l3_master(dev) is true in which
884                  * case we want dev returned to be dev
885                  */
886         }
887
888         return dev;
889 }
890
/* Map each RTN_* route type to the dst.error value a packet matching a
 * route of that type should observe; 0 means the packet is handled
 * normally (forwarded or delivered locally).
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
905
/* Return the dst error code for a fib6 route type (see fib6_prop above);
 * 0 for types that forward or deliver normally.
 */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
910
911 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
912 {
913         unsigned short flags = 0;
914
915         if (rt->dst_nocount)
916                 flags |= DST_NOCOUNT;
917         if (rt->dst_nopolicy)
918                 flags |= DST_NOPOLICY;
919         if (rt->dst_host)
920                 flags |= DST_HOST;
921
922         return flags;
923 }
924
/* Initialize dst input/output handlers and dst.error for a rejecting
 * route (RTF_REJECT): packets hitting it are dropped with a
 * type-specific error.
 */
static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		/* silent drop, no ICMP error generated */
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}
946
/* Initialize the dst portion of @rt (error, input/output handlers,
 * lwtunnel state, lastuse) from the fib entry @ort it was created from.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	/* pick the input handler by destination kind: local delivery,
	 * multicast, or forwarding
	 */
	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		/* inherit the lightweight tunnel encapsulation state */
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
972
/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	/* link the cached dst to its originating fib entry and share
	 * (read-only) the fib entry's metrics
	 */
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
}
980
/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	/* copy routing state (dst handlers, addresses, gateway, flags)
	 * from the fib entry into the freshly allocated rt6_info
	 */
	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
}
998
/* Walk up the fib trie from @fn looking for the next node that carries
 * route info (RTN_RTINFO), descending into a parent's subtree (keyed by
 * @saddr) when one exists.  Returns NULL once the tree root is reached.
 * Runs under rcu_read_lock (uses rcu_dereference on parent pointers).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1016
/* Try to take a reference on *prt.  On success return true with *prt
 * unchanged.  On failure (entry being freed) return false and replace
 * *prt with the held null entry when @null_fallback is set, or NULL
 * otherwise.
 */
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}
1033
/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	/* take a reference on the fib entry for the lifetime of the new
	 * dst; bail out if the entry is already being freed
	 */
	if (!fib6_info_hold_safe(rt))
		return NULL;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);
	else
		/* allocation failed: drop the reference taken above */
		fib6_info_release(rt);

	return nrt;
}
1052
/* Policy-rule lookup backend: find the best fib entry in @table for
 * @fl6, preferring a cached (exception) dst when one exists, and return
 * a held rt6_info.  Falls back to the null entry when nothing matches.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	/* caller asked to skip nexthop device matching */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		/* no usable route in this node: climb toward the root */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		/* no cached dst: create one from the fib entry */
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}
1108
/* Public route lookup entry point: dispatch through the policy-rule
 * framework with ip6_pol_route_lookup as the table lookup backend.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1115
1116 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1117                             const struct in6_addr *saddr, int oif,
1118                             const struct sk_buff *skb, int strict)
1119 {
1120         struct flowi6 fl6 = {
1121                 .flowi6_oif = oif,
1122                 .daddr = *daddr,
1123         };
1124         struct dst_entry *dst;
1125         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1126
1127         if (saddr) {
1128                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1129                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1130         }
1131
1132         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1133         if (dst->error == 0)
1134                 return (struct rt6_info *) dst;
1135
1136         dst_release(dst);
1137
1138         return NULL;
1139 }
1140 EXPORT_SYMBOL(rt6_lookup);
1141
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes new route entry, the addition fails by any reason the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	/* fib6_add() requires tb6_lock held */
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}
1161
/* Insert @rt into its fib table with default netlink info (no
 * notification attributes beyond the namespace).
 */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
1168
/* Allocate an RTF_CACHE clone of fib entry @ort for the specific
 * destination @daddr (host /128 entry), used for per-destination
 * exceptions (PMTU, redirects).  Returns NULL on allocation failure or
 * if @ort is going away.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	/* narrow the clone to the single destination address */
	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1210
/* Allocate a per-cpu (RTF_PCPU) dst copy of fib entry @rt.  Returns
 * NULL on allocation failure or if @rt is going away.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	/* rcu protects the nexthop device lookup */
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		/* drop the reference taken above on failure */
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1232
/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	/* take a reference before returning the cached per-cpu dst;
	 * on failure ip6_hold_safe() NULLs pcpu_rt (no null fallback)
	 */
	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}
1246
/* Allocate and install the per-cpu dst copy for @rt on this cpu.
 * Returns the held per-cpu dst, or the held null entry on allocation
 * failure.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	/* the slot must have been empty: callers only get here after
	 * rt6_get_pcpu_route() returned no entry for this cpu
	 */
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
1265
1266 /* exception hash table implementation
1267  */
1268 static DEFINE_SPINLOCK(rt6_exception_lock);
1269
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	/* drop the reference the exception table held on the cached dst */
	dst_release(&rt6_ex->rt6i->dst);
	/* defer freeing until rcu readers traversing the chain are done */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}
1289
/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	/* oldest == smallest (earliest) stamp value */
	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}
1306
/* Hash a (dst, src) address pair into an exception-bucket index.
 * @src is only mixed in when subtree (source-based) routing is built in.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	/* per-boot random seed so bucket placement isn't fixed */
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1322
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	/* advance *bucket from the table base to the hashed slot; the
	 * caller relies on this side effect
	 */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1355
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	/* advance *bucket from the table base to the hashed slot; the
	 * caller relies on this side effect
	 */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	/* rcu-safe chain walk: entries are unlinked with hlist_del_rcu()
	 * and freed with kfree_rcu() in rt6_remove_exception()
	 */
	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1390
1391 static unsigned int fib6_mtu(const struct fib6_info *rt)
1392 {
1393         unsigned int mtu;
1394
1395         if (rt->fib6_pmtu) {
1396                 mtu = rt->fib6_pmtu;
1397         } else {
1398                 struct net_device *dev = fib6_info_nh_dev(rt);
1399                 struct inet6_dev *idev;
1400
1401                 rcu_read_lock();
1402                 idev = __in6_dev_get(dev);
1403                 mtu = idev->cnf.mtu6;
1404                 rcu_read_unlock();
1405         }
1406
1407         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1408
1409         return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1410 }
1411
/* Insert cached route @nrt into the exception table of its fib entry
 * @ort, allocating the bucket array on first use and evicting the
 * oldest entry when a bucket overflows.  On success, bumps the fib
 * node's serial number so stale cached dsts get revalidated.
 * Returns 0 or a negative errno.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* rt6_flush_exceptions() set this: don't recreate the bucket list */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		/* first exception for this fib entry: allocate the table */
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any existing exception for the same (daddr, saddr) */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1496
/* Remove every cached exception route attached to fib entry @rt and
 * mark the entry so no new exceptions can be inserted (used when the
 * fib entry itself is being deleted).
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		/* _safe variant: rt6_remove_exception() unlinks entries */
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1523
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	/* expired cached routes are treated as a miss; GC reaps them */
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1555
/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	/* NOTE(review): rt->from is rcu_dereference()d but no
	 * rcu_read_lock appears in this function — presumably all
	 * callers hold it; confirm against call sites.
	 */
	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1599
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	/* NOTE(review): rt->from is read plainly here, unlike the
	 * rcu_dereference() used in rt6_remove_exception_rt() —
	 * presumably safe in the callers' context; confirm.
	 */
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	/* mark the exception recently used so aging GC keeps it */
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}
1635
/* Clear the preferred source address on every cached exception of @rt
 * (walks all buckets).  Caller must hold rt6_exception_lock, per the
 * rcu_dereference_protected() check below.
 */
static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				/* plen == 0 means "no prefsrc set" */
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}
1654
1655 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1656                                          struct rt6_info *rt, int mtu)
1657 {
1658         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1659          * lowest MTU in the path: always allow updating the route PMTU to
1660          * reflect PMTU decreases.
1661          *
1662          * If the new MTU is higher, and the route PMTU is equal to the local
1663          * MTU, this means the old MTU is the lowest in the path, so allow
1664          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1665          * handle this.
1666          */
1667
1668         if (dst_mtu(&rt->dst) >= mtu)
1669                 return true;
1670
1671         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1672                 return true;
1673
1674         return false;
1675 }
1676
/* Propagate a device MTU change (@mtu) to the cached exceptions of
 * @rt, subject to rt6_mtu_change_route_allowed().  Caller must hold
 * rt6_exception_lock, per the rcu_dereference_protected() check below.
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1705
1706 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1707
/* Drop every cached gateway exception of @rt whose gateway address
 * equals @gateway (used when a router stops being valid).
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check: no bucket table, nothing to clean */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe variant: entries may be unlinked below */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1742
/* GC helper: examine one cached exception and remove it when it is
 * aged out, expired, or points at a gateway that is no longer a router;
 * otherwise count it in gc_args->more so GC keeps running.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		/* purge gateway exceptions whose neighbour entry no longer
		 * advertises itself as a router
		 */
		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1786
/* Walk every exception bucket of @rt and let rt6_age_examine_exception()
 * prune stale cached routes.  Called from the fib6 GC path.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* unlocked fast path: nothing to age without an exception table */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	/* rcu_read_lock_bh() covers the neighbour lookup done while
	 * examining entries; the spinlock serializes against concurrent
	 * exception insert/remove.
	 */
	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe variant: examine may unlink the entry */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1817
/* must be called with rcu lock held */
/* Look up the best fib6 entry for @fl6 in @table.  If nothing matches
 * at the deepest node, backtrack toward the root; if still nothing and
 * the caller required reachable routers, retry once from the original
 * node with the reachability requirement dropped.  May return
 * fib6_null_entry, never NULL.
 */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;	/* remembered for the relaxed retry below */

	/* caller asked to ignore the nexthop's output device */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
1849
/* Core IPv6 policy-routing lookup: find the fib entry for @fl6, pick a
 * multipath sibling when there is one, and return an rt6_info with a
 * reference held.  The result is, in order of preference: a cached
 * exception route, a one-off uncached RTF_CACHE clone (KNOWN_NH case),
 * or the per-cpu copy of the route.  On failure returns the (held)
 * null entry, never NULL.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* a non-forwarding host prefers (probably) reachable routers */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/*Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		/* take a reference while still under RCU protection */
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			/* allocation failed: fall back to the held null entry */
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* bh disabled so we stay on one cpu while touching the
		 * per-cpu route slot
		 */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1927
1928 static struct rt6_info *ip6_pol_route_input(struct net *net,
1929                                             struct fib6_table *table,
1930                                             struct flowi6 *fl6,
1931                                             const struct sk_buff *skb,
1932                                             int flags)
1933 {
1934         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1935 }
1936
1937 struct dst_entry *ip6_route_input_lookup(struct net *net,
1938                                          struct net_device *dev,
1939                                          struct flowi6 *fl6,
1940                                          const struct sk_buff *skb,
1941                                          int flags)
1942 {
1943         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1944                 flags |= RT6_LOOKUP_F_IFACE;
1945
1946         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1947 }
1948 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1949
/* Fill @keys with the L3 multipath-hash inputs (addresses, flow label,
 * next header) for @skb.  For ICMPv6 error messages the keys come from
 * the offending packet embedded in the ICMP payload, so the error takes
 * the same path as the flow that triggered it; in that case any
 * pre-dissected @flkeys are ignored since they describe the outer header.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	/* non-ICMPv6 traffic: hash on the outer header */
	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	/* only these error types carry the offending packet's header */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;	/* outer dissection no longer matches key_iph */
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
1997
/* if skb is set it will be used and fl6 can be NULL */
/* Compute the multipath hash for a flow according to the per-netns hash
 * policy: 0 = L3 (addresses + flow label + protocol), 1 = L4 five-tuple.
 * The top bit is reserved (result is shifted right by one).
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		/* L3 policy: addresses, flow label, next protocol */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		/* L4 policy: five-tuple */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			/* dissect now unless the caller already did */
			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
2054
/* Route an incoming skb: build a flowi6 from its IPv6 header, compute a
 * multipath hash for ICMPv6 so errors follow the flow that caused them,
 * then attach the looked-up dst to the skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* carry RX tunnel metadata (if any) into the flow key */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	/* reuse an early flow dissection when fib rules provide one */
	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2084
2085 static struct rt6_info *ip6_pol_route_output(struct net *net,
2086                                              struct fib6_table *table,
2087                                              struct flowi6 *fl6,
2088                                              const struct sk_buff *skb,
2089                                              int flags)
2090 {
2091         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2092 }
2093
2094 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2095                                          struct flowi6 *fl6, int flags)
2096 {
2097         bool any_src;
2098
2099         if (rt6_need_strict(&fl6->daddr)) {
2100                 struct dst_entry *dst;
2101
2102                 dst = l3mdev_link_scope_lookup(net, fl6);
2103                 if (dst)
2104                         return dst;
2105         }
2106
2107         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2108
2109         any_src = ipv6_addr_any(&fl6->saddr);
2110         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2111             (fl6->flowi6_oif && any_src))
2112                 flags |= RT6_LOOKUP_F_IFACE;
2113
2114         if (!any_src)
2115                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2116         else if (sk)
2117                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2118
2119         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2120 }
2121 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2122
/* Build a standalone blackhole replacement for @dst_orig: a dst that
 * silently discards everything sent through it, while keeping the
 * original's metrics, gateway, flags and route keys.  Consumes the
 * caller's reference on @dst_orig; returns the new dst or ERR_PTR.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* drop all traffic in both directions */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* not a per-cpu copy, whatever the original was */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2155
2156 /*
2157  *      Destination cache support functions
2158  */
2159
2160 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2161 {
2162         u32 rt_cookie = 0;
2163
2164         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2165                 return false;
2166
2167         if (fib6_check_expired(f6i))
2168                 return false;
2169
2170         return true;
2171 }
2172
2173 static struct dst_entry *rt6_check(struct rt6_info *rt,
2174                                    struct fib6_info *from,
2175                                    u32 cookie)
2176 {
2177         u32 rt_cookie = 0;
2178
2179         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2180             rt_cookie != cookie)
2181                 return NULL;
2182
2183         if (rt6_check_expired(rt))
2184                 return NULL;
2185
2186         return &rt->dst;
2187 }
2188
2189 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2190                                             struct fib6_info *from,
2191                                             u32 cookie)
2192 {
2193         if (!__rt6_check_expired(rt) &&
2194             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2195             fib6_check(from, cookie))
2196                 return &rt->dst;
2197         else
2198                 return NULL;
2199 }
2200
2201 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2202 {
2203         struct dst_entry *dst_ret;
2204         struct fib6_info *from;
2205         struct rt6_info *rt;
2206
2207         rt = container_of(dst, struct rt6_info, dst);
2208
2209         rcu_read_lock();
2210
2211         /* All IPV6 dsts are created with ->obsolete set to the value
2212          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2213          * into this function always.
2214          */
2215
2216         from = rcu_dereference(rt->from);
2217
2218         if (from && (rt->rt6i_flags & RTF_PCPU ||
2219             unlikely(!list_empty(&rt->rt6i_uncached))))
2220                 dst_ret = rt6_dst_from_check(rt, from, cookie);
2221         else
2222                 dst_ret = rt6_check(rt, from, cookie);
2223
2224         rcu_read_unlock();
2225
2226         return dst_ret;
2227 }
2228
/* dst_ops->negative_advice hook: an upper layer has judged this route
 * unhelpful.  Cached exception routes are only torn down once actually
 * expired; any other dst is released so the caller re-resolves.
 * Returns the dst to keep using, or NULL.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				/* unlink the expired exception */
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			/* drop the caller's reference outright */
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2248
2249 static void ip6_link_failure(struct sk_buff *skb)
2250 {
2251         struct rt6_info *rt;
2252
2253         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2254
2255         rt = (struct rt6_info *) skb_dst(skb);
2256         if (rt) {
2257                 rcu_read_lock();
2258                 if (rt->rt6i_flags & RTF_CACHE) {
2259                         if (dst_hold_safe(&rt->dst))
2260                                 rt6_remove_exception_rt(rt);
2261                 } else {
2262                         struct fib6_info *from;
2263                         struct fib6_node *fn;
2264
2265                         from = rcu_dereference(rt->from);
2266                         if (from) {
2267                                 fn = rcu_dereference(from->fib6_node);
2268                                 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2269                                         fn->fn_sernum = -1;
2270                         }
2271                 }
2272                 rcu_read_unlock();
2273         }
2274 }
2275
/* Arm an expiry timeout on @rt0 and flag it RTF_EXPIRES.  On the first
 * transition, dst.expires is seeded from the originating fib entry
 * before dst_set_expires() applies @timeout.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	/* order matters: expires must be seeded before the flag is set */
	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2291
/* Apply a learned path MTU to @rt, mark it as modified, and arm its
 * expiry (sysctl ip6_rt_mtu_expires) so the clamp eventually ages out.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
2300
2301 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2302 {
2303         bool from_set;
2304
2305         rcu_read_lock();
2306         from_set = !!rcu_dereference(rt->from);
2307         rcu_read_unlock();
2308
2309         return !(rt->rt6i_flags & RTF_CACHE) &&
2310                 (rt->rt6i_flags & RTF_PCPU || from_set);
2311 }
2312
/* Core PMTU update.  Address hints come from @iph (packet) or @sk
 * (socket), in that order.  A smaller MTU is either applied directly to
 * the dst (cached/per-cpu routes) or recorded as a new RTF_CACHE
 * exception cloned from the fib entry.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	/* administratively locked MTU: never override */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	/* the PTB proves two-way reachability of the nexthop */
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	/* only ever shrink the path MTU */
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_info *from;
		struct rt6_info *nrt6;

		rcu_read_lock();
		from = rcu_dereference(rt6->from);
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			/* non-zero return: insertion failed, free the clone */
			if (rt6_insert_exception(nrt6, from))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}
2357
2358 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2359                                struct sk_buff *skb, u32 mtu)
2360 {
2361         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2362 }
2363
2364 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2365                      int oif, u32 mark, kuid_t uid)
2366 {
2367         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2368         struct dst_entry *dst;
2369         struct flowi6 fl6;
2370
2371         memset(&fl6, 0, sizeof(fl6));
2372         fl6.flowi6_oif = oif;
2373         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2374         fl6.daddr = iph->daddr;
2375         fl6.saddr = iph->saddr;
2376         fl6.flowlabel = ip6_flowinfo(iph);
2377         fl6.flowi6_uid = uid;
2378
2379         dst = ip6_route_output(net, NULL, &fl6);
2380         if (!dst->error)
2381                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2382         dst_release(dst);
2383 }
2384 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2385
/* Apply a learned path MTU for @sk's flow, then refresh the socket's
 * cached dst if the update invalidated it.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	/* nothing more to do if the cached dst is absent or still valid */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	/* skip when the socket is user-owned or the peer is v4-mapped */
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2404
/* Cache @dst on @sk via ip6_dst_store().  The flow's daddr (and, with
 * subtrees, saddr) is pinned only when it matches the socket's own
 * address; otherwise NULL is passed so that address is not recorded.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2421
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* must be first: __ip6_route_redirect()
					 * casts the flowi6 pointer back to this
					 */
	struct in6_addr gateway;	/* router that sent the redirect */
};
2427
/* Validate and resolve an ICMPv6 redirect: find the route currently
 * used for the destination and accept the redirect only if it came from
 * that route's nexthop (RFC 4861).  Returns a held rt6_info.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	/* fl6 is really the embedded first member of an ip6rd_flowi */
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		/* redirect must arrive on the route's output device */
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	/* loop may finish with rt NULL (empty leaf) */
	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* no match at this node: walk back up the tree and retry */
	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		ip6_hold_safe(net, &ret, true);
	else
		/* materialize an rt6_info from the fib entry */
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return ret;
};
2507
2508 static struct dst_entry *ip6_route_redirect(struct net *net,
2509                                             const struct flowi6 *fl6,
2510                                             const struct sk_buff *skb,
2511                                             const struct in6_addr *gateway)
2512 {
2513         int flags = RT6_LOOKUP_F_HAS_SADDR;
2514         struct ip6rd_flowi rdfl;
2515
2516         rdfl.fl6 = *fl6;
2517         rdfl.gateway = *gateway;
2518
2519         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2520                                 flags, __ip6_route_redirect);
2521 }
2522
2523 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2524                   kuid_t uid)
2525 {
2526         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2527         struct dst_entry *dst;
2528         struct flowi6 fl6;
2529
2530         memset(&fl6, 0, sizeof(fl6));
2531         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2532         fl6.flowi6_oif = oif;
2533         fl6.flowi6_mark = mark;
2534         fl6.daddr = iph->daddr;
2535         fl6.saddr = iph->saddr;
2536         fl6.flowlabel = ip6_flowinfo(iph);
2537         fl6.flowi6_uid = uid;
2538
2539         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2540         rt6_do_redirect(dst, NULL, skb);
2541         dst_release(dst);
2542 }
2543 EXPORT_SYMBOL_GPL(ip6_redirect);
2544
2545 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2546                             u32 mark)
2547 {
2548         const struct ipv6hdr *iph = ipv6_hdr(skb);
2549         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2550         struct dst_entry *dst;
2551         struct flowi6 fl6;
2552
2553         memset(&fl6, 0, sizeof(fl6));
2554         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2555         fl6.flowi6_oif = oif;
2556         fl6.flowi6_mark = mark;
2557         fl6.daddr = msg->dest;
2558         fl6.saddr = iph->daddr;
2559         fl6.flowi6_uid = sock_net_uid(net, NULL);
2560
2561         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2562         rt6_do_redirect(dst, NULL, skb);
2563         dst_release(dst);
2564 }
2565
2566 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2567 {
2568         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2569                      sk->sk_uid);
2570 }
2571 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2572
2573 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2574 {
2575         struct net_device *dev = dst->dev;
2576         unsigned int mtu = dst_mtu(dst);
2577         struct net *net = dev_net(dev);
2578
2579         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2580
2581         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2582                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2583
2584         /*
2585          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2586          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2587          * IPV6_MAXPLEN is also valid and means: "any MSS,
2588          * rely only on pmtu discovery"
2589          */
2590         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2591                 mtu = IPV6_MAXPLEN;
2592         return mtu;
2593 }
2594
2595 static unsigned int ip6_mtu(const struct dst_entry *dst)
2596 {
2597         struct inet6_dev *idev;
2598         unsigned int mtu;
2599
2600         mtu = dst_metric_raw(dst, RTAX_MTU);
2601         if (mtu)
2602                 goto out;
2603
2604         mtu = IPV6_MIN_MTU;
2605
2606         rcu_read_lock();
2607         idev = __in6_dev_get(dst->dev);
2608         if (idev)
2609                 mtu = idev->cnf.mtu6;
2610         rcu_read_unlock();
2611
2612 out:
2613         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2614
2615         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2616 }
2617
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct in6_addr *src_key;
	struct inet6_dev *idev;
	u32 mtu = 0;

	/* 1. An administratively locked MTU metric always wins. */
	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	/* Source-specific routes key their exceptions by saddr as well. */
	if (f6i->fib6_src.plen)
		src_key = saddr;
#endif

	/* 2. A still-valid PMTU exception cached for this destination. */
	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

	/* 3. Fall back to the nexthop device MTU, never below IPV6_MIN_MTU. */
	if (likely(!mtu)) {
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	/* Reserve headroom consumed by a lightweight tunnel, if any. */
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}
2665
/* Allocate a standalone (uncached) host route for the flow in @fl6,
 * egressing via @dev.  The route is not inserted into any FIB table;
 * it is tracked on the uncached list only so device teardown can find
 * it.  Returns a dst (possibly transformed by xfrm) or an ERR_PTR.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		/* drop the idev reference taken above before bailing out */
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;	/* host route: full destination match */
	rt->rt6i_idev     = idev;	/* ownership of idev ref moves to rt */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	/* run the dst through xfrm policy; may wrap or replace it */
	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2704
/* dst_ops->gc: garbage-collect IPv6 routing cache entries.
 * Skips the run entirely if the minimum GC interval has not elapsed
 * and the entry count is within bounds.  ip6_rt_gc_expire ramps up on
 * successive runs (more aggressive expiry) and decays by the
 * elasticity shift each call.  Returns non-zero while the table is
 * still over rt_max_size.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* decay the expiry pressure; halves (for elasticity 1) each call */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2729
2730 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2731                                struct fib6_config *cfg)
2732 {
2733         struct dst_metrics *p;
2734
2735         if (!cfg->fc_mx)
2736                 return 0;
2737
2738         p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2739         if (unlikely(!p))
2740                 return -ENOMEM;
2741
2742         refcount_set(&p->refcnt, 1);
2743         rt->fib6_metrics = p;
2744
2745         return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2746 }
2747
2748 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2749                                             struct fib6_config *cfg,
2750                                             const struct in6_addr *gw_addr,
2751                                             u32 tbid, int flags)
2752 {
2753         struct flowi6 fl6 = {
2754                 .flowi6_oif = cfg->fc_ifindex,
2755                 .daddr = *gw_addr,
2756                 .saddr = cfg->fc_prefsrc,
2757         };
2758         struct fib6_table *table;
2759         struct rt6_info *rt;
2760
2761         table = fib6_get_table(net, tbid);
2762         if (!table)
2763                 return NULL;
2764
2765         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2766                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2767
2768         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2769         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2770
2771         /* if table lookup failed, fall back to full lookup */
2772         if (rt == net->ipv6.ip6_null_entry) {
2773                 ip6_rt_put(rt);
2774                 rt = NULL;
2775         }
2776
2777         return rt;
2778 }
2779
2780 static int ip6_route_check_nh_onlink(struct net *net,
2781                                      struct fib6_config *cfg,
2782                                      const struct net_device *dev,
2783                                      struct netlink_ext_ack *extack)
2784 {
2785         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2786         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2787         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2788         struct rt6_info *grt;
2789         int err;
2790
2791         err = 0;
2792         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2793         if (grt) {
2794                 if (!grt->dst.error &&
2795                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2796                         NL_SET_ERR_MSG(extack,
2797                                        "Nexthop has invalid gateway or device mismatch");
2798                         err = -EINVAL;
2799                 }
2800
2801                 ip6_rt_put(grt);
2802         }
2803
2804         return err;
2805 }
2806
/* Resolve/verify the egress device for a gateway nexthop by routing
 * to the gateway address itself.  Tries cfg->fc_table first (if set),
 * then a full rt6_lookup().  Returns 0 only when a connected
 * (non-RTF_GATEWAY) route to the gateway is found on an acceptable
 * device; -EHOSTUNREACH otherwise.  When *_dev was NULL on entry it
 * and *idev are filled from the found route with references held.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* a table hit through a gateway, or on the wrong
			 * device, is unusable: drop it and retry below
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		/* caller pinned a device: the route must use it */
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* adopt device and idev from the route, taking references */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2857
/* Validate the gateway of a new route and resolve the egress device.
 * Rejects local gateway addresses, enforces link-local (or, as an
 * exception, unicast/IPv4-mapped) nexthops, and delegates device
 * resolution to ip6_route_check_nh{,_onlink}().  *_dev/*idev may be
 * updated by that resolution.  Returns 0 on success or a negative
 * errno with an extack message set.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* only link-local gateways may match an address on another device */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2930
/* Build a fib6_info from a netlink/ioctl route configuration without
 * inserting it into any table.  Validates flags, prefix lengths,
 * device and gateway; resolves the FIB table; fills in metrics,
 * expiry, lwtunnel state and nexthop.  Returns the new (held)
 * fib6_info with rt->fib6_table set, or an ERR_PTR.  On success the
 * dev reference is transferred to rt->fib6_nh.nh_dev; on failure all
 * acquired references are dropped here.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	/* resolve and pin the requested device and its inet6 state */
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* onlink nexthops require an explicit, running device */
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	/* addrconf-generated routes are not counted against the dst limit */
	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	err = ip6_convert_metrics(net, rt, cfg);
	if (err < 0)
		goto out;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* optional lightweight-tunnel encapsulation state */
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif

	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* may replace dev/idev with the resolved egress device */
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	/* a preferred source address must be assigned to the egress device */
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	rt->fib6_flags = cfg->fc_flags;

install_route:
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	rt->fib6_nh.nh_dev = dev;
	rt->fib6_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	fib6_info_release(rt);
	return ERR_PTR(err);
}
3153
3154 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3155                   struct netlink_ext_ack *extack)
3156 {
3157         struct fib6_info *rt;
3158         int err;
3159
3160         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3161         if (IS_ERR(rt))
3162                 return PTR_ERR(rt);
3163
3164         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3165         fib6_info_release(rt);
3166
3167         return err;
3168 }
3169
3170 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3171 {
3172         struct net *net = info->nl_net;
3173         struct fib6_table *table;
3174         int err;
3175
3176         if (rt == net->ipv6.fib6_null_entry) {
3177                 err = -ENOENT;
3178                 goto out;
3179         }
3180
3181         table = rt->fib6_table;
3182         spin_lock_bh(&table->tb6_lock);
3183         err = fib6_del(rt, info);
3184         spin_unlock_bh(&table->tb6_lock);
3185
3186 out:
3187         fib6_info_release(rt);
3188         return err;
3189 }
3190
3191 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3192 {
3193         struct nl_info info = { .nl_net = net };
3194
3195         return __ip6_del_rt(rt, &info);
3196 }
3197
/* Delete @rt and, when fc_delete_all_nh is set, all of its multipath
 * siblings in one pass under the table lock.  Tries to emit a single
 * RTM_DELROUTE notification covering every hop; falls back to the
 * per-delete notifications if the skb cannot be built.  Consumes the
 * caller's reference on @rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				/* suppress per-route notifications below */
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* send the combined notification outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3249
/* Delete a cached (exception) route if it matches the ifindex and
 * gateway filters in @cfg.  Returns -ESRCH when the filters do not
 * match or the dst could not be safely held; otherwise the result of
 * rt6_remove_exception_rt().
 */
static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
	int rc = -ESRCH;

	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
		goto out;

	if (cfg->fc_flags & RTF_GATEWAY &&
	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
		goto out;
	/* take a reference before removal; skip if the dst is being freed */
	if (dst_hold_safe(&rt->dst))
		rc = rt6_remove_exception_rt(rt);
out:
	return rc;
}
3265
/* Delete the route(s) described by @cfg.  With RTF_CACHE, only cached
 * exception routes are considered; otherwise the first fib6_info at
 * the located node matching the ifindex/gateway/metric/protocol
 * filters is deleted (all siblings too unless a gateway was given).
 * Returns 0 on success or -ESRCH when nothing matched.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* for RTF_CACHE, match on any node (exact_match == false) so the
	 * covering route's exception table can be searched
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}
			if (cfg->fc_ifindex &&
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* hold rt so it survives dropping the RCU lock */
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3330
3331 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3332 {
3333         struct netevent_redirect netevent;
3334         struct rt6_info *rt, *nrt = NULL;
3335         struct ndisc_options ndopts;
3336         struct inet6_dev *in6_dev;
3337         struct neighbour *neigh;
3338         struct fib6_info *from;
3339         struct rd_msg *msg;
3340         int optlen, on_link;
3341         u8 *lladdr;
3342
3343         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3344         optlen -= sizeof(*msg);
3345
3346         if (optlen < 0) {
3347                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3348                 return;
3349         }
3350
3351         msg = (struct rd_msg *)icmp6_hdr(skb);
3352
3353         if (ipv6_addr_is_multicast(&msg->dest)) {
3354                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3355                 return;
3356         }
3357
3358         on_link = 0;
3359         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3360                 on_link = 1;
3361         } else if (ipv6_addr_type(&msg->target) !=
3362                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3363                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3364                 return;
3365         }
3366
3367         in6_dev = __in6_dev_get(skb->dev);
3368         if (!in6_dev)
3369                 return;
3370         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3371                 return;
3372
3373         /* RFC2461 8.1:
3374          *      The IP source address of the Redirect MUST be the same as the current
3375          *      first-hop router for the specified ICMP Destination Address.
3376          */
3377
3378         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3379                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3380                 return;
3381         }
3382
3383         lladdr = NULL;
3384         if (ndopts.nd_opts_tgt_lladdr) {
3385                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3386                                              skb->dev);
3387                 if (!lladdr) {
3388                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3389                         return;
3390                 }
3391         }
3392
3393         rt = (struct rt6_info *) dst;
3394         if (rt->rt6i_flags & RTF_REJECT) {
3395                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3396                 return;
3397         }
3398
3399         /* Redirect received -> path was valid.
3400          * Look, redirects are sent only in response to data packets,
3401          * so that this nexthop apparently is reachable. --ANK
3402          */
3403         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3404
3405         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3406         if (!neigh)
3407                 return;
3408
3409         /*
3410          *      We have finally decided to accept it.
3411          */
3412
3413         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3414                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3415                      NEIGH_UPDATE_F_OVERRIDE|
3416                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3417                                      NEIGH_UPDATE_F_ISROUTER)),
3418                      NDISC_REDIRECT, &ndopts);
3419
3420         rcu_read_lock();
3421         from = rcu_dereference(rt->from);
3422         /* This fib6_info_hold() is safe here because we hold reference to rt
3423          * and rt already holds reference to fib6_info.
3424          */
3425         fib6_info_hold(from);
3426         rcu_read_unlock();
3427
3428         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3429         if (!nrt)
3430                 goto out;
3431
3432         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3433         if (on_link)
3434                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3435
3436         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3437
3438         /* No need to remove rt from the exception table if rt is
3439          * a cached route because rt6_insert_exception() will
3440          * takes care of it
3441          */
3442         if (rt6_insert_exception(nrt, from)) {
3443                 dst_release_immediate(&nrt->dst);
3444                 goto out;
3445         }
3446
3447         netevent.old = &rt->dst;
3448         netevent.new = &nrt->dst;
3449         netevent.daddr = &msg->dest;
3450         netevent.neigh = neigh;
3451         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3452
3453 out:
3454         fib6_info_release(from);
3455         neigh_release(neigh);
3456 }
3457
3458 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA route-information route (RFC 4191) learned on @dev with
 * gateway @gwaddr for @prefix/@prefixlen.  On success the entry is
 * returned with a reference held via fib6_info_hold_safe(); returns
 * NULL when no matching route exists.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	/* Route-info entries live in the l3mdev table when the device is
	 * enslaved, otherwise in RT6_TABLE_INFO.
	 */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* final 'true' requests an exact-match node lookup — see fib6_locate() */
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
			continue;
		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
			continue;
		/* skip entries whose refcount already dropped to zero */
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3494
3495 static struct fib6_info *rt6_add_route_info(struct net *net,
3496                                            const struct in6_addr *prefix, int prefixlen,
3497                                            const struct in6_addr *gwaddr,
3498                                            struct net_device *dev,
3499                                            unsigned int pref)
3500 {
3501         struct fib6_config cfg = {
3502                 .fc_metric      = IP6_RT_PRIO_USER,
3503                 .fc_ifindex     = dev->ifindex,
3504                 .fc_dst_len     = prefixlen,
3505                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3506                                   RTF_UP | RTF_PREF(pref),
3507                 .fc_protocol = RTPROT_RA,
3508                 .fc_type = RTN_UNICAST,
3509                 .fc_nlinfo.portid = 0,
3510                 .fc_nlinfo.nlh = NULL,
3511                 .fc_nlinfo.nl_net = net,
3512         };
3513
3514         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3515         cfg.fc_dst = *prefix;
3516         cfg.fc_gateway = *gwaddr;
3517
3518         /* We should treat it as a default route if prefix length is 0. */
3519         if (!prefixlen)
3520                 cfg.fc_flags |= RTF_DEFAULT;
3521
3522         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3523
3524         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3525 }
3526 #endif
3527
/* Find the RA-learned default route whose gateway is @addr on @dev.
 * Returns the entry with a reference held, or NULL if absent or if the
 * entry is already being freed (fib6_info_hold_safe() failed).
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	/* rt is NULL here when the walk exhausted the list without a match */
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
3552
/* Install an RA-learned default route through @gwaddr on @dev with
 * router preference @pref, flag the owning table as holding a default
 * router, and return the route with a reference held (NULL if the add
 * did not take effect).
 */
struct fib6_info *rt6_add_dflt_router(struct net *net,
				     const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
		struct fib6_table *table;

		/* Remember that this table now holds an RA default router,
		 * so rt6_purge_dflt_routers() knows to scan it.
		 */
		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(net, gwaddr, dev);
}
3583
/* Delete every RA-learned router route in @table, except on interfaces
 * configured with accept_ra == 2 (accept RAs even while forwarding).
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			/* The RCU walk cannot continue once we unlock, so
			 * delete outside the read section and rescan from
			 * the top of the table.
			 */
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3607
3608 void rt6_purge_dflt_routers(struct net *net)
3609 {
3610         struct fib6_table *table;
3611         struct hlist_head *head;
3612         unsigned int h;
3613
3614         rcu_read_lock();
3615
3616         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3617                 head = &net->ipv6.fib_table_hash[h];
3618                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3619                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3620                                 __rt6_purge_dflt_routers(net, table);
3621                 }
3622         }
3623
3624         rcu_read_unlock();
3625 }
3626
3627 static void rtmsg_to_fib6_config(struct net *net,
3628                                  struct in6_rtmsg *rtmsg,
3629                                  struct fib6_config *cfg)
3630 {
3631         memset(cfg, 0, sizeof(*cfg));
3632
3633         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3634                          : RT6_TABLE_MAIN;
3635         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3636         cfg->fc_metric = rtmsg->rtmsg_metric;
3637         cfg->fc_expires = rtmsg->rtmsg_info;
3638         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3639         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3640         cfg->fc_flags = rtmsg->rtmsg_flags;
3641         cfg->fc_type = rtmsg->rtmsg_type;
3642
3643         cfg->fc_nlinfo.nl_net = net;
3644
3645         cfg->fc_dst = rtmsg->rtmsg_dst;
3646         cfg->fc_src = rtmsg->rtmsg_src;
3647         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3648 }
3649
3650 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3651 {
3652         struct fib6_config cfg;
3653         struct in6_rtmsg rtmsg;
3654         int err;
3655
3656         switch (cmd) {
3657         case SIOCADDRT:         /* Add a route */
3658         case SIOCDELRT:         /* Delete a route */
3659                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3660                         return -EPERM;
3661                 err = copy_from_user(&rtmsg, arg,
3662                                      sizeof(struct in6_rtmsg));
3663                 if (err)
3664                         return -EFAULT;
3665
3666                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3667
3668                 rtnl_lock();
3669                 switch (cmd) {
3670                 case SIOCADDRT:
3671                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3672                         break;
3673                 case SIOCDELRT:
3674                         err = ip6_route_del(&cfg, NULL);
3675                         break;
3676                 default:
3677                         err = -EINVAL;
3678                 }
3679                 rtnl_unlock();
3680
3681                 return err;
3682         }
3683
3684         return -EINVAL;
3685 }
3686
3687 /*
3688  *      Drop the packet on the floor
3689  */
3690
/* Account a routing failure in SNMP counters, send an ICMPv6
 * destination-unreachable with @code back to the sender, and free the
 * packet.  @ipstats_mib_noroutes selects the input vs output counter.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* unspecified destination counts as an address
			 * error, not a routing failure
			 */
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	/* notify the sender, then drop */
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3714
/* dst input handler: drop with ICMPV6_NOROUTE, counted as input no-route */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3719
3720 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3721 {
3722         skb->dev = skb_dst(skb)->dev;
3723         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3724 }
3725
/* dst input handler: drop with ICMPV6_ADM_PROHIBITED, input no-route stat */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3730
3731 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3732 {
3733         skb->dev = skb_dst(skb)->dev;
3734         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3735 }
3736
3737 /*
3738  *      Allocate a dst for local (unicast / anycast) address.
3739  */
3740
/* Build a kernel-originated host route (/128) for a local unicast or
 * anycast address @addr on @idev.  The entry goes in the l3mdev table
 * when the device is enslaved, otherwise RT6_TABLE_LOCAL.  Holds a
 * reference on the device.  Returns ERR_PTR(-ENOMEM) on failure.
 */
struct fib6_info *addrconf_f6i_alloc(struct net *net,
				     struct inet6_dev *idev,
				     const struct in6_addr *addr,
				     bool anycast, gfp_t gfp_flags)
{
	u32 tb_id;
	struct net_device *dev = idev->dev;
	struct fib6_info *f6i;

	f6i = fib6_info_alloc(gfp_flags);
	if (!f6i)
		return ERR_PTR(-ENOMEM);

	f6i->dst_nocount = true;
	f6i->dst_host = true;
	f6i->fib6_protocol = RTPROT_KERNEL;
	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast) {
		f6i->fib6_type = RTN_ANYCAST;
		f6i->fib6_flags |= RTF_ANYCAST;
	} else {
		f6i->fib6_type = RTN_LOCAL;
		f6i->fib6_flags |= RTF_LOCAL;
	}

	/* gateway and destination are both the address itself (host route) */
	f6i->fib6_nh.nh_gw = *addr;
	dev_hold(dev);
	f6i->fib6_nh.nh_dev = dev;
	f6i->fib6_dst.addr = *addr;
	f6i->fib6_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	f6i->fib6_table = fib6_get_table(net, tb_id);

	return f6i;
}
3776
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL matches any */
	struct net *net;	/* namespace owning the routing tables */
	struct in6_addr *addr;	/* the preferred-source address being removed */
};
3783
3784 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3785 {
3786         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3787         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3788         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3789
3790         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3791             rt != net->ipv6.fib6_null_entry &&
3792             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3793                 spin_lock_bh(&rt6_exception_lock);
3794                 /* remove prefsrc entry */
3795                 rt->fib6_prefsrc.plen = 0;
3796                 /* need to update cache as well */
3797                 rt6_exceptions_remove_prefsrc(rt);
3798                 spin_unlock_bh(&rt6_exception_lock);
3799         }
3800         return 0;
3801 }
3802
3803 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3804 {
3805         struct net *net = dev_net(ifp->idev->dev);
3806         struct arg_dev_net_ip adni = {
3807                 .dev = ifp->idev->dev,
3808                 .net = net,
3809                 .addr = &ifp->addr,
3810         };
3811         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3812 }
3813
3814 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3815
3816 /* Remove routers and update dst entries when gateway turn into host. */
3817 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3818 {
3819         struct in6_addr *gateway = (struct in6_addr *)arg;
3820
3821         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3822             ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3823                 return -1;
3824         }
3825
3826         /* Further clean up cached routes in exception table.
3827          * This is needed because cached route may have a different
3828          * gateway than its 'parent' in the case of an ip redirect.
3829          */
3830         rt6_exceptions_clean_tohost(rt, gateway);
3831
3832         return 0;
3833 }
3834
/* Walk all tables removing router routes through @gateway after it
 * turned into a plain host (see fib6_clean_tohost()).
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3839
/* Argument for the fib6_ifup()/fib6_ifdown() table walks: the device,
 * plus either the nexthop flags to clear (sync-up path) or the netdev
 * notifier event (sync-down path).
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};
3847
/* Return the first route in @rt's fib6_node leaf list with the same
 * metric that qualifies for ECMP, i.e. the head of rt's multipath
 * group, or NULL.  Must be called with the table lock held — the
 * dereferences are lockdep-checked against tb6_lock.
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
3867
3868 static bool rt6_is_dead(const struct fib6_info *rt)
3869 {
3870         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3871             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3872              fib6_ignore_linkdown(rt)))
3873                 return true;
3874
3875         return false;
3876 }
3877
3878 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3879 {
3880         struct fib6_info *iter;
3881         int total = 0;
3882
3883         if (!rt6_is_dead(rt))
3884                 total += rt->fib6_nh.nh_weight;
3885
3886         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3887                 if (!rt6_is_dead(iter))
3888                         total += iter->fib6_nh.nh_weight;
3889         }
3890
3891         return total;
3892 }
3893
/* Assign rt's nexthop selection upper bound from the running cumulative
 * weight.  @weight accumulates across the sibling walk; @total is the
 * group's total usable weight.
 */
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		/* 31-bit fixed-point share: (cumulative/total) * 2^31 - 1 */
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	/* Dead nexthops keep -1.  NOTE(review): presumably so they never
	 * match a hash threshold during selection — confirm at the lookup
	 * site that consumes nh_upper_bound.
	 */
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}
3905
/* Walk rt and all of its siblings in order, assigning each nexthop its
 * upper bound from the cumulative weight (see rt6_upper_bound_set()).
 */
static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
	struct fib6_info *iter;
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		rt6_upper_bound_set(iter, &weight, total);
}
3916
/* Recompute nexthop selection upper bounds for rt's multipath group
 * after a nexthop changed state.  No-op for single-path routes and for
 * groups already marked for flushing.
 */
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
3940
3941 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3942 {
3943         const struct arg_netdev_event *arg = p_arg;
3944         struct net *net = dev_net(arg->dev);
3945
3946         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3947                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3948                 fib6_update_sernum_upto_root(net, rt);
3949                 rt6_multipath_rebalance(rt);
3950         }
3951
3952         return 0;
3953 }
3954
/* Device came (back) up: clear @nh_flags on every nexthop using @dev.
 * When reviving DEAD nexthops and the carrier is already up, clear
 * LINKDOWN in the same pass.
 */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	/* reads as (nh_flags & RTNH_F_DEAD) && netif_carrier_ok(dev) */
	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
3969
3970 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3971                                    const struct net_device *dev)
3972 {
3973         struct fib6_info *iter;
3974
3975         if (rt->fib6_nh.nh_dev == dev)
3976                 return true;
3977         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3978                 if (iter->fib6_nh.nh_dev == dev)
3979                         return true;
3980
3981         return false;
3982 }
3983
/* Mark rt and all of its siblings for flushing so that fib6_ifdown()
 * deletes the whole multipath route as one unit.
 */
static void rt6_multipath_flush(struct fib6_info *rt)
{
	struct fib6_info *iter;

	rt->should_flush = 1;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		iter->should_flush = 1;
}
3992
3993 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3994                                              const struct net_device *down_dev)
3995 {
3996         struct fib6_info *iter;
3997         unsigned int dead = 0;
3998
3999         if (rt->fib6_nh.nh_dev == down_dev ||
4000             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4001                 dead++;
4002         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4003                 if (iter->fib6_nh.nh_dev == down_dev ||
4004                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
4005                         dead++;
4006
4007         return dead;
4008 }
4009
/* Set @nh_flags on every nexthop of rt's multipath group (rt included)
 * that uses @dev.
 */
static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
				       const struct net_device *dev,
				       unsigned int nh_flags)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.nh_dev == dev)
		rt->fib6_nh.nh_flags |= nh_flags;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			iter->fib6_nh.nh_flags |= nh_flags;
}
4022
/* fib6_clean_all() callback for device-down events.  A return of -1
 * asks the walker to delete the route; 0 leaves it alone.
 * called with write lock held for table with rt
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* device is going away entirely: delete any route on it */
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		/* single-path routes on the device are simply deleted */
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			/* every nexthop dead: flush the whole ECMP route */
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			/* otherwise mark the affected nexthops dead and
			 * rebalance what remains
			 */
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		/* NOTE(review): -2 is a distinct code interpreted by the
		 * fib6 walker, not a plain delete — confirm semantics in
		 * fib6_clean_node().
		 */
		return -2;
	case NETDEV_CHANGE:
		/* carrier change: flag the nexthop link-down unless the
		 * route is local/anycast or on another device
		 */
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
4066
/* Apply fib6_ifdown() for netdev @event to every table in @dev's
 * namespace.
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};

	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}
4078
/* IPv6 is being disabled on @dev: sync routes down for @event, flush
 * uncached dst entries referencing the device, and drop its IPv6
 * neighbour cache entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4085
/* Argument for rt6_mtu_change_route(): the device whose MTU changed
 * and the new MTU value.
 */
struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;
};
4090
/* fib6_clean_all() callback: propagate an administrative MTU change on
 * arg->dev into route metrics and cached exception routes.  Always
 * returns 0 (never deletes routes).
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* update when shrinking, or when growing from a value that
		 * tracked the old device MTU
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4125
/* Device MTU changed: walk all tables updating route MTU metrics
 * (see rt6_mtu_change_route()).
 */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
4135
/* Netlink attribute validation policy for AF_INET6 RTM_*ROUTE requests */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
4155
4156 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4157                               struct fib6_config *cfg,
4158                               struct netlink_ext_ack *extack)
4159 {
4160         struct rtmsg *rtm;
4161         struct nlattr *tb[RTA_MAX+1];
4162         unsigned int pref;
4163         int err;
4164
4165         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4166                           NULL);
4167         if (err < 0)
4168                 goto errout;
4169
4170         err = -EINVAL;
4171         rtm = nlmsg_data(nlh);
4172         memset(cfg, 0, sizeof(*cfg));
4173
4174         cfg->fc_table = rtm->rtm_table;
4175         cfg->fc_dst_len = rtm->rtm_dst_len;
4176         cfg->fc_src_len = rtm->rtm_src_len;
4177         cfg->fc_flags = RTF_UP;
4178         cfg->fc_protocol = rtm->rtm_protocol;
4179         cfg->fc_type = rtm->rtm_type;
4180
4181         if (rtm->rtm_type == RTN_UNREACHABLE ||
4182             rtm->rtm_type == RTN_BLACKHOLE ||
4183             rtm->rtm_type == RTN_PROHIBIT ||
4184             rtm->rtm_type == RTN_THROW)
4185                 cfg->fc_flags |= RTF_REJECT;
4186
4187         if (rtm->rtm_type == RTN_LOCAL)
4188                 cfg->fc_flags |= RTF_LOCAL;
4189
4190         if (rtm->rtm_flags & RTM_F_CLONED)
4191                 cfg->fc_flags |= RTF_CACHE;
4192
4193         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4194
4195         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4196         cfg->fc_nlinfo.nlh = nlh;
4197         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4198
4199         if (tb[RTA_GATEWAY]) {
4200                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4201                 cfg->fc_flags |= RTF_GATEWAY;
4202         }
4203
4204         if (tb[RTA_DST]) {
4205                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4206
4207                 if (nla_len(tb[RTA_DST]) < plen)
4208                         goto errout;
4209
4210                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4211         }
4212
4213         if (tb[RTA_SRC]) {
4214                 int plen = (rtm->rtm_src_len + 7) >> 3;
4215
4216                 if (nla_len(tb[RTA_SRC]) < plen)
4217                         goto errout;
4218
4219                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4220         }
4221
4222         if (tb[RTA_PREFSRC])
4223                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4224
4225         if (tb[RTA_OIF])
4226                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4227
4228         if (tb[RTA_PRIORITY])
4229                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4230
4231         if (tb[RTA_METRICS]) {
4232                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4233                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4234         }
4235
4236         if (tb[RTA_TABLE])
4237                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4238
4239         if (tb[RTA_MULTIPATH]) {
4240                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4241                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4242
4243                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4244                                                      cfg->fc_mp_len, extack);
4245                 if (err < 0)
4246                         goto errout;
4247         }
4248
4249         if (tb[RTA_PREF]) {
4250                 pref = nla_get_u8(tb[RTA_PREF]);
4251                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4252                     pref != ICMPV6_ROUTER_PREF_HIGH)
4253                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4254                 cfg->fc_flags |= RTF_PREF(pref);
4255         }
4256
4257         if (tb[RTA_ENCAP])
4258                 cfg->fc_encap = tb[RTA_ENCAP];
4259
4260         if (tb[RTA_ENCAP_TYPE]) {
4261                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4262
4263                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4264                 if (err < 0)
4265                         goto errout;
4266         }
4267
4268         if (tb[RTA_EXPIRES]) {
4269                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4270
4271                 if (addrconf_finite_timeout(timeout)) {
4272                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4273                         cfg->fc_flags |= RTF_EXPIRES;
4274                 }
4275         }
4276
4277         err = 0;
4278 errout:
4279         return err;
4280 }
4281
/* Per-nexthop bookkeeping used while processing an RTA_MULTIPATH add:
 * pairs the fib6_info built for one nexthop with the fib6_config used
 * to create it (the config is kept so the route can be deleted again
 * if a later nexthop fails to insert).
 */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route entry built for this nexthop */
	struct fib6_config r_cfg;	/* config used to build fib6_info */
	struct list_head next;		/* linkage in caller's rt6_nh_list */
};
4287
4288 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4289 {
4290         struct rt6_nh *nh;
4291
4292         list_for_each_entry(nh, rt6_nh_list, next) {
4293                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4294                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4295                         nh->r_cfg.fc_ifindex);
4296         }
4297 }
4298
4299 static int ip6_route_info_append(struct net *net,
4300                                  struct list_head *rt6_nh_list,
4301                                  struct fib6_info *rt,
4302                                  struct fib6_config *r_cfg)
4303 {
4304         struct rt6_nh *nh;
4305         int err = -EEXIST;
4306
4307         list_for_each_entry(nh, rt6_nh_list, next) {
4308                 /* check if fib6_info already exists */
4309                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4310                         return err;
4311         }
4312
4313         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4314         if (!nh)
4315                 return -ENOMEM;
4316         nh->fib6_info = rt;
4317         err = ip6_convert_metrics(net, rt, r_cfg);
4318         if (err) {
4319                 kfree(nh);
4320                 return err;
4321         }
4322         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4323         list_add_tail(&nh->next, rt6_nh_list);
4324
4325         return 0;
4326 }
4327
4328 static void ip6_route_mpath_notify(struct fib6_info *rt,
4329                                    struct fib6_info *rt_last,
4330                                    struct nl_info *info,
4331                                    __u16 nlflags)
4332 {
4333         /* if this is an APPEND route, then rt points to the first route
4334          * inserted and rt_last points to last route inserted. Userspace
4335          * wants a consistent dump of the route which starts at the first
4336          * nexthop. Since sibling routes are always added at the end of
4337          * the list, find the first sibling of the last route appended
4338          */
4339         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4340                 rt = list_first_entry(&rt_last->fib6_siblings,
4341                                       struct fib6_info,
4342                                       fib6_siblings);
4343         }
4344
4345         if (rt)
4346                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4347 }
4348
/* Handle an RTM_NEWROUTE request carrying RTA_MULTIPATH: build one
 * fib6_info per nexthop, insert them all, and emit a single notification
 * covering the whole route.  On any insertion failure the nexthops that
 * were already added are deleted again, so the request is all-or-nothing.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* each nexthop starts from the base config, then overrides
		 * ifindex/gateway/encap with its own rtnexthop attributes
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		/* rtnh_hops is carried off-by-one on the wire */
		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			/* NOTE(review): the raw pointer is kept after the
			 * release above — presumably safe because a successful
			 * insert leaves the FIB tree holding its own reference;
			 * confirm against __ip6_ins_rt()/fib6_add().
			 */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		/* NOTE(review): dereferences nlh unconditionally — assumes a
		 * netlink-originated request (nlh != NULL); verify no in-kernel
		 * caller reaches here with fc_mp set and nlh NULL.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* free list entries; any fib6_info still attached was never inserted */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4495
4496 static int ip6_route_multipath_del(struct fib6_config *cfg,
4497                                    struct netlink_ext_ack *extack)
4498 {
4499         struct fib6_config r_cfg;
4500         struct rtnexthop *rtnh;
4501         int remaining;
4502         int attrlen;
4503         int err = 1, last_err = 0;
4504
4505         remaining = cfg->fc_mp_len;
4506         rtnh = (struct rtnexthop *)cfg->fc_mp;
4507
4508         /* Parse a Multipath Entry */
4509         while (rtnh_ok(rtnh, remaining)) {
4510                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4511                 if (rtnh->rtnh_ifindex)
4512                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4513
4514                 attrlen = rtnh_attrlen(rtnh);
4515                 if (attrlen > 0) {
4516                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4517
4518                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4519                         if (nla) {
4520                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4521                                 r_cfg.fc_flags |= RTF_GATEWAY;
4522                         }
4523                 }
4524                 err = ip6_route_del(&r_cfg, extack);
4525                 if (err)
4526                         last_err = err;
4527
4528                 rtnh = rtnh_next(rtnh, &remaining);
4529         }
4530
4531         return last_err;
4532 }
4533
4534 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4535                               struct netlink_ext_ack *extack)
4536 {
4537         struct fib6_config cfg;
4538         int err;
4539
4540         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4541         if (err < 0)
4542                 return err;
4543
4544         if (cfg.fc_mp)
4545                 return ip6_route_multipath_del(&cfg, extack);
4546         else {
4547                 cfg.fc_delete_all_nh = 1;
4548                 return ip6_route_del(&cfg, extack);
4549         }
4550 }
4551
4552 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4553                               struct netlink_ext_ack *extack)
4554 {
4555         struct fib6_config cfg;
4556         int err;
4557
4558         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4559         if (err < 0)
4560                 return err;
4561
4562         if (cfg.fc_mp)
4563                 return ip6_route_multipath_add(&cfg, extack);
4564         else
4565                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4566 }
4567
/* Worst-case netlink message size for a RTM_NEWROUTE/RTM_DELROUTE
 * notification about @rt, used to size the skb in inet6_rt_notify().
 * For a multipath route, each sibling contributes one rtnexthop entry
 * inside RTA_MULTIPATH; the route's own nexthop data is covered by the
 * standalone RTA_GATEWAY/RTA_OIF/encap terms below.
 */
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		/* per-sibling cost inside RTA_MULTIPATH */
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}
4597
/* Fill in the nexthop part of a route dump for @rt: accumulate RTNH_F_*
 * flags into *@flags and put RTA_GATEWAY/RTA_OIF/encap attributes on
 * @skb.  @skip_oif is set by the multipath encoder, where the ifindex
 * already lives in the rtnexthop header.  Returns 0 or -EMSGSIZE.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		/* also report DEAD when linkdown nexthops are being ignored */
		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4636
4637 /* add multipath next hop */
/* Encode one rtnexthop entry for @rt inside an RTA_MULTIPATH attribute
 * that the caller has already opened on @skb.  Returns 0 or -EMSGSIZE.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	/* nh_weight is stored +1 relative to the wire format */
	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	/* skip_oif=true: the ifindex is already in the rtnexthop header */
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4664
/* Build one RTM_* netlink message describing a route.  @rt is the FIB
 * entry; @dst, when non-NULL, is a cached/cloned rt6_info whose keys and
 * flags take precedence over the FIB entry's.  @dest/@src, when given,
 * force full /128 addresses into the dump (used by route get).  @iif
 * non-zero marks an input-path lookup.  Returns 0 or -EMSGSIZE (message
 * cancelled on failure).
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* prefer the cached entry's keys/flags when one was supplied */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* an explicit @dest overrides the prefix with a host route */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are answered by the mroute code */
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* cached entries carry their own metrics; otherwise use the FIB's */
	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4815
4816 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4817 {
4818         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4819         struct net *net = arg->net;
4820
4821         if (rt == net->ipv6.fib6_null_entry)
4822                 return 0;
4823
4824         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4825                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4826
4827                 /* user wants prefix routes only */
4828                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4829                     !(rt->fib6_flags & RTF_PREFIX_RT)) {
4830                         /* success since this is not a prefix route */
4831                         return 1;
4832                 }
4833         }
4834
4835         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4836                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4837                              arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4838 }
4839
/* RTM_GETROUTE handler: build a flow from the request attributes, perform
 * an input (iif set) or output route lookup, and unicast the resulting
 * route back to the requester.  With RTM_F_FIB_MATCH, the matching FIB
 * entry is reported instead of the resolved dst.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, extack);
		if (err)
			goto errout;
	}

	if (iif) {
		/* input-path lookup: resolve as if the packet arrived on iif */
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	/* NOTE(review): separate check for the null entry even when
	 * dst.error is unset; intentionally mirrors the error path above.
	 */
	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* skb now owns the dst reference; freed with the skb on error */
	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
4980
/* Broadcast an RTM_NEWROUTE/RTM_DELROUTE notification for @rt to the
 * RTNLGRP_IPV6_ROUTE multicast group.  On allocation or fill failure
 * the error is reported to group listeners via rtnl_set_sk_err().
 */
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	/* gfp_any(): this can be called from softirq context */
	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
5011
/* Netdevice notifier: keep the special route entries (null, and with
 * multiple tables also prohibit/blackhole) bound to the loopback device
 * of each namespace as it is registered/unregistered.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	/* only the namespace's loopback device is of interest here */
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
5045
5046 /*
5047  *      /proc
5048  */
5049
5050 #ifdef CONFIG_PROC_FS
/* /proc/net/rt6_stats: one line of hex counters — fib nodes, route
 * nodes, allocated rt entries, route entries, cache entries, slow dst
 * count, discarded routes.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
5065 #endif  /* CONFIG_PROC_FS */
5066
5067 #ifdef CONFIG_SYSCTL
5068
5069 static
5070 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5071                               void __user *buffer, size_t *lenp, loff_t *ppos)
5072 {
5073         struct net *net;
5074         int delay;
5075         if (!write)
5076                 return -EINVAL;
5077
5078         net = (struct net *)ctl->extra1;
5079         delay = net->ipv6.sysctl.flush_delay;
5080         proc_dointvec(ctl, write, buffer, lenp, ppos);
5081         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5082         return 0;
5083 }
5084
/* Template for the per-netns net.ipv6.route sysctl table.  The .data
 * pointers reference init_net here; ipv6_route_sysctl_init() duplicates
 * the table and rewrites them for the target netns, so entry order must
 * stay in sync with the indices used there.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,	/* write-only trigger */
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* same variable as gc_min_interval, exposed in milliseconds */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
5158
/* Duplicate ipv6_route_table_template for a new netns and retarget each
 * entry's .data at that netns' own fields.  The numeric indices must
 * match the template's entry order.  Returns NULL on allocation failure
 * (caller handles it).
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;	/* flush handler derives the netns from extra1 */
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
5187 #endif
5188
/* Per-netns init for IPv6 routing: copy the dst_ops template, allocate
 * the special route entries (fib6/rt6 null, and with
 * CONFIG_IPV6_MULTIPLE_TABLES also prohibit and blackhole) from their
 * templates, and seed the gc/sysctl defaults.  Failures unwind the
 * already-allocated pieces in reverse order through the goto chain.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default route-cache / garbage-collection tunables. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwind: reverse allocation order. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5263
/* Per-netns teardown: free the special route entries allocated by
 * ip6_route_net_init() and release the dst entry counter.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
5274
5275 static int __net_init ip6_route_net_init_late(struct net *net)
5276 {
5277 #ifdef CONFIG_PROC_FS
5278         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5279                         sizeof(struct ipv6_route_iter));
5280         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5281                         rt6_stats_seq_show, NULL);
5282 #endif
5283         return 0;
5284 }
5285
/* Late per-netns teardown: remove the /proc/net entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5293
/* pernet ops for the core per-netns routing state (entries, dst_ops). */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5298
5299 static int __net_init ipv6_inetpeer_init(struct net *net)
5300 {
5301         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5302
5303         if (!bp)
5304                 return -ENOMEM;
5305         inet_peer_base_init(bp);
5306         net->ipv6.peers = bp;
5307         return 0;
5308 }
5309
5310 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5311 {
5312         struct inet_peer_base *bp = net->ipv6.peers;
5313
5314         net->ipv6.peers = NULL;
5315         inetpeer_invalidate_tree(bp);
5316         kfree(bp);
5317 }
5318
/* pernet ops for the per-netns inet_peer storage. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5323
/* pernet ops for the /proc entries, registered after the core state. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5328
/* Runs after addrconf's notifier (lower priority) so inet6_dev is set up
 * before the special entries grab references to it.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5333
/* Bind init_net's special route entries to its loopback device.  For
 * init_net the loopback registers before ip6_route_dev_notify() exists,
 * so the work that notifier does on NETDEV_REGISTER must be repeated
 * here by hand.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
5349
/* Boot-time init of the IPv6 routing subsystem: dst slab cache, pernet
 * subsystems, fib6, xfrm6, fib6 rules, rtnetlink route handlers, the
 * device notifier and the per-cpu uncached-route lists.  On error each
 * label undoes everything registered before it, in reverse order.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts share the rt6_info slab cache. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Per-cpu lists of uncached (DST_NOCACHE) routes. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwind: reverse registration order.  A partial rtnl
	 * registration is cleaned up by rtnl_unregister_all().
	 */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5441
/* Module unload: tear down everything ip6_route_init() set up, in the
 * exact reverse of its registration order.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}