nds32: fix build error "relocation truncated to fit: R_NDS32_25_PCREL_RELA" when
[linux-2.6-microblaze.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/* Result of the neighbour check done while scoring a route.
 * Every failure code is negative so callers can test "< 0".
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD  = -3,	/* find_match() discards the route */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour entry is in NUD_FAILED */
	RT6_NUD_FAIL_DO_RR = -1,	/* scored as 0, asks for round-robin */
	RT6_NUD_SUCCEED    = 1,
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108                          struct fib6_info *rt, struct dst_entry *dst,
109                          struct in6_addr *dest, struct in6_addr *src,
110                          int iif, int type, u32 portid, u32 seq,
111                          unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113                                            struct in6_addr *daddr,
114                                            struct in6_addr *saddr);
115
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Route Information option helpers, used by rt6_route_rcv(). */
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix,
					    int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix,
					    int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif
127
128 struct uncached_list {
129         spinlock_t              lock;
130         struct list_head        head;
131 };
132
133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134
135 void rt6_uncached_list_add(struct rt6_info *rt)
136 {
137         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
138
139         rt->rt6i_uncached_list = ul;
140
141         spin_lock_bh(&ul->lock);
142         list_add_tail(&rt->rt6i_uncached, &ul->head);
143         spin_unlock_bh(&ul->lock);
144 }
145
146 void rt6_uncached_list_del(struct rt6_info *rt)
147 {
148         if (!list_empty(&rt->rt6i_uncached)) {
149                 struct uncached_list *ul = rt->rt6i_uncached_list;
150                 struct net *net = dev_net(rt->dst.dev);
151
152                 spin_lock_bh(&ul->lock);
153                 list_del(&rt->rt6i_uncached);
154                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155                 spin_unlock_bh(&ul->lock);
156         }
157 }
158
159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
160 {
161         struct net_device *loopback_dev = net->loopback_dev;
162         int cpu;
163
164         if (dev == loopback_dev)
165                 return;
166
167         for_each_possible_cpu(cpu) {
168                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
169                 struct rt6_info *rt;
170
171                 spin_lock_bh(&ul->lock);
172                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
173                         struct inet6_dev *rt_idev = rt->rt6i_idev;
174                         struct net_device *rt_dev = rt->dst.dev;
175
176                         if (rt_idev->dev == dev) {
177                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
178                                 in6_dev_put(rt_idev);
179                         }
180
181                         if (rt_dev == dev) {
182                                 rt->dst.dev = loopback_dev;
183                                 dev_hold(rt->dst.dev);
184                                 dev_put(rt_dev);
185                         }
186                 }
187                 spin_unlock_bh(&ul->lock);
188         }
189 }
190
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         if (!ipv6_addr_any(p))
196                 return (const void *) p;
197         else if (skb)
198                 return &ipv6_hdr(skb)->daddr;
199         return daddr;
200 }
201
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203                                    struct net_device *dev,
204                                    struct sk_buff *skb,
205                                    const void *daddr)
206 {
207         struct neighbour *n;
208
209         daddr = choose_neigh_daddr(gw, skb, daddr);
210         n = __ipv6_neigh_lookup(dev, daddr);
211         if (n)
212                 return n;
213         return neigh_create(&nd_tbl, daddr, dev);
214 }
215
216 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
217                                               struct sk_buff *skb,
218                                               const void *daddr)
219 {
220         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
221
222         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
223 }
224
225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
226 {
227         struct net_device *dev = dst->dev;
228         struct rt6_info *rt = (struct rt6_info *)dst;
229
230         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
231         if (!daddr)
232                 return;
233         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
234                 return;
235         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
236                 return;
237         __ipv6_confirm_neigh(dev, daddr);
238 }
239
240 static struct dst_ops ip6_dst_ops_template = {
241         .family                 =       AF_INET6,
242         .gc                     =       ip6_dst_gc,
243         .gc_thresh              =       1024,
244         .check                  =       ip6_dst_check,
245         .default_advmss         =       ip6_default_advmss,
246         .mtu                    =       ip6_mtu,
247         .cow_metrics            =       dst_cow_metrics_generic,
248         .destroy                =       ip6_dst_destroy,
249         .ifdown                 =       ip6_dst_ifdown,
250         .negative_advice        =       ip6_negative_advice,
251         .link_failure           =       ip6_link_failure,
252         .update_pmtu            =       ip6_rt_update_pmtu,
253         .redirect               =       rt6_do_redirect,
254         .local_out              =       __ip6_local_out,
255         .neigh_lookup           =       ip6_dst_neigh_lookup,
256         .confirm_neigh          =       ip6_confirm_neigh,
257 };
258
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
260 {
261         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
262
263         return mtu ? : dst->dev->mtu;
264 }
265
266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
267                                          struct sk_buff *skb, u32 mtu)
268 {
269 }
270
/* redirect handler for blackhole dsts: intentionally a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
	/* redirects are ignored */
}
275
276 static struct dst_ops ip6_dst_blackhole_ops = {
277         .family                 =       AF_INET6,
278         .destroy                =       ip6_dst_destroy,
279         .check                  =       ip6_dst_check,
280         .mtu                    =       ip6_blackhole_mtu,
281         .default_advmss         =       ip6_default_advmss,
282         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
283         .redirect               =       ip6_rt_blackhole_redirect,
284         .cow_metrics            =       dst_cow_metrics_generic,
285         .neigh_lookup           =       ip6_dst_neigh_lookup,
286 };
287
288 static const u32 ip6_template_metrics[RTAX_MAX] = {
289         [RTAX_HOPLIMIT - 1] = 0,
290 };
291
292 static const struct fib6_info fib6_null_entry_template = {
293         .fib6_flags     = (RTF_REJECT | RTF_NONEXTHOP),
294         .fib6_protocol  = RTPROT_KERNEL,
295         .fib6_metric    = ~(u32)0,
296         .fib6_ref       = ATOMIC_INIT(1),
297         .fib6_type      = RTN_UNREACHABLE,
298         .fib6_metrics   = (struct dst_metrics *)&dst_default_metrics,
299 };
300
301 static const struct rt6_info ip6_null_entry_template = {
302         .dst = {
303                 .__refcnt       = ATOMIC_INIT(1),
304                 .__use          = 1,
305                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
306                 .error          = -ENETUNREACH,
307                 .input          = ip6_pkt_discard,
308                 .output         = ip6_pkt_discard_out,
309         },
310         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
311 };
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
315 static const struct rt6_info ip6_prohibit_entry_template = {
316         .dst = {
317                 .__refcnt       = ATOMIC_INIT(1),
318                 .__use          = 1,
319                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
320                 .error          = -EACCES,
321                 .input          = ip6_pkt_prohibit,
322                 .output         = ip6_pkt_prohibit_out,
323         },
324         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
325 };
326
327 static const struct rt6_info ip6_blk_hole_entry_template = {
328         .dst = {
329                 .__refcnt       = ATOMIC_INIT(1),
330                 .__use          = 1,
331                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
332                 .error          = -EINVAL,
333                 .input          = dst_discard,
334                 .output         = dst_discard_out,
335         },
336         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
337 };
338
339 #endif
340
341 static void rt6_info_init(struct rt6_info *rt)
342 {
343         struct dst_entry *dst = &rt->dst;
344
345         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
346         INIT_LIST_HEAD(&rt->rt6i_uncached);
347 }
348
349 /* allocate dst with ip6_dst_ops */
350 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
351                                int flags)
352 {
353         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
354                                         1, DST_OBSOLETE_FORCE_CHK, flags);
355
356         if (rt) {
357                 rt6_info_init(rt);
358                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
359         }
360
361         return rt;
362 }
363 EXPORT_SYMBOL(ip6_dst_alloc);
364
365 static void ip6_dst_destroy(struct dst_entry *dst)
366 {
367         struct rt6_info *rt = (struct rt6_info *)dst;
368         struct fib6_info *from;
369         struct inet6_dev *idev;
370
371         dst_destroy_metrics_generic(dst);
372         rt6_uncached_list_del(rt);
373
374         idev = rt->rt6i_idev;
375         if (idev) {
376                 rt->rt6i_idev = NULL;
377                 in6_dev_put(idev);
378         }
379
380         rcu_read_lock();
381         from = rcu_dereference(rt->from);
382         rcu_assign_pointer(rt->from, NULL);
383         fib6_info_release(from);
384         rcu_read_unlock();
385 }
386
387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
388                            int how)
389 {
390         struct rt6_info *rt = (struct rt6_info *)dst;
391         struct inet6_dev *idev = rt->rt6i_idev;
392         struct net_device *loopback_dev =
393                 dev_net(dev)->loopback_dev;
394
395         if (idev && idev->dev != loopback_dev) {
396                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
397                 if (loopback_idev) {
398                         rt->rt6i_idev = loopback_idev;
399                         in6_dev_put(idev);
400                 }
401         }
402 }
403
404 static bool __rt6_check_expired(const struct rt6_info *rt)
405 {
406         if (rt->rt6i_flags & RTF_EXPIRES)
407                 return time_after(jiffies, rt->dst.expires);
408         else
409                 return false;
410 }
411
412 static bool rt6_check_expired(const struct rt6_info *rt)
413 {
414         struct fib6_info *from;
415
416         from = rcu_dereference(rt->from);
417
418         if (rt->rt6i_flags & RTF_EXPIRES) {
419                 if (time_after(jiffies, rt->dst.expires))
420                         return true;
421         } else if (from) {
422                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
423                         fib6_check_expired(from);
424         }
425         return false;
426 }
427
428 struct fib6_info *fib6_multipath_select(const struct net *net,
429                                         struct fib6_info *match,
430                                         struct flowi6 *fl6, int oif,
431                                         const struct sk_buff *skb,
432                                         int strict)
433 {
434         struct fib6_info *sibling, *next_sibling;
435
436         /* We might have already computed the hash for ICMPv6 errors. In such
437          * case it will always be non-zero. Otherwise now is the time to do it.
438          */
439         if (!fl6->mp_hash)
440                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
441
442         if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
443                 return match;
444
445         list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
446                                  fib6_siblings) {
447                 int nh_upper_bound;
448
449                 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
450                 if (fl6->mp_hash > nh_upper_bound)
451                         continue;
452                 if (rt6_score_route(sibling, oif, strict) < 0)
453                         break;
454                 match = sibling;
455                 break;
456         }
457
458         return match;
459 }
460
461 /*
462  *      Route lookup. rcu_read_lock() should be held.
463  */
464
465 static inline struct fib6_info *rt6_device_match(struct net *net,
466                                                  struct fib6_info *rt,
467                                                     const struct in6_addr *saddr,
468                                                     int oif,
469                                                     int flags)
470 {
471         struct fib6_info *sprt;
472
473         if (!oif && ipv6_addr_any(saddr) &&
474             !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
475                 return rt;
476
477         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
478                 const struct net_device *dev = sprt->fib6_nh.nh_dev;
479
480                 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
481                         continue;
482
483                 if (oif) {
484                         if (dev->ifindex == oif)
485                                 return sprt;
486                 } else {
487                         if (ipv6_chk_addr(net, saddr, dev,
488                                           flags & RT6_LOOKUP_F_IFACE))
489                                 return sprt;
490                 }
491         }
492
493         if (oif && flags & RT6_LOOKUP_F_IFACE)
494                 return net->ipv6.fib6_null_entry;
495
496         return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
497 }
498
499 #ifdef CONFIG_IPV6_ROUTER_PREF
500 struct __rt6_probe_work {
501         struct work_struct work;
502         struct in6_addr target;
503         struct net_device *dev;
504 };
505
506 static void rt6_probe_deferred(struct work_struct *w)
507 {
508         struct in6_addr mcaddr;
509         struct __rt6_probe_work *work =
510                 container_of(w, struct __rt6_probe_work, work);
511
512         addrconf_addr_solict_mult(&work->target, &mcaddr);
513         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
514         dev_put(work->dev);
515         kfree(work);
516 }
517
518 static void rt6_probe(struct fib6_info *rt)
519 {
520         struct __rt6_probe_work *work;
521         const struct in6_addr *nh_gw;
522         struct neighbour *neigh;
523         struct net_device *dev;
524
525         /*
526          * Okay, this does not seem to be appropriate
527          * for now, however, we need to check if it
528          * is really so; aka Router Reachability Probing.
529          *
530          * Router Reachability Probe MUST be rate-limited
531          * to no more than one per minute.
532          */
533         if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
534                 return;
535
536         nh_gw = &rt->fib6_nh.nh_gw;
537         dev = rt->fib6_nh.nh_dev;
538         rcu_read_lock_bh();
539         neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
540         if (neigh) {
541                 struct inet6_dev *idev;
542
543                 if (neigh->nud_state & NUD_VALID)
544                         goto out;
545
546                 idev = __in6_dev_get(dev);
547                 work = NULL;
548                 write_lock(&neigh->lock);
549                 if (!(neigh->nud_state & NUD_VALID) &&
550                     time_after(jiffies,
551                                neigh->updated + idev->cnf.rtr_probe_interval)) {
552                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
553                         if (work)
554                                 __neigh_set_probe_once(neigh);
555                 }
556                 write_unlock(&neigh->lock);
557         } else {
558                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
559         }
560
561         if (work) {
562                 INIT_WORK(&work->work, rt6_probe_deferred);
563                 work->target = *nh_gw;
564                 dev_hold(dev);
565                 work->dev = dev;
566                 schedule_work(&work->work);
567         }
568
569 out:
570         rcu_read_unlock_bh();
571 }
572 #else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct fib6_info *rt)
{
	/* no-op */
}
576 #endif
577
578 /*
579  * Default Router Selection (RFC 2461 6.3.6)
580  */
581 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
582 {
583         const struct net_device *dev = rt->fib6_nh.nh_dev;
584
585         if (!oif || dev->ifindex == oif)
586                 return 2;
587         return 0;
588 }
589
590 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
591 {
592         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
593         struct neighbour *neigh;
594
595         if (rt->fib6_flags & RTF_NONEXTHOP ||
596             !(rt->fib6_flags & RTF_GATEWAY))
597                 return RT6_NUD_SUCCEED;
598
599         rcu_read_lock_bh();
600         neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
601                                           &rt->fib6_nh.nh_gw);
602         if (neigh) {
603                 read_lock(&neigh->lock);
604                 if (neigh->nud_state & NUD_VALID)
605                         ret = RT6_NUD_SUCCEED;
606 #ifdef CONFIG_IPV6_ROUTER_PREF
607                 else if (!(neigh->nud_state & NUD_FAILED))
608                         ret = RT6_NUD_SUCCEED;
609                 else
610                         ret = RT6_NUD_FAIL_PROBE;
611 #endif
612                 read_unlock(&neigh->lock);
613         } else {
614                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
615                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
616         }
617         rcu_read_unlock_bh();
618
619         return ret;
620 }
621
622 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
623 {
624         int m;
625
626         m = rt6_check_dev(rt, oif);
627         if (!m && (strict & RT6_LOOKUP_F_IFACE))
628                 return RT6_NUD_FAIL_HARD;
629 #ifdef CONFIG_IPV6_ROUTER_PREF
630         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
631 #endif
632         if (strict & RT6_LOOKUP_F_REACHABLE) {
633                 int n = rt6_check_neigh(rt);
634                 if (n < 0)
635                         return n;
636         }
637         return m;
638 }
639
640 /* called with rc_read_lock held */
641 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
642 {
643         const struct net_device *dev = fib6_info_nh_dev(f6i);
644         bool rc = false;
645
646         if (dev) {
647                 const struct inet6_dev *idev = __in6_dev_get(dev);
648
649                 rc = !!idev->cnf.ignore_routes_with_linkdown;
650         }
651
652         return rc;
653 }
654
655 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
656                                    int *mpri, struct fib6_info *match,
657                                    bool *do_rr)
658 {
659         int m;
660         bool match_do_rr = false;
661
662         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
663                 goto out;
664
665         if (fib6_ignore_linkdown(rt) &&
666             rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
667             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
668                 goto out;
669
670         if (fib6_check_expired(rt))
671                 goto out;
672
673         m = rt6_score_route(rt, oif, strict);
674         if (m == RT6_NUD_FAIL_DO_RR) {
675                 match_do_rr = true;
676                 m = 0; /* lowest valid score */
677         } else if (m == RT6_NUD_FAIL_HARD) {
678                 goto out;
679         }
680
681         if (strict & RT6_LOOKUP_F_REACHABLE)
682                 rt6_probe(rt);
683
684         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
685         if (m > *mpri) {
686                 *do_rr = match_do_rr;
687                 *mpri = m;
688                 match = rt;
689         }
690 out:
691         return match;
692 }
693
694 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
695                                      struct fib6_info *leaf,
696                                      struct fib6_info *rr_head,
697                                      u32 metric, int oif, int strict,
698                                      bool *do_rr)
699 {
700         struct fib6_info *rt, *match, *cont;
701         int mpri = -1;
702
703         match = NULL;
704         cont = NULL;
705         for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
706                 if (rt->fib6_metric != metric) {
707                         cont = rt;
708                         break;
709                 }
710
711                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
712         }
713
714         for (rt = leaf; rt && rt != rr_head;
715              rt = rcu_dereference(rt->fib6_next)) {
716                 if (rt->fib6_metric != metric) {
717                         cont = rt;
718                         break;
719                 }
720
721                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
722         }
723
724         if (match || !cont)
725                 return match;
726
727         for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
728                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
729
730         return match;
731 }
732
733 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
734                                    int oif, int strict)
735 {
736         struct fib6_info *leaf = rcu_dereference(fn->leaf);
737         struct fib6_info *match, *rt0;
738         bool do_rr = false;
739         int key_plen;
740
741         if (!leaf || leaf == net->ipv6.fib6_null_entry)
742                 return net->ipv6.fib6_null_entry;
743
744         rt0 = rcu_dereference(fn->rr_ptr);
745         if (!rt0)
746                 rt0 = leaf;
747
748         /* Double check to make sure fn is not an intermediate node
749          * and fn->leaf does not points to its child's leaf
750          * (This might happen if all routes under fn are deleted from
751          * the tree and fib6_repair_tree() is called on the node.)
752          */
753         key_plen = rt0->fib6_dst.plen;
754 #ifdef CONFIG_IPV6_SUBTREES
755         if (rt0->fib6_src.plen)
756                 key_plen = rt0->fib6_src.plen;
757 #endif
758         if (fn->fn_bit != key_plen)
759                 return net->ipv6.fib6_null_entry;
760
761         match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
762                              &do_rr);
763
764         if (do_rr) {
765                 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
766
767                 /* no entries matched; do round-robin */
768                 if (!next || next->fib6_metric != rt0->fib6_metric)
769                         next = leaf;
770
771                 if (next != rt0) {
772                         spin_lock_bh(&leaf->fib6_table->tb6_lock);
773                         /* make sure next is not being deleted from the tree */
774                         if (next->fib6_node)
775                                 rcu_assign_pointer(fn->rr_ptr, next);
776                         spin_unlock_bh(&leaf->fib6_table->tb6_lock);
777                 }
778         }
779
780         return match ? match : net->ipv6.fib6_null_entry;
781 }
782
783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
784 {
785         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
786 }
787
788 #ifdef CONFIG_IPV6_ROUTE_INFO
789 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
790                   const struct in6_addr *gwaddr)
791 {
792         struct net *net = dev_net(dev);
793         struct route_info *rinfo = (struct route_info *) opt;
794         struct in6_addr prefix_buf, *prefix;
795         unsigned int pref;
796         unsigned long lifetime;
797         struct fib6_info *rt;
798
799         if (len < sizeof(struct route_info)) {
800                 return -EINVAL;
801         }
802
803         /* Sanity check for prefix_len and length */
804         if (rinfo->length > 3) {
805                 return -EINVAL;
806         } else if (rinfo->prefix_len > 128) {
807                 return -EINVAL;
808         } else if (rinfo->prefix_len > 64) {
809                 if (rinfo->length < 2) {
810                         return -EINVAL;
811                 }
812         } else if (rinfo->prefix_len > 0) {
813                 if (rinfo->length < 1) {
814                         return -EINVAL;
815                 }
816         }
817
818         pref = rinfo->route_pref;
819         if (pref == ICMPV6_ROUTER_PREF_INVALID)
820                 return -EINVAL;
821
822         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
823
824         if (rinfo->length == 3)
825                 prefix = (struct in6_addr *)rinfo->prefix;
826         else {
827                 /* this function is safe */
828                 ipv6_addr_prefix(&prefix_buf,
829                                  (struct in6_addr *)rinfo->prefix,
830                                  rinfo->prefix_len);
831                 prefix = &prefix_buf;
832         }
833
834         if (rinfo->prefix_len == 0)
835                 rt = rt6_get_dflt_router(net, gwaddr, dev);
836         else
837                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
838                                         gwaddr, dev);
839
840         if (rt && !lifetime) {
841                 ip6_del_rt(net, rt);
842                 rt = NULL;
843         }
844
845         if (!rt && lifetime)
846                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
847                                         dev, pref);
848         else if (rt)
849                 rt->fib6_flags = RTF_ROUTEINFO |
850                                  (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
851
852         if (rt) {
853                 if (!addrconf_finite_timeout(lifetime))
854                         fib6_clean_expires(rt);
855                 else
856                         fib6_set_expires(rt, jiffies + HZ * lifetime);
857
858                 fib6_info_release(rt);
859         }
860         return 0;
861 }
862 #endif
863
864 /*
865  *      Misc support functions
866  */
867
868 /* called with rcu_lock held */
869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
870 {
871         struct net_device *dev = rt->fib6_nh.nh_dev;
872
873         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
874                 /* for copies of local routes, dst->dev needs to be the
875                  * device if it is a master device, the master device if
876                  * device is enslaved, and the loopback as the default
877                  */
878                 if (netif_is_l3_slave(dev) &&
879                     !rt6_need_strict(&rt->fib6_dst.addr))
880                         dev = l3mdev_master_dev_rcu(dev);
881                 else if (!netif_is_l3_master(dev))
882                         dev = dev_net(dev)->loopback_dev;
883                 /* last case is netif_is_l3_master(dev) is true in which
884                  * case we want dev returned to be dev
885                  */
886         }
887
888         return dev;
889 }
890
891 static const int fib6_prop[RTN_MAX + 1] = {
892         [RTN_UNSPEC]    = 0,
893         [RTN_UNICAST]   = 0,
894         [RTN_LOCAL]     = 0,
895         [RTN_BROADCAST] = 0,
896         [RTN_ANYCAST]   = 0,
897         [RTN_MULTICAST] = 0,
898         [RTN_BLACKHOLE] = -EINVAL,
899         [RTN_UNREACHABLE] = -EHOSTUNREACH,
900         [RTN_PROHIBIT]  = -EACCES,
901         [RTN_THROW]     = -EAGAIN,
902         [RTN_NAT]       = -EINVAL,
903         [RTN_XRESOLVE]  = -EINVAL,
904 };
905
906 static int ip6_rt_type_to_error(u8 fib6_type)
907 {
908         return fib6_prop[fib6_type];
909 }
910
911 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
912 {
913         unsigned short flags = 0;
914
915         if (rt->dst_nocount)
916                 flags |= DST_NOCOUNT;
917         if (rt->dst_nopolicy)
918                 flags |= DST_NOPOLICY;
919         if (rt->dst_host)
920                 flags |= DST_HOST;
921
922         return flags;
923 }
924
925 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
926 {
927         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
928
929         switch (ort->fib6_type) {
930         case RTN_BLACKHOLE:
931                 rt->dst.output = dst_discard_out;
932                 rt->dst.input = dst_discard;
933                 break;
934         case RTN_PROHIBIT:
935                 rt->dst.output = ip6_pkt_prohibit_out;
936                 rt->dst.input = ip6_pkt_prohibit;
937                 break;
938         case RTN_THROW:
939         case RTN_UNREACHABLE:
940         default:
941                 rt->dst.output = ip6_pkt_discard_out;
942                 rt->dst.input = ip6_pkt_discard;
943                 break;
944         }
945 }
946
947 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
948 {
949         rt->dst.flags |= fib6_info_dst_flags(ort);
950
951         if (ort->fib6_flags & RTF_REJECT) {
952                 ip6_rt_init_dst_reject(rt, ort);
953                 return;
954         }
955
956         rt->dst.error = 0;
957         rt->dst.output = ip6_output;
958
959         if (ort->fib6_type == RTN_LOCAL) {
960                 rt->dst.input = ip6_input;
961         } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
962                 rt->dst.input = ip6_mc_input;
963         } else {
964                 rt->dst.input = ip6_forward;
965         }
966
967         if (ort->fib6_nh.nh_lwtstate) {
968                 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
969                 lwtunnel_set_redirect(&rt->dst);
970         }
971
972         rt->dst.lastuse = jiffies;
973 }
974
975 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
976 {
977         rt->rt6i_flags &= ~RTF_EXPIRES;
978         fib6_info_hold(from);
979         rcu_assign_pointer(rt->from, from);
980         dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
981         if (from->fib6_metrics != &dst_default_metrics) {
982                 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
983                 refcount_inc(&from->fib6_metrics->refcnt);
984         }
985 }
986
987 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
988 {
989         struct net_device *dev = fib6_info_nh_dev(ort);
990
991         ip6_rt_init_dst(rt, ort);
992
993         rt->rt6i_dst = ort->fib6_dst;
994         rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
995         rt->rt6i_gateway = ort->fib6_nh.nh_gw;
996         rt->rt6i_flags = ort->fib6_flags;
997         rt6_set_from(rt, ort);
998 #ifdef CONFIG_IPV6_SUBTREES
999         rt->rt6i_src = ort->fib6_src;
1000 #endif
1001         rt->rt6i_prefsrc = ort->fib6_prefsrc;
1002         rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
1003 }
1004
1005 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1006                                         struct in6_addr *saddr)
1007 {
1008         struct fib6_node *pn, *sn;
1009         while (1) {
1010                 if (fn->fn_flags & RTN_TL_ROOT)
1011                         return NULL;
1012                 pn = rcu_dereference(fn->parent);
1013                 sn = FIB6_SUBTREE(pn);
1014                 if (sn && sn != fn)
1015                         fn = fib6_node_lookup(sn, NULL, saddr);
1016                 else
1017                         fn = pn;
1018                 if (fn->fn_flags & RTN_RTINFO)
1019                         return fn;
1020         }
1021 }
1022
1023 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1024                           bool null_fallback)
1025 {
1026         struct rt6_info *rt = *prt;
1027
1028         if (dst_hold_safe(&rt->dst))
1029                 return true;
1030         if (null_fallback) {
1031                 rt = net->ipv6.ip6_null_entry;
1032                 dst_hold(&rt->dst);
1033         } else {
1034                 rt = NULL;
1035         }
1036         *prt = rt;
1037         return false;
1038 }
1039
1040 /* called with rcu_lock held */
1041 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1042 {
1043         unsigned short flags = fib6_info_dst_flags(rt);
1044         struct net_device *dev = rt->fib6_nh.nh_dev;
1045         struct rt6_info *nrt;
1046
1047         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1048         if (nrt)
1049                 ip6_rt_copy_init(nrt, rt);
1050
1051         return nrt;
1052 }
1053
/* Look up @fl6 in @table and return a route for it.
 *
 * The FIB node matching daddr/saddr is located first; if no usable
 * entry is found there the lookup backtracks up the tree and retries.
 * The result is, in order of preference: a matching cached exception
 * route, a new copy of the FIB entry, or the null entry of @net.
 * A reference is taken on the cached and null entries before they are
 * returned.  The whole lookup runs under rcu_read_lock().
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags)
{
        struct fib6_info *f6i;
        struct fib6_node *fn;
        struct rt6_info *rt;

        /* The flow asked not to match on the nexthop interface. */
        if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
                flags &= ~RT6_LOOKUP_F_IFACE;

        rcu_read_lock();
        fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
        f6i = rcu_dereference(fn->leaf);
        if (!f6i) {
                f6i = net->ipv6.fib6_null_entry;
        } else {
                f6i = rt6_device_match(net, f6i, &fl6->saddr,
                                      fl6->flowi6_oif, flags);
                /* Multipath selection only when no oif was requested. */
                if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
                        f6i = fib6_multipath_select(net, f6i, fl6,
                                                    fl6->flowi6_oif, skb,
                                                    flags);
        }
        /* Nothing usable in this node: try the next candidate node. */
        if (f6i == net->ipv6.fib6_null_entry) {
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto restart;
        }

        trace_fib6_table_lookup(net, f6i, table, fl6);

        /* Search through exception table */
        rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
        if (rt) {
                /* On failure ip6_hold_safe() swaps in the null entry. */
                if (ip6_hold_safe(net, &rt, true))
                        dst_use_noref(&rt->dst, jiffies);
        } else if (f6i == net->ipv6.fib6_null_entry) {
                rt = net->ipv6.ip6_null_entry;
                dst_hold(&rt->dst);
        } else {
                rt = ip6_create_rt_rcu(f6i);
                if (!rt) {
                        /* Allocation failed: fall back to the null entry. */
                        rt = net->ipv6.ip6_null_entry;
                        dst_hold(&rt->dst);
                }
        }

        rcu_read_unlock();

        return rt;
}
1109
1110 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1111                                    const struct sk_buff *skb, int flags)
1112 {
1113         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1114 }
1115 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1116
1117 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1118                             const struct in6_addr *saddr, int oif,
1119                             const struct sk_buff *skb, int strict)
1120 {
1121         struct flowi6 fl6 = {
1122                 .flowi6_oif = oif,
1123                 .daddr = *daddr,
1124         };
1125         struct dst_entry *dst;
1126         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1127
1128         if (saddr) {
1129                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1130                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1131         }
1132
1133         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1134         if (dst->error == 0)
1135                 return (struct rt6_info *) dst;
1136
1137         dst_release(dst);
1138
1139         return NULL;
1140 }
1141 EXPORT_SYMBOL(rt6_lookup);
1142
1143 /* ip6_ins_rt is called with FREE table->tb6_lock.
1144  * It takes new route entry, the addition fails by any reason the
1145  * route is released.
1146  * Caller must hold dst before calling it.
1147  */
1148
1149 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1150                         struct netlink_ext_ack *extack)
1151 {
1152         int err;
1153         struct fib6_table *table;
1154
1155         table = rt->fib6_table;
1156         spin_lock_bh(&table->tb6_lock);
1157         err = fib6_add(&table->tb6_root, rt, info, extack);
1158         spin_unlock_bh(&table->tb6_lock);
1159
1160         return err;
1161 }
1162
1163 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1164 {
1165         struct nl_info info = { .nl_net = net, };
1166
1167         return __ip6_ins_rt(rt, &info, NULL);
1168 }
1169
1170 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1171                                            const struct in6_addr *daddr,
1172                                            const struct in6_addr *saddr)
1173 {
1174         struct net_device *dev;
1175         struct rt6_info *rt;
1176
1177         /*
1178          *      Clone the route.
1179          */
1180
1181         dev = ip6_rt_get_dev_rcu(ort);
1182         rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1183         if (!rt)
1184                 return NULL;
1185
1186         ip6_rt_copy_init(rt, ort);
1187         rt->rt6i_flags |= RTF_CACHE;
1188         rt->dst.flags |= DST_HOST;
1189         rt->rt6i_dst.addr = *daddr;
1190         rt->rt6i_dst.plen = 128;
1191
1192         if (!rt6_is_gw_or_nonexthop(ort)) {
1193                 if (ort->fib6_dst.plen != 128 &&
1194                     ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1195                         rt->rt6i_flags |= RTF_ANYCAST;
1196 #ifdef CONFIG_IPV6_SUBTREES
1197                 if (rt->rt6i_src.plen && saddr) {
1198                         rt->rt6i_src.addr = *saddr;
1199                         rt->rt6i_src.plen = 128;
1200                 }
1201 #endif
1202         }
1203
1204         return rt;
1205 }
1206
1207 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1208 {
1209         unsigned short flags = fib6_info_dst_flags(rt);
1210         struct net_device *dev;
1211         struct rt6_info *pcpu_rt;
1212
1213         rcu_read_lock();
1214         dev = ip6_rt_get_dev_rcu(rt);
1215         pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1216         rcu_read_unlock();
1217         if (!pcpu_rt)
1218                 return NULL;
1219         ip6_rt_copy_init(pcpu_rt, rt);
1220         pcpu_rt->rt6i_flags |= RTF_PCPU;
1221         return pcpu_rt;
1222 }
1223
1224 /* It should be called with rcu_read_lock() acquired */
1225 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1226 {
1227         struct rt6_info *pcpu_rt, **p;
1228
1229         p = this_cpu_ptr(rt->rt6i_pcpu);
1230         pcpu_rt = *p;
1231
1232         if (pcpu_rt)
1233                 ip6_hold_safe(NULL, &pcpu_rt, false);
1234
1235         return pcpu_rt;
1236 }
1237
1238 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1239                                             struct fib6_info *rt)
1240 {
1241         struct rt6_info *pcpu_rt, *prev, **p;
1242
1243         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1244         if (!pcpu_rt) {
1245                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1246                 return net->ipv6.ip6_null_entry;
1247         }
1248
1249         dst_hold(&pcpu_rt->dst);
1250         p = this_cpu_ptr(rt->rt6i_pcpu);
1251         prev = cmpxchg(p, NULL, pcpu_rt);
1252         BUG_ON(prev);
1253
1254         return pcpu_rt;
1255 }
1256
/* Exception hash table implementation.
 *
 * rt6_exception_lock is taken with spin_lock_bh() by the code below
 * that inserts, removes or flushes exception entries.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1260
1261 /* Remove rt6_ex from hash table and free the memory
1262  * Caller must hold rt6_exception_lock
1263  */
1264 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1265                                  struct rt6_exception *rt6_ex)
1266 {
1267         struct net *net;
1268
1269         if (!bucket || !rt6_ex)
1270                 return;
1271
1272         net = dev_net(rt6_ex->rt6i->dst.dev);
1273         hlist_del_rcu(&rt6_ex->hlist);
1274         dst_release(&rt6_ex->rt6i->dst);
1275         kfree_rcu(rt6_ex, rcu);
1276         WARN_ON_ONCE(!bucket->depth);
1277         bucket->depth--;
1278         net->ipv6.rt6_stats->fib_rt_cache--;
1279 }
1280
1281 /* Remove oldest rt6_ex in bucket and free the memory
1282  * Caller must hold rt6_exception_lock
1283  */
1284 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1285 {
1286         struct rt6_exception *rt6_ex, *oldest = NULL;
1287
1288         if (!bucket)
1289                 return;
1290
1291         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1292                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1293                         oldest = rt6_ex;
1294         }
1295         rt6_remove_exception(bucket, oldest);
1296 }
1297
1298 static u32 rt6_exception_hash(const struct in6_addr *dst,
1299                               const struct in6_addr *src)
1300 {
1301         static u32 seed __read_mostly;
1302         u32 val;
1303
1304         net_get_random_once(&seed, sizeof(seed));
1305         val = jhash(dst, sizeof(*dst), seed);
1306
1307 #ifdef CONFIG_IPV6_SUBTREES
1308         if (src)
1309                 val = jhash(src, sizeof(*src), val);
1310 #endif
1311         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1312 }
1313
1314 /* Helper function to find the cached rt in the hash table
1315  * and update bucket pointer to point to the bucket for this
1316  * (daddr, saddr) pair
1317  * Caller must hold rt6_exception_lock
1318  */
1319 static struct rt6_exception *
1320 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1321                               const struct in6_addr *daddr,
1322                               const struct in6_addr *saddr)
1323 {
1324         struct rt6_exception *rt6_ex;
1325         u32 hval;
1326
1327         if (!(*bucket) || !daddr)
1328                 return NULL;
1329
1330         hval = rt6_exception_hash(daddr, saddr);
1331         *bucket += hval;
1332
1333         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1334                 struct rt6_info *rt6 = rt6_ex->rt6i;
1335                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1336
1337 #ifdef CONFIG_IPV6_SUBTREES
1338                 if (matched && saddr)
1339                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1340 #endif
1341                 if (matched)
1342                         return rt6_ex;
1343         }
1344         return NULL;
1345 }
1346
1347 /* Helper function to find the cached rt in the hash table
1348  * and update bucket pointer to point to the bucket for this
1349  * (daddr, saddr) pair
1350  * Caller must hold rcu_read_lock()
1351  */
1352 static struct rt6_exception *
1353 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1354                          const struct in6_addr *daddr,
1355                          const struct in6_addr *saddr)
1356 {
1357         struct rt6_exception *rt6_ex;
1358         u32 hval;
1359
1360         WARN_ON_ONCE(!rcu_read_lock_held());
1361
1362         if (!(*bucket) || !daddr)
1363                 return NULL;
1364
1365         hval = rt6_exception_hash(daddr, saddr);
1366         *bucket += hval;
1367
1368         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1369                 struct rt6_info *rt6 = rt6_ex->rt6i;
1370                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1371
1372 #ifdef CONFIG_IPV6_SUBTREES
1373                 if (matched && saddr)
1374                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1375 #endif
1376                 if (matched)
1377                         return rt6_ex;
1378         }
1379         return NULL;
1380 }
1381
1382 static unsigned int fib6_mtu(const struct fib6_info *rt)
1383 {
1384         unsigned int mtu;
1385
1386         if (rt->fib6_pmtu) {
1387                 mtu = rt->fib6_pmtu;
1388         } else {
1389                 struct net_device *dev = fib6_info_nh_dev(rt);
1390                 struct inet6_dev *idev;
1391
1392                 rcu_read_lock();
1393                 idev = __in6_dev_get(dev);
1394                 mtu = idev->cnf.mtu6;
1395                 rcu_read_unlock();
1396         }
1397
1398         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1399
1400         return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1401 }
1402
/* Insert the cached route @nrt into the exception table of the FIB
 * entry @ort, allocating the table on first use.
 *
 * An existing exception for the same key is replaced, and the oldest
 * entry of the bucket is evicted once the bucket grows beyond
 * FIB6_MAX_DEPTH.  On success the sernum of @ort is updated under the
 * table lock and garbage collection is kicked.
 *
 * Returns 0 on success, -EINVAL when the table of @ort has been
 * flushed or when the MTU of @nrt is not lower than that of @ort, and
 * -ENOMEM when an allocation fails.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
                                struct fib6_info *ort)
{
        struct net *net = dev_net(nrt->dst.dev);
        struct rt6_exception_bucket *bucket;
        struct in6_addr *src_key = NULL;
        struct rt6_exception *rt6_ex;
        int err = 0;

        spin_lock_bh(&rt6_exception_lock);

        /* Set by rt6_flush_exceptions(): do not recreate the table. */
        if (ort->exception_bucket_flushed) {
                err = -EINVAL;
                goto out;
        }

        bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
                                        lockdep_is_held(&rt6_exception_lock));
        if (!bucket) {
                /* First exception for this entry: allocate the table. */
                bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
                                 GFP_ATOMIC);
                if (!bucket) {
                        err = -ENOMEM;
                        goto out;
                }
                rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
        }

#ifdef CONFIG_IPV6_SUBTREES
        /* rt6i_src.plen != 0 indicates ort is in subtree
         * and exception table is indexed by a hash of
         * both rt6i_dst and rt6i_src.
         * Otherwise, the exception table is indexed by
         * a hash of only rt6i_dst.
         */
        if (ort->fib6_src.plen)
                src_key = &nrt->rt6i_src.addr;
#endif

        /* Update rt6i_prefsrc as it could be changed
         * in rt6_remove_prefsrc()
         */
        nrt->rt6i_prefsrc = ort->fib6_prefsrc;
        /* rt6_mtu_change() might lower mtu on ort.
         * Only insert this exception route if its mtu
         * is less than ort's mtu value.
         */
        if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
                err = -EINVAL;
                goto out;
        }

        /* The lookup also moves 'bucket' to the bucket for this key. */
        rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
                                               src_key);
        if (rt6_ex)
                rt6_remove_exception(bucket, rt6_ex);

        rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
        if (!rt6_ex) {
                err = -ENOMEM;
                goto out;
        }
        rt6_ex->rt6i = nrt;
        rt6_ex->stamp = jiffies;
        hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
        bucket->depth++;
        net->ipv6.rt6_stats->fib_rt_cache++;

        /* Keep the chain bounded by evicting the oldest entry. */
        if (bucket->depth > FIB6_MAX_DEPTH)
                rt6_exception_remove_oldest(bucket);

out:
        spin_unlock_bh(&rt6_exception_lock);

        /* Update fn->fn_sernum to invalidate all cached dst */
        if (!err) {
                spin_lock_bh(&ort->fib6_table->tb6_lock);
                fib6_update_sernum(net, ort);
                spin_unlock_bh(&ort->fib6_table->tb6_lock);
                fib6_force_start_gc(net);
        }

        return err;
}
1487
1488 void rt6_flush_exceptions(struct fib6_info *rt)
1489 {
1490         struct rt6_exception_bucket *bucket;
1491         struct rt6_exception *rt6_ex;
1492         struct hlist_node *tmp;
1493         int i;
1494
1495         spin_lock_bh(&rt6_exception_lock);
1496         /* Prevent rt6_insert_exception() to recreate the bucket list */
1497         rt->exception_bucket_flushed = 1;
1498
1499         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1500                                     lockdep_is_held(&rt6_exception_lock));
1501         if (!bucket)
1502                 goto out;
1503
1504         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1505                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1506                         rt6_remove_exception(bucket, rt6_ex);
1507                 WARN_ON_ONCE(bucket->depth);
1508                 bucket++;
1509         }
1510
1511 out:
1512         spin_unlock_bh(&rt6_exception_lock);
1513 }
1514
1515 /* Find cached rt in the hash table inside passed in rt
1516  * Caller has to hold rcu_read_lock()
1517  */
1518 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1519                                            struct in6_addr *daddr,
1520                                            struct in6_addr *saddr)
1521 {
1522         struct rt6_exception_bucket *bucket;
1523         struct in6_addr *src_key = NULL;
1524         struct rt6_exception *rt6_ex;
1525         struct rt6_info *res = NULL;
1526
1527         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1528
1529 #ifdef CONFIG_IPV6_SUBTREES
1530         /* rt6i_src.plen != 0 indicates rt is in subtree
1531          * and exception table is indexed by a hash of
1532          * both rt6i_dst and rt6i_src.
1533          * Otherwise, the exception table is indexed by
1534          * a hash of only rt6i_dst.
1535          */
1536         if (rt->fib6_src.plen)
1537                 src_key = saddr;
1538 #endif
1539         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1540
1541         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1542                 res = rt6_ex->rt6i;
1543
1544         return res;
1545 }
1546
1547 /* Remove the passed in cached rt from the hash table that contains it */
1548 static int rt6_remove_exception_rt(struct rt6_info *rt)
1549 {
1550         struct rt6_exception_bucket *bucket;
1551         struct in6_addr *src_key = NULL;
1552         struct rt6_exception *rt6_ex;
1553         struct fib6_info *from;
1554         int err;
1555
1556         from = rcu_dereference(rt->from);
1557         if (!from ||
1558             !(rt->rt6i_flags & RTF_CACHE))
1559                 return -EINVAL;
1560
1561         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1562                 return -ENOENT;
1563
1564         spin_lock_bh(&rt6_exception_lock);
1565         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1566                                     lockdep_is_held(&rt6_exception_lock));
1567 #ifdef CONFIG_IPV6_SUBTREES
1568         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1569          * and exception table is indexed by a hash of
1570          * both rt6i_dst and rt6i_src.
1571          * Otherwise, the exception table is indexed by
1572          * a hash of only rt6i_dst.
1573          */
1574         if (from->fib6_src.plen)
1575                 src_key = &rt->rt6i_src.addr;
1576 #endif
1577         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1578                                                &rt->rt6i_dst.addr,
1579                                                src_key);
1580         if (rt6_ex) {
1581                 rt6_remove_exception(bucket, rt6_ex);
1582                 err = 0;
1583         } else {
1584                 err = -ENOENT;
1585         }
1586
1587         spin_unlock_bh(&rt6_exception_lock);
1588         return err;
1589 }
1590
1591 /* Find rt6_ex which contains the passed in rt cache and
1592  * refresh its stamp
1593  */
1594 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1595 {
1596         struct rt6_exception_bucket *bucket;
1597         struct fib6_info *from = rt->from;
1598         struct in6_addr *src_key = NULL;
1599         struct rt6_exception *rt6_ex;
1600
1601         if (!from ||
1602             !(rt->rt6i_flags & RTF_CACHE))
1603                 return;
1604
1605         rcu_read_lock();
1606         bucket = rcu_dereference(from->rt6i_exception_bucket);
1607
1608 #ifdef CONFIG_IPV6_SUBTREES
1609         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1610          * and exception table is indexed by a hash of
1611          * both rt6i_dst and rt6i_src.
1612          * Otherwise, the exception table is indexed by
1613          * a hash of only rt6i_dst.
1614          */
1615         if (from->fib6_src.plen)
1616                 src_key = &rt->rt6i_src.addr;
1617 #endif
1618         rt6_ex = __rt6_find_exception_rcu(&bucket,
1619                                           &rt->rt6i_dst.addr,
1620                                           src_key);
1621         if (rt6_ex)
1622                 rt6_ex->stamp = jiffies;
1623
1624         rcu_read_unlock();
1625 }
1626
1627 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1628 {
1629         struct rt6_exception_bucket *bucket;
1630         struct rt6_exception *rt6_ex;
1631         int i;
1632
1633         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1634                                         lockdep_is_held(&rt6_exception_lock));
1635
1636         if (bucket) {
1637                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1638                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1639                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1640                         }
1641                         bucket++;
1642                 }
1643         }
1644 }
1645
1646 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1647                                          struct rt6_info *rt, int mtu)
1648 {
1649         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1650          * lowest MTU in the path: always allow updating the route PMTU to
1651          * reflect PMTU decreases.
1652          *
1653          * If the new MTU is higher, and the route PMTU is equal to the local
1654          * MTU, this means the old MTU is the lowest in the path, so allow
1655          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1656          * handle this.
1657          */
1658
1659         if (dst_mtu(&rt->dst) >= mtu)
1660                 return true;
1661
1662         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1663                 return true;
1664
1665         return false;
1666 }
1667
1668 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1669                                        struct fib6_info *rt, int mtu)
1670 {
1671         struct rt6_exception_bucket *bucket;
1672         struct rt6_exception *rt6_ex;
1673         int i;
1674
1675         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1676                                         lockdep_is_held(&rt6_exception_lock));
1677
1678         if (!bucket)
1679                 return;
1680
1681         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1682                 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1683                         struct rt6_info *entry = rt6_ex->rt6i;
1684
1685                         /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1686                          * route), the metrics of its rt->from have already
1687                          * been updated.
1688                          */
1689                         if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1690                             rt6_mtu_change_route_allowed(idev, entry, mtu))
1691                                 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1692                 }
1693                 bucket++;
1694         }
1695 }
1696
1697 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1698
1699 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1700                                         struct in6_addr *gateway)
1701 {
1702         struct rt6_exception_bucket *bucket;
1703         struct rt6_exception *rt6_ex;
1704         struct hlist_node *tmp;
1705         int i;
1706
1707         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1708                 return;
1709
1710         spin_lock_bh(&rt6_exception_lock);
1711         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1712                                      lockdep_is_held(&rt6_exception_lock));
1713
1714         if (bucket) {
1715                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1716                         hlist_for_each_entry_safe(rt6_ex, tmp,
1717                                                   &bucket->chain, hlist) {
1718                                 struct rt6_info *entry = rt6_ex->rt6i;
1719
1720                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1721                                     RTF_CACHE_GATEWAY &&
1722                                     ipv6_addr_equal(gateway,
1723                                                     &entry->rt6i_gateway)) {
1724                                         rt6_remove_exception(bucket, rt6_ex);
1725                                 }
1726                         }
1727                         bucket++;
1728                 }
1729         }
1730
1731         spin_unlock_bh(&rt6_exception_lock);
1732 }
1733
1734 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1735                                       struct rt6_exception *rt6_ex,
1736                                       struct fib6_gc_args *gc_args,
1737                                       unsigned long now)
1738 {
1739         struct rt6_info *rt = rt6_ex->rt6i;
1740
1741         /* we are pruning and obsoleting aged-out and non gateway exceptions
1742          * even if others have still references to them, so that on next
1743          * dst_check() such references can be dropped.
1744          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1745          * expired, independently from their aging, as per RFC 8201 section 4
1746          */
1747         if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1748                 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1749                         RT6_TRACE("aging clone %p\n", rt);
1750                         rt6_remove_exception(bucket, rt6_ex);
1751                         return;
1752                 }
1753         } else if (time_after(jiffies, rt->dst.expires)) {
1754                 RT6_TRACE("purging expired route %p\n", rt);
1755                 rt6_remove_exception(bucket, rt6_ex);
1756                 return;
1757         }
1758
1759         if (rt->rt6i_flags & RTF_GATEWAY) {
1760                 struct neighbour *neigh;
1761                 __u8 neigh_flags = 0;
1762
1763                 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1764                 if (neigh)
1765                         neigh_flags = neigh->flags;
1766
1767                 if (!(neigh_flags & NTF_ROUTER)) {
1768                         RT6_TRACE("purging route %p via non-router but gateway\n",
1769                                   rt);
1770                         rt6_remove_exception(bucket, rt6_ex);
1771                         return;
1772                 }
1773         }
1774
1775         gc_args->more++;
1776 }
1777
1778 void rt6_age_exceptions(struct fib6_info *rt,
1779                         struct fib6_gc_args *gc_args,
1780                         unsigned long now)
1781 {
1782         struct rt6_exception_bucket *bucket;
1783         struct rt6_exception *rt6_ex;
1784         struct hlist_node *tmp;
1785         int i;
1786
1787         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1788                 return;
1789
1790         rcu_read_lock_bh();
1791         spin_lock(&rt6_exception_lock);
1792         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1793                                     lockdep_is_held(&rt6_exception_lock));
1794
1795         if (bucket) {
1796                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1797                         hlist_for_each_entry_safe(rt6_ex, tmp,
1798                                                   &bucket->chain, hlist) {
1799                                 rt6_age_examine_exception(bucket, rt6_ex,
1800                                                           gc_args, now);
1801                         }
1802                         bucket++;
1803                 }
1804         }
1805         spin_unlock(&rt6_exception_lock);
1806         rcu_read_unlock_bh();
1807 }
1808
1809 /* must be called with rcu lock held */
1810 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1811                                     int oif, struct flowi6 *fl6, int strict)
1812 {
1813         struct fib6_node *fn, *saved_fn;
1814         struct fib6_info *f6i;
1815
1816         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1817         saved_fn = fn;
1818
1819         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1820                 oif = 0;
1821
1822 redo_rt6_select:
1823         f6i = rt6_select(net, fn, oif, strict);
1824         if (f6i == net->ipv6.fib6_null_entry) {
1825                 fn = fib6_backtrack(fn, &fl6->saddr);
1826                 if (fn)
1827                         goto redo_rt6_select;
1828                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1829                         /* also consider unreachable route */
1830                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1831                         fn = saved_fn;
1832                         goto redo_rt6_select;
1833                 }
1834         }
1835
1836         trace_fib6_table_lookup(net, f6i, table, fl6);
1837
1838         return f6i;
1839 }
1840
1841 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1842                                int oif, struct flowi6 *fl6,
1843                                const struct sk_buff *skb, int flags)
1844 {
1845         struct fib6_info *f6i;
1846         struct rt6_info *rt;
1847         int strict = 0;
1848
1849         strict |= flags & RT6_LOOKUP_F_IFACE;
1850         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1851         if (net->ipv6.devconf_all->forwarding == 0)
1852                 strict |= RT6_LOOKUP_F_REACHABLE;
1853
1854         rcu_read_lock();
1855
1856         f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1857         if (f6i->fib6_nsiblings)
1858                 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1859
1860         if (f6i == net->ipv6.fib6_null_entry) {
1861                 rt = net->ipv6.ip6_null_entry;
1862                 rcu_read_unlock();
1863                 dst_hold(&rt->dst);
1864                 return rt;
1865         }
1866
1867         /*Search through exception table */
1868         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1869         if (rt) {
1870                 if (ip6_hold_safe(net, &rt, true))
1871                         dst_use_noref(&rt->dst, jiffies);
1872
1873                 rcu_read_unlock();
1874                 return rt;
1875         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1876                             !(f6i->fib6_flags & RTF_GATEWAY))) {
1877                 /* Create a RTF_CACHE clone which will not be
1878                  * owned by the fib6 tree.  It is for the special case where
1879                  * the daddr in the skb during the neighbor look-up is different
1880                  * from the fl6->daddr used to look-up route here.
1881                  */
1882                 struct rt6_info *uncached_rt;
1883
1884                 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1885
1886                 rcu_read_unlock();
1887
1888                 if (uncached_rt) {
1889                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1890                          * No need for another dst_hold()
1891                          */
1892                         rt6_uncached_list_add(uncached_rt);
1893                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1894                 } else {
1895                         uncached_rt = net->ipv6.ip6_null_entry;
1896                         dst_hold(&uncached_rt->dst);
1897                 }
1898
1899                 return uncached_rt;
1900         } else {
1901                 /* Get a percpu copy */
1902
1903                 struct rt6_info *pcpu_rt;
1904
1905                 local_bh_disable();
1906                 pcpu_rt = rt6_get_pcpu_route(f6i);
1907
1908                 if (!pcpu_rt)
1909                         pcpu_rt = rt6_make_pcpu_route(net, f6i);
1910
1911                 local_bh_enable();
1912                 rcu_read_unlock();
1913
1914                 return pcpu_rt;
1915         }
1916 }
1917 EXPORT_SYMBOL_GPL(ip6_pol_route);
1918
1919 static struct rt6_info *ip6_pol_route_input(struct net *net,
1920                                             struct fib6_table *table,
1921                                             struct flowi6 *fl6,
1922                                             const struct sk_buff *skb,
1923                                             int flags)
1924 {
1925         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1926 }
1927
1928 struct dst_entry *ip6_route_input_lookup(struct net *net,
1929                                          struct net_device *dev,
1930                                          struct flowi6 *fl6,
1931                                          const struct sk_buff *skb,
1932                                          int flags)
1933 {
1934         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1935                 flags |= RT6_LOOKUP_F_IFACE;
1936
1937         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1938 }
1939 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1940
1941 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1942                                   struct flow_keys *keys,
1943                                   struct flow_keys *flkeys)
1944 {
1945         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1946         const struct ipv6hdr *key_iph = outer_iph;
1947         struct flow_keys *_flkeys = flkeys;
1948         const struct ipv6hdr *inner_iph;
1949         const struct icmp6hdr *icmph;
1950         struct ipv6hdr _inner_iph;
1951         struct icmp6hdr _icmph;
1952
1953         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1954                 goto out;
1955
1956         icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1957                                    sizeof(_icmph), &_icmph);
1958         if (!icmph)
1959                 goto out;
1960
1961         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1962             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1963             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1964             icmph->icmp6_type != ICMPV6_PARAMPROB)
1965                 goto out;
1966
1967         inner_iph = skb_header_pointer(skb,
1968                                        skb_transport_offset(skb) + sizeof(*icmph),
1969                                        sizeof(_inner_iph), &_inner_iph);
1970         if (!inner_iph)
1971                 goto out;
1972
1973         key_iph = inner_iph;
1974         _flkeys = NULL;
1975 out:
1976         if (_flkeys) {
1977                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1978                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1979                 keys->tags.flow_label = _flkeys->tags.flow_label;
1980                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1981         } else {
1982                 keys->addrs.v6addrs.src = key_iph->saddr;
1983                 keys->addrs.v6addrs.dst = key_iph->daddr;
1984                 keys->tags.flow_label = ip6_flowlabel(key_iph);
1985                 keys->basic.ip_proto = key_iph->nexthdr;
1986         }
1987 }
1988
1989 /* if skb is set it will be used and fl6 can be NULL */
1990 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1991                        const struct sk_buff *skb, struct flow_keys *flkeys)
1992 {
1993         struct flow_keys hash_keys;
1994         u32 mhash;
1995
1996         switch (ip6_multipath_hash_policy(net)) {
1997         case 0:
1998                 memset(&hash_keys, 0, sizeof(hash_keys));
1999                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2000                 if (skb) {
2001                         ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2002                 } else {
2003                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2004                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2005                         hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2006                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2007                 }
2008                 break;
2009         case 1:
2010                 if (skb) {
2011                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2012                         struct flow_keys keys;
2013
2014                         /* short-circuit if we already have L4 hash present */
2015                         if (skb->l4_hash)
2016                                 return skb_get_hash_raw(skb) >> 1;
2017
2018                         memset(&hash_keys, 0, sizeof(hash_keys));
2019
2020                         if (!flkeys) {
2021                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
2022                                 flkeys = &keys;
2023                         }
2024                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2025                         hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2026                         hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2027                         hash_keys.ports.src = flkeys->ports.src;
2028                         hash_keys.ports.dst = flkeys->ports.dst;
2029                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2030                 } else {
2031                         memset(&hash_keys, 0, sizeof(hash_keys));
2032                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2033                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2034                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2035                         hash_keys.ports.src = fl6->fl6_sport;
2036                         hash_keys.ports.dst = fl6->fl6_dport;
2037                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2038                 }
2039                 break;
2040         }
2041         mhash = flow_hash_from_keys(&hash_keys);
2042
2043         return mhash >> 1;
2044 }
2045
2046 void ip6_route_input(struct sk_buff *skb)
2047 {
2048         const struct ipv6hdr *iph = ipv6_hdr(skb);
2049         struct net *net = dev_net(skb->dev);
2050         int flags = RT6_LOOKUP_F_HAS_SADDR;
2051         struct ip_tunnel_info *tun_info;
2052         struct flowi6 fl6 = {
2053                 .flowi6_iif = skb->dev->ifindex,
2054                 .daddr = iph->daddr,
2055                 .saddr = iph->saddr,
2056                 .flowlabel = ip6_flowinfo(iph),
2057                 .flowi6_mark = skb->mark,
2058                 .flowi6_proto = iph->nexthdr,
2059         };
2060         struct flow_keys *flkeys = NULL, _flkeys;
2061
2062         tun_info = skb_tunnel_info(skb);
2063         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2064                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2065
2066         if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2067                 flkeys = &_flkeys;
2068
2069         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2070                 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2071         skb_dst_drop(skb);
2072         skb_dst_set(skb,
2073                     ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2074 }
2075
2076 static struct rt6_info *ip6_pol_route_output(struct net *net,
2077                                              struct fib6_table *table,
2078                                              struct flowi6 *fl6,
2079                                              const struct sk_buff *skb,
2080                                              int flags)
2081 {
2082         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2083 }
2084
2085 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2086                                          struct flowi6 *fl6, int flags)
2087 {
2088         bool any_src;
2089
2090         if (rt6_need_strict(&fl6->daddr)) {
2091                 struct dst_entry *dst;
2092
2093                 dst = l3mdev_link_scope_lookup(net, fl6);
2094                 if (dst)
2095                         return dst;
2096         }
2097
2098         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2099
2100         any_src = ipv6_addr_any(&fl6->saddr);
2101         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2102             (fl6->flowi6_oif && any_src))
2103                 flags |= RT6_LOOKUP_F_IFACE;
2104
2105         if (!any_src)
2106                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2107         else if (sk)
2108                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2109
2110         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2111 }
2112 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2113
2114 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2115 {
2116         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2117         struct net_device *loopback_dev = net->loopback_dev;
2118         struct dst_entry *new = NULL;
2119
2120         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2121                        DST_OBSOLETE_DEAD, 0);
2122         if (rt) {
2123                 rt6_info_init(rt);
2124                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2125
2126                 new = &rt->dst;
2127                 new->__use = 1;
2128                 new->input = dst_discard;
2129                 new->output = dst_discard_out;
2130
2131                 dst_copy_metrics(new, &ort->dst);
2132
2133                 rt->rt6i_idev = in6_dev_get(loopback_dev);
2134                 rt->rt6i_gateway = ort->rt6i_gateway;
2135                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2136
2137                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2138 #ifdef CONFIG_IPV6_SUBTREES
2139                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2140 #endif
2141         }
2142
2143         dst_release(dst_orig);
2144         return new ? new : ERR_PTR(-ENOMEM);
2145 }
2146
2147 /*
2148  *      Destination cache support functions
2149  */
2150
2151 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2152 {
2153         u32 rt_cookie = 0;
2154
2155         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2156                 return false;
2157
2158         if (fib6_check_expired(f6i))
2159                 return false;
2160
2161         return true;
2162 }
2163
2164 static struct dst_entry *rt6_check(struct rt6_info *rt,
2165                                    struct fib6_info *from,
2166                                    u32 cookie)
2167 {
2168         u32 rt_cookie = 0;
2169
2170         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2171             rt_cookie != cookie)
2172                 return NULL;
2173
2174         if (rt6_check_expired(rt))
2175                 return NULL;
2176
2177         return &rt->dst;
2178 }
2179
2180 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2181                                             struct fib6_info *from,
2182                                             u32 cookie)
2183 {
2184         if (!__rt6_check_expired(rt) &&
2185             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2186             fib6_check(from, cookie))
2187                 return &rt->dst;
2188         else
2189                 return NULL;
2190 }
2191
2192 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2193 {
2194         struct dst_entry *dst_ret;
2195         struct fib6_info *from;
2196         struct rt6_info *rt;
2197
2198         rt = container_of(dst, struct rt6_info, dst);
2199
2200         rcu_read_lock();
2201
2202         /* All IPV6 dsts are created with ->obsolete set to the value
2203          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2204          * into this function always.
2205          */
2206
2207         from = rcu_dereference(rt->from);
2208
2209         if (from && (rt->rt6i_flags & RTF_PCPU ||
2210             unlikely(!list_empty(&rt->rt6i_uncached))))
2211                 dst_ret = rt6_dst_from_check(rt, from, cookie);
2212         else
2213                 dst_ret = rt6_check(rt, from, cookie);
2214
2215         rcu_read_unlock();
2216
2217         return dst_ret;
2218 }
2219
2220 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2221 {
2222         struct rt6_info *rt = (struct rt6_info *) dst;
2223
2224         if (rt) {
2225                 if (rt->rt6i_flags & RTF_CACHE) {
2226                         rcu_read_lock();
2227                         if (rt6_check_expired(rt)) {
2228                                 rt6_remove_exception_rt(rt);
2229                                 dst = NULL;
2230                         }
2231                         rcu_read_unlock();
2232                 } else {
2233                         dst_release(dst);
2234                         dst = NULL;
2235                 }
2236         }
2237         return dst;
2238 }
2239
2240 static void ip6_link_failure(struct sk_buff *skb)
2241 {
2242         struct rt6_info *rt;
2243
2244         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2245
2246         rt = (struct rt6_info *) skb_dst(skb);
2247         if (rt) {
2248                 rcu_read_lock();
2249                 if (rt->rt6i_flags & RTF_CACHE) {
2250                         if (dst_hold_safe(&rt->dst))
2251                                 rt6_remove_exception_rt(rt);
2252                 } else {
2253                         struct fib6_info *from;
2254                         struct fib6_node *fn;
2255
2256                         from = rcu_dereference(rt->from);
2257                         if (from) {
2258                                 fn = rcu_dereference(from->fib6_node);
2259                                 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2260                                         fn->fn_sernum = -1;
2261                         }
2262                 }
2263                 rcu_read_unlock();
2264         }
2265 }
2266
2267 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2268 {
2269         if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2270                 struct fib6_info *from;
2271
2272                 rcu_read_lock();
2273                 from = rcu_dereference(rt0->from);
2274                 if (from)
2275                         rt0->dst.expires = from->expires;
2276                 rcu_read_unlock();
2277         }
2278
2279         dst_set_expires(&rt0->dst, timeout);
2280         rt0->rt6i_flags |= RTF_EXPIRES;
2281 }
2282
2283 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2284 {
2285         struct net *net = dev_net(rt->dst.dev);
2286
2287         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2288         rt->rt6i_flags |= RTF_MODIFIED;
2289         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2290 }
2291
2292 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2293 {
2294         bool from_set;
2295
2296         rcu_read_lock();
2297         from_set = !!rcu_dereference(rt->from);
2298         rcu_read_unlock();
2299
2300         return !(rt->rt6i_flags & RTF_CACHE) &&
2301                 (rt->rt6i_flags & RTF_PCPU || from_set);
2302 }
2303
2304 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2305                                  const struct ipv6hdr *iph, u32 mtu)
2306 {
2307         const struct in6_addr *daddr, *saddr;
2308         struct rt6_info *rt6 = (struct rt6_info *)dst;
2309
2310         if (dst_metric_locked(dst, RTAX_MTU))
2311                 return;
2312
2313         if (iph) {
2314                 daddr = &iph->daddr;
2315                 saddr = &iph->saddr;
2316         } else if (sk) {
2317                 daddr = &sk->sk_v6_daddr;
2318                 saddr = &inet6_sk(sk)->saddr;
2319         } else {
2320                 daddr = NULL;
2321                 saddr = NULL;
2322         }
2323         dst_confirm_neigh(dst, daddr);
2324         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2325         if (mtu >= dst_mtu(dst))
2326                 return;
2327
2328         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2329                 rt6_do_update_pmtu(rt6, mtu);
2330                 /* update rt6_ex->stamp for cache */
2331                 if (rt6->rt6i_flags & RTF_CACHE)
2332                         rt6_update_exception_stamp_rt(rt6);
2333         } else if (daddr) {
2334                 struct fib6_info *from;
2335                 struct rt6_info *nrt6;
2336
2337                 rcu_read_lock();
2338                 from = rcu_dereference(rt6->from);
2339                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2340                 if (nrt6) {
2341                         rt6_do_update_pmtu(nrt6, mtu);
2342                         if (rt6_insert_exception(nrt6, from))
2343                                 dst_release_immediate(&nrt6->dst);
2344                 }
2345                 rcu_read_unlock();
2346         }
2347 }
2348
2349 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2350                                struct sk_buff *skb, u32 mtu)
2351 {
2352         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2353 }
2354
2355 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2356                      int oif, u32 mark, kuid_t uid)
2357 {
2358         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2359         struct dst_entry *dst;
2360         struct flowi6 fl6;
2361
2362         memset(&fl6, 0, sizeof(fl6));
2363         fl6.flowi6_oif = oif;
2364         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2365         fl6.daddr = iph->daddr;
2366         fl6.saddr = iph->saddr;
2367         fl6.flowlabel = ip6_flowinfo(iph);
2368         fl6.flowi6_uid = uid;
2369
2370         dst = ip6_route_output(net, NULL, &fl6);
2371         if (!dst->error)
2372                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2373         dst_release(dst);
2374 }
2375 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2376
2377 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2378 {
2379         struct dst_entry *dst;
2380
2381         ip6_update_pmtu(skb, sock_net(sk), mtu,
2382                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2383
2384         dst = __sk_dst_get(sk);
2385         if (!dst || !dst->obsolete ||
2386             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2387                 return;
2388
2389         bh_lock_sock(sk);
2390         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2391                 ip6_datagram_dst_update(sk, false);
2392         bh_unlock_sock(sk);
2393 }
2394 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2395
2396 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2397                            const struct flowi6 *fl6)
2398 {
2399 #ifdef CONFIG_IPV6_SUBTREES
2400         struct ipv6_pinfo *np = inet6_sk(sk);
2401 #endif
2402
2403         ip6_dst_store(sk, dst,
2404                       ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2405                       &sk->sk_v6_daddr : NULL,
2406 #ifdef CONFIG_IPV6_SUBTREES
2407                       ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2408                       &np->saddr :
2409 #endif
2410                       NULL);
2411 }
2412
2413 /* Handle redirects */
2414 struct ip6rd_flowi {
2415         struct flowi6 fl6;
2416         struct in6_addr gateway;
2417 };
2418
2419 static struct rt6_info *__ip6_route_redirect(struct net *net,
2420                                              struct fib6_table *table,
2421                                              struct flowi6 *fl6,
2422                                              const struct sk_buff *skb,
2423                                              int flags)
2424 {
2425         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2426         struct rt6_info *ret = NULL, *rt_cache;
2427         struct fib6_info *rt;
2428         struct fib6_node *fn;
2429
2430         /* Get the "current" route for this destination and
2431          * check if the redirect has come from appropriate router.
2432          *
2433          * RFC 4861 specifies that redirects should only be
2434          * accepted if they come from the nexthop to the target.
2435          * Due to the way the routes are chosen, this notion
2436          * is a bit fuzzy and one might need to check all possible
2437          * routes.
2438          */
2439
2440         rcu_read_lock();
2441         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2442 restart:
2443         for_each_fib6_node_rt_rcu(fn) {
2444                 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2445                         continue;
2446                 if (fib6_check_expired(rt))
2447                         continue;
2448                 if (rt->fib6_flags & RTF_REJECT)
2449                         break;
2450                 if (!(rt->fib6_flags & RTF_GATEWAY))
2451                         continue;
2452                 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2453                         continue;
2454                 /* rt_cache's gateway might be different from its 'parent'
2455                  * in the case of an ip redirect.
2456                  * So we keep searching in the exception table if the gateway
2457                  * is different.
2458                  */
2459                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2460                         rt_cache = rt6_find_cached_rt(rt,
2461                                                       &fl6->daddr,
2462                                                       &fl6->saddr);
2463                         if (rt_cache &&
2464                             ipv6_addr_equal(&rdfl->gateway,
2465                                             &rt_cache->rt6i_gateway)) {
2466                                 ret = rt_cache;
2467                                 break;
2468                         }
2469                         continue;
2470                 }
2471                 break;
2472         }
2473
2474         if (!rt)
2475                 rt = net->ipv6.fib6_null_entry;
2476         else if (rt->fib6_flags & RTF_REJECT) {
2477                 ret = net->ipv6.ip6_null_entry;
2478                 goto out;
2479         }
2480
2481         if (rt == net->ipv6.fib6_null_entry) {
2482                 fn = fib6_backtrack(fn, &fl6->saddr);
2483                 if (fn)
2484                         goto restart;
2485         }
2486
2487 out:
2488         if (ret)
2489                 dst_hold(&ret->dst);
2490         else
2491                 ret = ip6_create_rt_rcu(rt);
2492
2493         rcu_read_unlock();
2494
2495         trace_fib6_table_lookup(net, rt, table, fl6);
2496         return ret;
2497 };
2498
2499 static struct dst_entry *ip6_route_redirect(struct net *net,
2500                                             const struct flowi6 *fl6,
2501                                             const struct sk_buff *skb,
2502                                             const struct in6_addr *gateway)
2503 {
2504         int flags = RT6_LOOKUP_F_HAS_SADDR;
2505         struct ip6rd_flowi rdfl;
2506
2507         rdfl.fl6 = *fl6;
2508         rdfl.gateway = *gateway;
2509
2510         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2511                                 flags, __ip6_route_redirect);
2512 }
2513
2514 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2515                   kuid_t uid)
2516 {
2517         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2518         struct dst_entry *dst;
2519         struct flowi6 fl6;
2520
2521         memset(&fl6, 0, sizeof(fl6));
2522         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2523         fl6.flowi6_oif = oif;
2524         fl6.flowi6_mark = mark;
2525         fl6.daddr = iph->daddr;
2526         fl6.saddr = iph->saddr;
2527         fl6.flowlabel = ip6_flowinfo(iph);
2528         fl6.flowi6_uid = uid;
2529
2530         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2531         rt6_do_redirect(dst, NULL, skb);
2532         dst_release(dst);
2533 }
2534 EXPORT_SYMBOL_GPL(ip6_redirect);
2535
2536 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2537                             u32 mark)
2538 {
2539         const struct ipv6hdr *iph = ipv6_hdr(skb);
2540         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2541         struct dst_entry *dst;
2542         struct flowi6 fl6;
2543
2544         memset(&fl6, 0, sizeof(fl6));
2545         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2546         fl6.flowi6_oif = oif;
2547         fl6.flowi6_mark = mark;
2548         fl6.daddr = msg->dest;
2549         fl6.saddr = iph->daddr;
2550         fl6.flowi6_uid = sock_net_uid(net, NULL);
2551
2552         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2553         rt6_do_redirect(dst, NULL, skb);
2554         dst_release(dst);
2555 }
2556
2557 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2558 {
2559         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2560                      sk->sk_uid);
2561 }
2562 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2563
2564 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2565 {
2566         struct net_device *dev = dst->dev;
2567         unsigned int mtu = dst_mtu(dst);
2568         struct net *net = dev_net(dev);
2569
2570         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2571
2572         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2573                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2574
2575         /*
2576          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2577          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2578          * IPV6_MAXPLEN is also valid and means: "any MSS,
2579          * rely only on pmtu discovery"
2580          */
2581         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2582                 mtu = IPV6_MAXPLEN;
2583         return mtu;
2584 }
2585
2586 static unsigned int ip6_mtu(const struct dst_entry *dst)
2587 {
2588         struct inet6_dev *idev;
2589         unsigned int mtu;
2590
2591         mtu = dst_metric_raw(dst, RTAX_MTU);
2592         if (mtu)
2593                 goto out;
2594
2595         mtu = IPV6_MIN_MTU;
2596
2597         rcu_read_lock();
2598         idev = __in6_dev_get(dst->dev);
2599         if (idev)
2600                 mtu = idev->cnf.mtu6;
2601         rcu_read_unlock();
2602
2603 out:
2604         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2605
2606         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2607 }
2608
2609 /* MTU selection:
2610  * 1. mtu on route is locked - use it
2611  * 2. mtu from nexthop exception
2612  * 3. mtu from egress device
2613  *
2614  * based on ip6_dst_mtu_forward and exception logic of
2615  * rt6_find_cached_rt; called with rcu_read_lock
2616  */
2617 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2618                       struct in6_addr *saddr)
2619 {
2620         struct rt6_exception_bucket *bucket;
2621         struct rt6_exception *rt6_ex;
2622         struct in6_addr *src_key;
2623         struct inet6_dev *idev;
2624         u32 mtu = 0;
2625
2626         if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2627                 mtu = f6i->fib6_pmtu;
2628                 if (mtu)
2629                         goto out;
2630         }
2631
2632         src_key = NULL;
2633 #ifdef CONFIG_IPV6_SUBTREES
2634         if (f6i->fib6_src.plen)
2635                 src_key = saddr;
2636 #endif
2637
2638         bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2639         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2640         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2641                 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2642
2643         if (likely(!mtu)) {
2644                 struct net_device *dev = fib6_info_nh_dev(f6i);
2645
2646                 mtu = IPV6_MIN_MTU;
2647                 idev = __in6_dev_get(dev);
2648                 if (idev && idev->cnf.mtu6 > mtu)
2649                         mtu = idev->cnf.mtu6;
2650         }
2651
2652         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2653 out:
2654         return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2655 }
2656
2657 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2658                                   struct flowi6 *fl6)
2659 {
2660         struct dst_entry *dst;
2661         struct rt6_info *rt;
2662         struct inet6_dev *idev = in6_dev_get(dev);
2663         struct net *net = dev_net(dev);
2664
2665         if (unlikely(!idev))
2666                 return ERR_PTR(-ENODEV);
2667
2668         rt = ip6_dst_alloc(net, dev, 0);
2669         if (unlikely(!rt)) {
2670                 in6_dev_put(idev);
2671                 dst = ERR_PTR(-ENOMEM);
2672                 goto out;
2673         }
2674
2675         rt->dst.flags |= DST_HOST;
2676         rt->dst.input = ip6_input;
2677         rt->dst.output  = ip6_output;
2678         rt->rt6i_gateway  = fl6->daddr;
2679         rt->rt6i_dst.addr = fl6->daddr;
2680         rt->rt6i_dst.plen = 128;
2681         rt->rt6i_idev     = idev;
2682         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2683
2684         /* Add this dst into uncached_list so that rt6_disable_ip() can
2685          * do proper release of the net_device
2686          */
2687         rt6_uncached_list_add(rt);
2688         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2689
2690         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2691
2692 out:
2693         return dst;
2694 }
2695
2696 static int ip6_dst_gc(struct dst_ops *ops)
2697 {
2698         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2699         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2700         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2701         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2702         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2703         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2704         int entries;
2705
2706         entries = dst_entries_get_fast(ops);
2707         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2708             entries <= rt_max_size)
2709                 goto out;
2710
2711         net->ipv6.ip6_rt_gc_expire++;
2712         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2713         entries = dst_entries_get_slow(ops);
2714         if (entries < ops->gc_thresh)
2715                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2716 out:
2717         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2718         return entries > rt_max_size;
2719 }
2720
2721 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2722                                struct fib6_config *cfg)
2723 {
2724         struct dst_metrics *p;
2725
2726         if (!cfg->fc_mx)
2727                 return 0;
2728
2729         p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2730         if (unlikely(!p))
2731                 return -ENOMEM;
2732
2733         refcount_set(&p->refcnt, 1);
2734         rt->fib6_metrics = p;
2735
2736         return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2737 }
2738
2739 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2740                                             struct fib6_config *cfg,
2741                                             const struct in6_addr *gw_addr,
2742                                             u32 tbid, int flags)
2743 {
2744         struct flowi6 fl6 = {
2745                 .flowi6_oif = cfg->fc_ifindex,
2746                 .daddr = *gw_addr,
2747                 .saddr = cfg->fc_prefsrc,
2748         };
2749         struct fib6_table *table;
2750         struct rt6_info *rt;
2751
2752         table = fib6_get_table(net, tbid);
2753         if (!table)
2754                 return NULL;
2755
2756         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2757                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2758
2759         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2760         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2761
2762         /* if table lookup failed, fall back to full lookup */
2763         if (rt == net->ipv6.ip6_null_entry) {
2764                 ip6_rt_put(rt);
2765                 rt = NULL;
2766         }
2767
2768         return rt;
2769 }
2770
2771 static int ip6_route_check_nh_onlink(struct net *net,
2772                                      struct fib6_config *cfg,
2773                                      const struct net_device *dev,
2774                                      struct netlink_ext_ack *extack)
2775 {
2776         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2777         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2778         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2779         struct rt6_info *grt;
2780         int err;
2781
2782         err = 0;
2783         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2784         if (grt) {
2785                 if (!grt->dst.error &&
2786                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2787                         NL_SET_ERR_MSG(extack,
2788                                        "Nexthop has invalid gateway or device mismatch");
2789                         err = -EINVAL;
2790                 }
2791
2792                 ip6_rt_put(grt);
2793         }
2794
2795         return err;
2796 }
2797
2798 static int ip6_route_check_nh(struct net *net,
2799                               struct fib6_config *cfg,
2800                               struct net_device **_dev,
2801                               struct inet6_dev **idev)
2802 {
2803         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2804         struct net_device *dev = _dev ? *_dev : NULL;
2805         struct rt6_info *grt = NULL;
2806         int err = -EHOSTUNREACH;
2807
2808         if (cfg->fc_table) {
2809                 int flags = RT6_LOOKUP_F_IFACE;
2810
2811                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2812                                           cfg->fc_table, flags);
2813                 if (grt) {
2814                         if (grt->rt6i_flags & RTF_GATEWAY ||
2815                             (dev && dev != grt->dst.dev)) {
2816                                 ip6_rt_put(grt);
2817                                 grt = NULL;
2818                         }
2819                 }
2820         }
2821
2822         if (!grt)
2823                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2824
2825         if (!grt)
2826                 goto out;
2827
2828         if (dev) {
2829                 if (dev != grt->dst.dev) {
2830                         ip6_rt_put(grt);
2831                         goto out;
2832                 }
2833         } else {
2834                 *_dev = dev = grt->dst.dev;
2835                 *idev = grt->rt6i_idev;
2836                 dev_hold(dev);
2837                 in6_dev_hold(grt->rt6i_idev);
2838         }
2839
2840         if (!(grt->rt6i_flags & RTF_GATEWAY))
2841                 err = 0;
2842
2843         ip6_rt_put(grt);
2844
2845 out:
2846         return err;
2847 }
2848
2849 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2850                            struct net_device **_dev, struct inet6_dev **idev,
2851                            struct netlink_ext_ack *extack)
2852 {
2853         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2854         int gwa_type = ipv6_addr_type(gw_addr);
2855         bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2856         const struct net_device *dev = *_dev;
2857         bool need_addr_check = !dev;
2858         int err = -EINVAL;
2859
2860         /* if gw_addr is local we will fail to detect this in case
2861          * address is still TENTATIVE (DAD in progress). rt6_lookup()
2862          * will return already-added prefix route via interface that
2863          * prefix route was assigned to, which might be non-loopback.
2864          */
2865         if (dev &&
2866             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2867                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2868                 goto out;
2869         }
2870
2871         if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2872                 /* IPv6 strictly inhibits using not link-local
2873                  * addresses as nexthop address.
2874                  * Otherwise, router will not able to send redirects.
2875                  * It is very good, but in some (rare!) circumstances
2876                  * (SIT, PtP, NBMA NOARP links) it is handy to allow
2877                  * some exceptions. --ANK
2878                  * We allow IPv4-mapped nexthops to support RFC4798-type
2879                  * addressing
2880                  */
2881                 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2882                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2883                         goto out;
2884                 }
2885
2886                 if (cfg->fc_flags & RTNH_F_ONLINK)
2887                         err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2888                 else
2889                         err = ip6_route_check_nh(net, cfg, _dev, idev);
2890
2891                 if (err)
2892                         goto out;
2893         }
2894
2895         /* reload in case device was changed */
2896         dev = *_dev;
2897
2898         err = -EINVAL;
2899         if (!dev) {
2900                 NL_SET_ERR_MSG(extack, "Egress device not specified");
2901                 goto out;
2902         } else if (dev->flags & IFF_LOOPBACK) {
2903                 NL_SET_ERR_MSG(extack,
2904                                "Egress device can not be loopback device for this route");
2905                 goto out;
2906         }
2907
2908         /* if we did not check gw_addr above, do so now that the
2909          * egress device has been resolved.
2910          */
2911         if (need_addr_check &&
2912             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2913                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2914                 goto out;
2915         }
2916
2917         err = 0;
2918 out:
2919         return err;
2920 }
2921
/*
 * Build a fib6_info from the route request in @cfg without inserting it
 * into the FIB (callers such as ip6_route_add() do the insertion).
 *
 * @cfg:       route request; fc_metric and fc_protocol are defaulted in
 *             place when zero/unspecified, and fc_nlinfo.nl_net is
 *             overwritten with the namespace of the resolved device
 * @gfp_flags: allocation flags for the fib6_info itself
 * @extack:    receives a message for most validation failures
 *
 * Returns the new entry, or an ERR_PTR() on failure.  On success the
 * device reference obtained here is kept in rt->fib6_nh.nh_dev while the
 * inet6_dev reference is dropped; on failure both are dropped and the
 * partially built entry is released.
 */
2922 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2923                                               gfp_t gfp_flags,
2924                                               struct netlink_ext_ack *extack)
2925 {
2926         struct net *net = cfg->fc_nlinfo.nl_net;
2927         struct fib6_info *rt = NULL;
2928         struct net_device *dev = NULL;
2929         struct inet6_dev *idev = NULL;
2930         struct fib6_table *table;
2931         int addr_type;
2932         int err = -EINVAL;
2933
2934         /* RTF_PCPU is an internal flag; can not be set by userspace */
2935         if (cfg->fc_flags & RTF_PCPU) {
2936                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2937                 goto out;
2938         }
2939
2940         /* RTF_CACHE is an internal flag; can not be set by userspace */
2941         if (cfg->fc_flags & RTF_CACHE) {
2942                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2943                 goto out;
2944         }
2945
2946         if (cfg->fc_type > RTN_MAX) {
2947                 NL_SET_ERR_MSG(extack, "Invalid route type");
2948                 goto out;
2949         }
2950
2951         if (cfg->fc_dst_len > 128) {
2952                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2953                 goto out;
2954         }
2955         if (cfg->fc_src_len > 128) {
2956                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2957                 goto out;
2958         }
2959 #ifndef CONFIG_IPV6_SUBTREES
2960         if (cfg->fc_src_len) {
2961                 NL_SET_ERR_MSG(extack,
2962                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2963                 goto out;
2964         }
2965 #endif
             /* Resolve the requested device; both lookups take a reference
              * that is dropped at 'out' on failure.
              */
2966         if (cfg->fc_ifindex) {
2967                 err = -ENODEV;
2968                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2969                 if (!dev)
2970                         goto out;
2971                 idev = in6_dev_get(dev);
2972                 if (!idev)
2973                         goto out;
2974         }
2975
2976         if (cfg->fc_metric == 0)
2977                 cfg->fc_metric = IP6_RT_PRIO_USER;
2978
2979         if (cfg->fc_flags & RTNH_F_ONLINK) {
2980                 if (!dev) {
2981                         NL_SET_ERR_MSG(extack,
2982                                        "Nexthop device required for onlink");
2983                         err = -ENODEV;
2984                         goto out;
2985                 }
2986
2987                 if (!(dev->flags & IFF_UP)) {
2988                         NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2989                         err = -ENETDOWN;
2990                         goto out;
2991                 }
2992         }
2993
             /* Find the FIB table.  A netlink request without NLM_F_CREATE
              * only looks the table up first, but a missing table is still
              * created after logging a warning.
              */
2994         err = -ENOBUFS;
2995         if (cfg->fc_nlinfo.nlh &&
2996             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2997                 table = fib6_get_table(net, cfg->fc_table);
2998                 if (!table) {
2999                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3000                         table = fib6_new_table(net, cfg->fc_table);
3001                 }
3002         } else {
3003                 table = fib6_new_table(net, cfg->fc_table);
3004         }
3005
3006         if (!table)
3007                 goto out;
3008
3009         err = -ENOMEM;
3010         rt = fib6_info_alloc(gfp_flags);
3011         if (!rt)
3012                 goto out;
3013
3014         if (cfg->fc_flags & RTF_ADDRCONF)
3015                 rt->dst_nocount = true;
3016
3017         err = ip6_convert_metrics(net, rt, cfg);
3018         if (err < 0)
3019                 goto out;
3020
3021         if (cfg->fc_flags & RTF_EXPIRES)
3022                 fib6_set_expires(rt, jiffies +
3023                                 clock_t_to_jiffies(cfg->fc_expires));
3024         else
3025                 fib6_clean_expires(rt);
3026
3027         if (cfg->fc_protocol == RTPROT_UNSPEC)
3028                 cfg->fc_protocol = RTPROT_BOOT;
3029         rt->fib6_protocol = cfg->fc_protocol;
3030
3031         addr_type = ipv6_addr_type(&cfg->fc_dst);
3032
3033         if (cfg->fc_encap) {
3034                 struct lwtunnel_state *lwtstate;
3035
3036                 err = lwtunnel_build_state(cfg->fc_encap_type,
3037                                            cfg->fc_encap, AF_INET6, cfg,
3038                                            &lwtstate, extack);
3039                 if (err)
3040                         goto out;
3041                 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3042         }
3043
3044         ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3045         rt->fib6_dst.plen = cfg->fc_dst_len;
3046         if (rt->fib6_dst.plen == 128)
3047                 rt->dst_host = true;
3048
3049 #ifdef CONFIG_IPV6_SUBTREES
3050         ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3051         rt->fib6_src.plen = cfg->fc_src_len;
3052 #endif
3053
3054         rt->fib6_metric = cfg->fc_metric;
3055         rt->fib6_nh.nh_weight = 1;
3056
3057         rt->fib6_type = cfg->fc_type;
3058
3059         /* We cannot add true routes via loopback here,
3060            they would result in kernel looping; promote them to reject routes
3061          */
3062         if ((cfg->fc_flags & RTF_REJECT) ||
3063             (dev && (dev->flags & IFF_LOOPBACK) &&
3064              !(addr_type & IPV6_ADDR_LOOPBACK) &&
3065              !(cfg->fc_flags & RTF_LOCAL))) {
3066                 /* hold loopback dev/idev if we haven't done so. */
3067                 if (dev != net->loopback_dev) {
3068                         if (dev) {
3069                                 dev_put(dev);
3070                                 in6_dev_put(idev);
3071                         }
3072                         dev = net->loopback_dev;
3073                         dev_hold(dev);
3074                         idev = in6_dev_get(dev);
3075                         if (!idev) {
3076                                 err = -ENODEV;
3077                                 goto out;
3078                         }
3079                 }
                     /* reject routes skip gateway, device-state and prefsrc
                      * validation and take fixed flags instead of fc_flags
                      */
3080                 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3081                 goto install_route;
3082         }
3083
3084         if (cfg->fc_flags & RTF_GATEWAY) {
                     /* may fill in dev/idev (with references) when the
                      * request did not name a device
                      */
3085                 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3086                 if (err)
3087                         goto out;
3088
3089                 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3090         }
3091
3092         err = -ENODEV;
3093         if (!dev)
3094                 goto out;
3095
3096         if (idev->cnf.disable_ipv6) {
3097                 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3098                 err = -EACCES;
3099                 goto out;
3100         }
3101
3102         if (!(dev->flags & IFF_UP)) {
3103                 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3104                 err = -ENETDOWN;
3105                 goto out;
3106         }
3107
3108         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3109                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3110                         NL_SET_ERR_MSG(extack, "Invalid source address");
3111                         err = -EINVAL;
3112                         goto out;
3113                 }
3114                 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3115                 rt->fib6_prefsrc.plen = 128;
3116         } else
3117                 rt->fib6_prefsrc.plen = 0;
3118
3119         rt->fib6_flags = cfg->fc_flags;
3120
3121 install_route:
3122         if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3123             !netif_carrier_ok(dev))
3124                 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3125         rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
             /* the device reference held in 'dev' moves into the nexthop */
3126         rt->fib6_nh.nh_dev = dev;
3127         rt->fib6_table = table;
3128
3129         cfg->fc_nlinfo.nl_net = dev_net(dev);
3130
3131         if (idev)
3132                 in6_dev_put(idev);
3133
3134         return rt;
3135 out:
3136         if (dev)
3137                 dev_put(dev);
3138         if (idev)
3139                 in6_dev_put(idev);
3140
3141         fib6_info_release(rt);
3142         return ERR_PTR(err);
3143 }
3144
3145 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3146                   struct netlink_ext_ack *extack)
3147 {
3148         struct fib6_info *rt;
3149         int err;
3150
3151         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3152         if (IS_ERR(rt))
3153                 return PTR_ERR(rt);
3154
3155         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3156         fib6_info_release(rt);
3157
3158         return err;
3159 }
3160
3161 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3162 {
3163         struct net *net = info->nl_net;
3164         struct fib6_table *table;
3165         int err;
3166
3167         if (rt == net->ipv6.fib6_null_entry) {
3168                 err = -ENOENT;
3169                 goto out;
3170         }
3171
3172         table = rt->fib6_table;
3173         spin_lock_bh(&table->tb6_lock);
3174         err = fib6_del(rt, info);
3175         spin_unlock_bh(&table->tb6_lock);
3176
3177 out:
3178         fib6_info_release(rt);
3179         return err;
3180 }
3181
3182 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3183 {
3184         struct nl_info info = { .nl_net = net };
3185
3186         return __ip6_del_rt(rt, &info);
3187 }
3188
/*
 * Delete @rt and, when it has siblings and cfg->fc_delete_all_nh is set,
 * all of its sibling routes as well.
 *
 * Returns -ENOENT when @rt is the null entry, otherwise the result of the
 * last fib6_del() attempted (deletion stops at the first sibling that
 * fails).  One reference on @rt is released on every path.
 */
3189 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3190 {
3191         struct nl_info *info = &cfg->fc_nlinfo;
3192         struct net *net = info->nl_net;
3193         struct sk_buff *skb = NULL;
3194         struct fib6_table *table;
3195         int err = -ENOENT;
3196
3197         if (rt == net->ipv6.fib6_null_entry)
3198                 goto out_put;
3199         table = rt->fib6_table;
3200         spin_lock_bh(&table->tb6_lock);
3201
3202         if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3203                 struct fib6_info *sibling, *next_sibling;
3204
3205                 /* prefer to send a single notification with all hops */
3206                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3207                 if (skb) {
3208                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3209
3210                         if (rt6_fill_node(net, skb, rt, NULL,
3211                                           NULL, NULL, 0, RTM_DELROUTE,
3212                                           info->portid, seq, 0) < 0) {
3213                                 kfree_skb(skb);
3214                                 skb = NULL;
3215                         } else
                                     /* presumably suppresses the per-route
                                      * notifications from fib6_del() --
                                      * TODO confirm against fib6_del()
                                      */
3216                                 info->skip_notify = 1;
3217                 }
3218
3219                 list_for_each_entry_safe(sibling, next_sibling,
3220                                          &rt->fib6_siblings,
3221                                          fib6_siblings) {
3222                         err = fib6_del(sibling, info);
3223                         if (err)
3224                                 goto out_unlock;
3225                 }
3226         }
3227
3228         err = fib6_del(rt, info);
3229 out_unlock:
3230         spin_unlock_bh(&table->tb6_lock);
3231 out_put:
3232         fib6_info_release(rt);
3233
             /* NOTE(review): the combined notification is sent whenever it
              * was built, even if one of the deletions above failed.
              */
3234         if (skb) {
3235                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3236                             info->nlh, gfp_any());
3237         }
3238         return err;
3239 }
3240
3241 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3242 {
3243         int rc = -ESRCH;
3244
3245         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3246                 goto out;
3247
3248         if (cfg->fc_flags & RTF_GATEWAY &&
3249             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3250                 goto out;
3251         if (dst_hold_safe(&rt->dst))
3252                 rc = rt6_remove_exception_rt(rt);
3253 out:
3254         return rc;
3255 }
3256
/*
 * Delete the route described by @cfg.
 *
 * The FIB node for the destination/source prefix is located under RCU and
 * its routes are walked.  With RTF_CACHE set in fc_flags only cached
 * (exception) routes are considered; otherwise the first route matching the
 * optional ifindex, gateway, metric and protocol filters is deleted.
 *
 * Returns -ESRCH when the table does not exist or nothing matches,
 * otherwise the result of the delete helper that was invoked.
 */
3257 static int ip6_route_del(struct fib6_config *cfg,
3258                          struct netlink_ext_ack *extack)
3259 {
3260         struct rt6_info *rt_cache;
3261         struct fib6_table *table;
3262         struct fib6_info *rt;
3263         struct fib6_node *fn;
3264         int err = -ESRCH;
3265
3266         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3267         if (!table) {
3268                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3269                 return err;
3270         }
3271
3272         rcu_read_lock();
3273
3274         fn = fib6_locate(&table->tb6_root,
3275                          &cfg->fc_dst, cfg->fc_dst_len,
3276                          &cfg->fc_src, cfg->fc_src_len,
3277                          !(cfg->fc_flags & RTF_CACHE));
3278
3279         if (fn) {
                     /* the iterator macro presumably walks fn's routes
                      * through the local 'rt' -- confirm against its
                      * definition
                      */
3280                 for_each_fib6_node_rt_rcu(fn) {
3281                         if (cfg->fc_flags & RTF_CACHE) {
3282                                 int rc;
3283
3284                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3285                                                               &cfg->fc_src);
3286                                 if (rt_cache) {
3287                                         rc = ip6_del_cached_rt(rt_cache, cfg);
                                             /* -ESRCH means "filters did not
                                              * match": keep looking
                                              */
3288                                         if (rc != -ESRCH) {
3289                                                 rcu_read_unlock();
3290                                                 return rc;
3291                                         }
3292                                 }
3293                                 continue;
3294                         }
3295                         if (cfg->fc_ifindex &&
3296                             (!rt->fib6_nh.nh_dev ||
3297                              rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3298                                 continue;
3299                         if (cfg->fc_flags & RTF_GATEWAY &&
3300                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3301                                 continue;
3302                         if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3303                                 continue;
3304                         if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3305                                 continue;
                             /* take a reference before leaving the RCU
                              * section; the delete helpers release it
                              */
3306                         fib6_info_hold(rt);
3307                         rcu_read_unlock();
3308
3309                         /* if gateway was specified only delete the one hop */
3310                         if (cfg->fc_flags & RTF_GATEWAY)
3311                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3312
3313                         return __ip6_del_rt_siblings(rt, cfg);
3314                 }
3315         }
3316         rcu_read_unlock();
3317
3318         return err;
3319 }
3320
3321 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3322 {
3323         struct netevent_redirect netevent;
3324         struct rt6_info *rt, *nrt = NULL;
3325         struct ndisc_options ndopts;
3326         struct inet6_dev *in6_dev;
3327         struct neighbour *neigh;
3328         struct fib6_info *from;
3329         struct rd_msg *msg;
3330         int optlen, on_link;
3331         u8 *lladdr;
3332
3333         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3334         optlen -= sizeof(*msg);
3335
3336         if (optlen < 0) {
3337                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3338                 return;
3339         }
3340
3341         msg = (struct rd_msg *)icmp6_hdr(skb);
3342
3343         if (ipv6_addr_is_multicast(&msg->dest)) {
3344                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3345                 return;
3346         }
3347
3348         on_link = 0;
3349         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3350                 on_link = 1;
3351         } else if (ipv6_addr_type(&msg->target) !=
3352                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3353                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3354                 return;
3355         }
3356
3357         in6_dev = __in6_dev_get(skb->dev);
3358         if (!in6_dev)
3359                 return;
3360         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3361                 return;
3362
3363         /* RFC2461 8.1:
3364          *      The IP source address of the Redirect MUST be the same as the current
3365          *      first-hop router for the specified ICMP Destination Address.
3366          */
3367
3368         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3369                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3370                 return;
3371         }
3372
3373         lladdr = NULL;
3374         if (ndopts.nd_opts_tgt_lladdr) {
3375                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3376                                              skb->dev);
3377                 if (!lladdr) {
3378                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3379                         return;
3380                 }
3381         }
3382
3383         rt = (struct rt6_info *) dst;
3384         if (rt->rt6i_flags & RTF_REJECT) {
3385                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3386                 return;
3387         }
3388
3389         /* Redirect received -> path was valid.
3390          * Look, redirects are sent only in response to data packets,
3391          * so that this nexthop apparently is reachable. --ANK
3392          */
3393         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3394
3395         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3396         if (!neigh)
3397                 return;
3398
3399         /*
3400          *      We have finally decided to accept it.
3401          */
3402
3403         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3404                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3405                      NEIGH_UPDATE_F_OVERRIDE|
3406                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3407                                      NEIGH_UPDATE_F_ISROUTER)),
3408                      NDISC_REDIRECT, &ndopts);
3409
3410         rcu_read_lock();
3411         from = rcu_dereference(rt->from);
3412         fib6_info_hold(from);
3413         rcu_read_unlock();
3414
3415         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3416         if (!nrt)
3417                 goto out;
3418
3419         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3420         if (on_link)
3421                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3422
3423         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3424
	/* No need to remove rt from the exception table if rt is
	 * a cached route, because rt6_insert_exception() will
	 * take care of it.
	 */
3429         if (rt6_insert_exception(nrt, from)) {
3430                 dst_release_immediate(&nrt->dst);
3431                 goto out;
3432         }
3433
3434         netevent.old = &rt->dst;
3435         netevent.new = &nrt->dst;
3436         netevent.daddr = &msg->dest;
3437         netevent.neigh = neigh;
3438         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3439
3440 out:
3441         fib6_info_release(from);
3442         neigh_release(neigh);
3443 }
3444
3445 #ifdef CONFIG_IPV6_ROUTE_INFO
3446 static struct fib6_info *rt6_get_route_info(struct net *net,
3447                                            const struct in6_addr *prefix, int prefixlen,
3448                                            const struct in6_addr *gwaddr,
3449                                            struct net_device *dev)
3450 {
3451         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3452         int ifindex = dev->ifindex;
3453         struct fib6_node *fn;
3454         struct fib6_info *rt = NULL;
3455         struct fib6_table *table;
3456
3457         table = fib6_get_table(net, tb_id);
3458         if (!table)
3459                 return NULL;
3460
3461         rcu_read_lock();
3462         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3463         if (!fn)
3464                 goto out;
3465
3466         for_each_fib6_node_rt_rcu(fn) {
3467                 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3468                         continue;
3469                 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3470                         continue;
3471                 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3472                         continue;
3473                 fib6_info_hold(rt);
3474                 break;
3475         }
3476 out:
3477         rcu_read_unlock();
3478         return rt;
3479 }
3480
3481 static struct fib6_info *rt6_add_route_info(struct net *net,
3482                                            const struct in6_addr *prefix, int prefixlen,
3483                                            const struct in6_addr *gwaddr,
3484                                            struct net_device *dev,
3485                                            unsigned int pref)
3486 {
3487         struct fib6_config cfg = {
3488                 .fc_metric      = IP6_RT_PRIO_USER,
3489                 .fc_ifindex     = dev->ifindex,
3490                 .fc_dst_len     = prefixlen,
3491                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3492                                   RTF_UP | RTF_PREF(pref),
3493                 .fc_protocol = RTPROT_RA,
3494                 .fc_type = RTN_UNICAST,
3495                 .fc_nlinfo.portid = 0,
3496                 .fc_nlinfo.nlh = NULL,
3497                 .fc_nlinfo.nl_net = net,
3498         };
3499
3500         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3501         cfg.fc_dst = *prefix;
3502         cfg.fc_gateway = *gwaddr;
3503
3504         /* We should treat it as a default route if prefix length is 0. */
3505         if (!prefixlen)
3506                 cfg.fc_flags |= RTF_DEFAULT;
3507
3508         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3509
3510         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3511 }
3512 #endif
3513
3514 struct fib6_info *rt6_get_dflt_router(struct net *net,
3515                                      const struct in6_addr *addr,
3516                                      struct net_device *dev)
3517 {
3518         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3519         struct fib6_info *rt;
3520         struct fib6_table *table;
3521
3522         table = fib6_get_table(net, tb_id);
3523         if (!table)
3524                 return NULL;
3525
3526         rcu_read_lock();
3527         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3528                 if (dev == rt->fib6_nh.nh_dev &&
3529                     ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3530                     ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3531                         break;
3532         }
3533         if (rt)
3534                 fib6_info_hold(rt);
3535         rcu_read_unlock();
3536         return rt;
3537 }
3538
3539 struct fib6_info *rt6_add_dflt_router(struct net *net,
3540                                      const struct in6_addr *gwaddr,
3541                                      struct net_device *dev,
3542                                      unsigned int pref)
3543 {
3544         struct fib6_config cfg = {
3545                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3546                 .fc_metric      = IP6_RT_PRIO_USER,
3547                 .fc_ifindex     = dev->ifindex,
3548                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3549                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3550                 .fc_protocol = RTPROT_RA,
3551                 .fc_type = RTN_UNICAST,
3552                 .fc_nlinfo.portid = 0,
3553                 .fc_nlinfo.nlh = NULL,
3554                 .fc_nlinfo.nl_net = net,
3555         };
3556
3557         cfg.fc_gateway = *gwaddr;
3558
3559         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3560                 struct fib6_table *table;
3561
3562                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3563                 if (table)
3564                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3565         }
3566
3567         return rt6_get_dflt_router(net, gwaddr, dev);
3568 }
3569
3570 static void __rt6_purge_dflt_routers(struct net *net,
3571                                      struct fib6_table *table)
3572 {
3573         struct fib6_info *rt;
3574
3575 restart:
3576         rcu_read_lock();
3577         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3578                 struct net_device *dev = fib6_info_nh_dev(rt);
3579                 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3580
3581                 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3582                     (!idev || idev->cnf.accept_ra != 2)) {
3583                         fib6_info_hold(rt);
3584                         rcu_read_unlock();
3585                         ip6_del_rt(net, rt);
3586                         goto restart;
3587                 }
3588         }
3589         rcu_read_unlock();
3590
3591         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3592 }
3593
3594 void rt6_purge_dflt_routers(struct net *net)
3595 {
3596         struct fib6_table *table;
3597         struct hlist_head *head;
3598         unsigned int h;
3599
3600         rcu_read_lock();
3601
3602         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3603                 head = &net->ipv6.fib_table_hash[h];
3604                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3605                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3606                                 __rt6_purge_dflt_routers(net, table);
3607                 }
3608         }
3609
3610         rcu_read_unlock();
3611 }
3612
3613 static void rtmsg_to_fib6_config(struct net *net,
3614                                  struct in6_rtmsg *rtmsg,
3615                                  struct fib6_config *cfg)
3616 {
3617         memset(cfg, 0, sizeof(*cfg));
3618
3619         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3620                          : RT6_TABLE_MAIN;
3621         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3622         cfg->fc_metric = rtmsg->rtmsg_metric;
3623         cfg->fc_expires = rtmsg->rtmsg_info;
3624         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3625         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3626         cfg->fc_flags = rtmsg->rtmsg_flags;
3627         cfg->fc_type = rtmsg->rtmsg_type;
3628
3629         cfg->fc_nlinfo.nl_net = net;
3630
3631         cfg->fc_dst = rtmsg->rtmsg_dst;
3632         cfg->fc_src = rtmsg->rtmsg_src;
3633         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3634 }
3635
3636 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3637 {
3638         struct fib6_config cfg;
3639         struct in6_rtmsg rtmsg;
3640         int err;
3641
3642         switch (cmd) {
3643         case SIOCADDRT:         /* Add a route */
3644         case SIOCDELRT:         /* Delete a route */
3645                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3646                         return -EPERM;
3647                 err = copy_from_user(&rtmsg, arg,
3648                                      sizeof(struct in6_rtmsg));
3649                 if (err)
3650                         return -EFAULT;
3651
3652                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3653
3654                 rtnl_lock();
3655                 switch (cmd) {
3656                 case SIOCADDRT:
3657                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3658                         break;
3659                 case SIOCDELRT:
3660                         err = ip6_route_del(&cfg, NULL);
3661                         break;
3662                 default:
3663                         err = -EINVAL;
3664                 }
3665                 rtnl_unlock();
3666
3667                 return err;
3668         }
3669
3670         return -EINVAL;
3671 }
3672
3673 /*
3674  *      Drop the packet on the floor
3675  */
3676
3677 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3678 {
3679         int type;
3680         struct dst_entry *dst = skb_dst(skb);
3681         switch (ipstats_mib_noroutes) {
3682         case IPSTATS_MIB_INNOROUTES:
3683                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3684                 if (type == IPV6_ADDR_ANY) {
3685                         IP6_INC_STATS(dev_net(dst->dev),
3686                                       __in6_dev_get_safely(skb->dev),
3687                                       IPSTATS_MIB_INADDRERRORS);
3688                         break;
3689                 }
3690                 /* FALLTHROUGH */
3691         case IPSTATS_MIB_OUTNOROUTES:
3692                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3693                               ipstats_mib_noroutes);
3694                 break;
3695         }
3696         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3697         kfree_skb(skb);
3698         return 0;
3699 }
3700
/* Drop @skb via ip6_pkt_drop() with ICMPV6_NOROUTE, counted as
 * IPSTATS_MIB_INNOROUTES.
 */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3705
3706 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3707 {
3708         skb->dev = skb_dst(skb)->dev;
3709         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3710 }
3711
/* Drop @skb via ip6_pkt_drop() with ICMPV6_ADM_PROHIBITED, counted as
 * IPSTATS_MIB_INNOROUTES.
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3716
3717 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3718 {
3719         skb->dev = skb_dst(skb)->dev;
3720         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3721 }
3722
3723 /*
3724  *      Allocate a dst for local (unicast / anycast) address.
3725  */
3726
3727 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3728                                      struct inet6_dev *idev,
3729                                      const struct in6_addr *addr,
3730                                      bool anycast, gfp_t gfp_flags)
3731 {
3732         u32 tb_id;
3733         struct net_device *dev = idev->dev;
3734         struct fib6_info *f6i;
3735
3736         f6i = fib6_info_alloc(gfp_flags);
3737         if (!f6i)
3738                 return ERR_PTR(-ENOMEM);
3739
3740         f6i->dst_nocount = true;
3741         f6i->dst_host = true;
3742         f6i->fib6_protocol = RTPROT_KERNEL;
3743         f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3744         if (anycast) {
3745                 f6i->fib6_type = RTN_ANYCAST;
3746                 f6i->fib6_flags |= RTF_ANYCAST;
3747         } else {
3748                 f6i->fib6_type = RTN_LOCAL;
3749                 f6i->fib6_flags |= RTF_LOCAL;
3750         }
3751
3752         f6i->fib6_nh.nh_gw = *addr;
3753         dev_hold(dev);
3754         f6i->fib6_nh.nh_dev = dev;
3755         f6i->fib6_dst.addr = *addr;
3756         f6i->fib6_dst.plen = 128;
3757         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3758         f6i->fib6_table = fib6_get_table(net, tb_id);
3759
3760         return f6i;
3761 }
3762
/* remove deleted ip from prefsrc entries */

/* Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device to match; NULL matches any */
	struct net *net;		/* namespace owning fib6_null_entry */
	struct in6_addr *addr;		/* address compared with fib6_prefsrc */
};
3769
3770 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3771 {
3772         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3773         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3774         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3775
3776         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3777             rt != net->ipv6.fib6_null_entry &&
3778             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3779                 spin_lock_bh(&rt6_exception_lock);
3780                 /* remove prefsrc entry */
3781                 rt->fib6_prefsrc.plen = 0;
3782                 /* need to update cache as well */
3783                 rt6_exceptions_remove_prefsrc(rt);
3784                 spin_unlock_bh(&rt6_exception_lock);
3785         }
3786         return 0;
3787 }
3788
3789 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3790 {
3791         struct net *net = dev_net(ifp->idev->dev);
3792         struct arg_dev_net_ip adni = {
3793                 .dev = ifp->idev->dev,
3794                 .net = net,
3795                 .addr = &ifp->addr,
3796         };
3797         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3798 }
3799
3800 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3801
3802 /* Remove routers and update dst entries when gateway turn into host. */
3803 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3804 {
3805         struct in6_addr *gateway = (struct in6_addr *)arg;
3806
3807         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3808             ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3809                 return -1;
3810         }
3811
3812         /* Further clean up cached routes in exception table.
3813          * This is needed because cached route may have a different
3814          * gateway than its 'parent' in the case of an ip redirect.
3815          */
3816         rt6_exceptions_clean_tohost(rt, gateway);
3817
3818         return 0;
3819 }
3820
/* Run fib6_clean_tohost() over the FIB of @net for the given @gateway. */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3825
/* Argument passed to the fib6_ifup() / fib6_ifdown() walker callbacks. */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event refers to */
	union {
		unsigned int nh_flags;	/* read by fib6_ifup(): flags to clear */
		unsigned long event;	/* read by fib6_ifdown(): NETDEV_* event */
	};
};
3833
3834 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3835 {
3836         struct fib6_info *iter;
3837         struct fib6_node *fn;
3838
3839         fn = rcu_dereference_protected(rt->fib6_node,
3840                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3841         iter = rcu_dereference_protected(fn->leaf,
3842                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3843         while (iter) {
3844                 if (iter->fib6_metric == rt->fib6_metric &&
3845                     iter->fib6_nsiblings)
3846                         return iter;
3847                 iter = rcu_dereference_protected(iter->fib6_next,
3848                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
3849         }
3850
3851         return NULL;
3852 }
3853
3854 static bool rt6_is_dead(const struct fib6_info *rt)
3855 {
3856         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3857             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3858              fib6_ignore_linkdown(rt)))
3859                 return true;
3860
3861         return false;
3862 }
3863
3864 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3865 {
3866         struct fib6_info *iter;
3867         int total = 0;
3868
3869         if (!rt6_is_dead(rt))
3870                 total += rt->fib6_nh.nh_weight;
3871
3872         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3873                 if (!rt6_is_dead(iter))
3874                         total += iter->fib6_nh.nh_weight;
3875         }
3876
3877         return total;
3878 }
3879
3880 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3881 {
3882         int upper_bound = -1;
3883
3884         if (!rt6_is_dead(rt)) {
3885                 *weight += rt->fib6_nh.nh_weight;
3886                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3887                                                     total) - 1;
3888         }
3889         atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3890 }
3891
3892 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3893 {
3894         struct fib6_info *iter;
3895         int weight = 0;
3896
3897         rt6_upper_bound_set(rt, &weight, total);
3898
3899         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3900                 rt6_upper_bound_set(iter, &weight, total);
3901 }
3902
3903 void rt6_multipath_rebalance(struct fib6_info *rt)
3904 {
3905         struct fib6_info *first;
3906         int total;
3907
3908         /* In case the entire multipath route was marked for flushing,
3909          * then there is no need to rebalance upon the removal of every
3910          * sibling route.
3911          */
3912         if (!rt->fib6_nsiblings || rt->should_flush)
3913                 return;
3914
3915         /* During lookup routes are evaluated in order, so we need to
3916          * make sure upper bounds are assigned from the first sibling
3917          * onwards.
3918          */
3919         first = rt6_multipath_first_sibling(rt);
3920         if (WARN_ON_ONCE(!first))
3921                 return;
3922
3923         total = rt6_multipath_total_weight(first);
3924         rt6_multipath_upper_bound_set(first, total);
3925 }
3926
3927 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3928 {
3929         const struct arg_netdev_event *arg = p_arg;
3930         struct net *net = dev_net(arg->dev);
3931
3932         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3933                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3934                 fib6_update_sernum_upto_root(net, rt);
3935                 rt6_multipath_rebalance(rt);
3936         }
3937
3938         return 0;
3939 }
3940
3941 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3942 {
3943         struct arg_netdev_event arg = {
3944                 .dev = dev,
3945                 {
3946                         .nh_flags = nh_flags,
3947                 },
3948         };
3949
3950         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3951                 arg.nh_flags |= RTNH_F_LINKDOWN;
3952
3953         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3954 }
3955
3956 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3957                                    const struct net_device *dev)
3958 {
3959         struct fib6_info *iter;
3960
3961         if (rt->fib6_nh.nh_dev == dev)
3962                 return true;
3963         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3964                 if (iter->fib6_nh.nh_dev == dev)
3965                         return true;
3966
3967         return false;
3968 }
3969
3970 static void rt6_multipath_flush(struct fib6_info *rt)
3971 {
3972         struct fib6_info *iter;
3973
3974         rt->should_flush = 1;
3975         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3976                 iter->should_flush = 1;
3977 }
3978
3979 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3980                                              const struct net_device *down_dev)
3981 {
3982         struct fib6_info *iter;
3983         unsigned int dead = 0;
3984
3985         if (rt->fib6_nh.nh_dev == down_dev ||
3986             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3987                 dead++;
3988         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3989                 if (iter->fib6_nh.nh_dev == down_dev ||
3990                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3991                         dead++;
3992
3993         return dead;
3994 }
3995
3996 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3997                                        const struct net_device *dev,
3998                                        unsigned int nh_flags)
3999 {
4000         struct fib6_info *iter;
4001
4002         if (rt->fib6_nh.nh_dev == dev)
4003                 rt->fib6_nh.nh_flags |= nh_flags;
4004         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4005                 if (iter->fib6_nh.nh_dev == dev)
4006                         iter->fib6_nh.nh_flags |= nh_flags;
4007 }
4008
/* called with write lock held for table with rt
 *
 * fib6_clean_all() callback for NETDEV_UNREGISTER / NETDEV_DOWN /
 * NETDEV_CHANGE on arg->dev.  The return value is interpreted by the
 * walker.  NOTE(review): -1 appears to request deletion of @rt and -2 to
 * skip the rest of a multipath route; confirm against the walker in
 * ip6_fib.c.
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	/* The namespace's null entry is never touched. */
	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		/* Already marked by rt6_multipath_flush() on a sibling. */
		if (rt->should_flush)
			return -1;
		/* Single-path route: only the nexthop device matters. */
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			/* Every nexthop (rt plus its siblings) is dead or
			 * on the downed device: flush the whole route.
			 */
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			/* Otherwise only mark the affected nexthops and
			 * redistribute the weights.
			 */
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* Local and anycast routes are not marked link-down. */
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
4052
/* Run fib6_ifdown() over the FIB of @dev's namespace for netdev @event. */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			/* anonymous union member read by fib6_ifdown() */
			.event = event,
		},
	};

	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}
4064
/*
 * Tear down IPv6 routing state for @dev on @event: sync the FIB via
 * rt6_sync_down_dev(), flush the device from the uncached route list and
 * call neigh_ifdown() on the ND table.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4071
/* Argument passed to rt6_mtu_change_route() through fib6_clean_all(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
4076
4077 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4078 {
4079         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4080         struct inet6_dev *idev;
4081
4082         /* In IPv6 pmtu discovery is not optional,
4083            so that RTAX_MTU lock cannot disable it.
4084            We still use this lock to block changes
4085            caused by addrconf/ndisc.
4086         */
4087
4088         idev = __in6_dev_get(arg->dev);
4089         if (!idev)
4090                 return 0;
4091
4092         /* For administrative MTU increase, there is no way to discover
4093            IPv6 PMTU increase, so PMTU increase should be updated here.
4094            Since RFC 1981 doesn't include administrative MTU increase
4095            update PMTU increase is a MUST. (i.e. jumbo frame)
4096          */
4097         if (rt->fib6_nh.nh_dev == arg->dev &&
4098             !fib6_metric_locked(rt, RTAX_MTU)) {
4099                 u32 mtu = rt->fib6_pmtu;
4100
4101                 if (mtu >= arg->mtu ||
4102                     (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4103                         fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4104
4105                 spin_lock_bh(&rt6_exception_lock);
4106                 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4107                 spin_unlock_bh(&rt6_exception_lock);
4108         }
4109         return 0;
4110 }
4111
/* Run rt6_mtu_change_route() over the FIB of @dev's namespace for the
 * new @mtu.
 */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
4121
/*
 * Netlink attribute policy handed to nlmsg_parse() by rtm_to_fib6_config().
 * Attributes not listed here (e.g. RTA_DST, RTA_SRC) have an all-zero
 * policy entry; rtm_to_fib6_config() checks their length itself.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	/* address-sized payloads */
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	/* sized by one struct rtnexthop */
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
4141
4142 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4143                               struct fib6_config *cfg,
4144                               struct netlink_ext_ack *extack)
4145 {
4146         struct rtmsg *rtm;
4147         struct nlattr *tb[RTA_MAX+1];
4148         unsigned int pref;
4149         int err;
4150
4151         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4152                           NULL);
4153         if (err < 0)
4154                 goto errout;
4155
4156         err = -EINVAL;
4157         rtm = nlmsg_data(nlh);
4158         memset(cfg, 0, sizeof(*cfg));
4159
4160         cfg->fc_table = rtm->rtm_table;
4161         cfg->fc_dst_len = rtm->rtm_dst_len;
4162         cfg->fc_src_len = rtm->rtm_src_len;
4163         cfg->fc_flags = RTF_UP;
4164         cfg->fc_protocol = rtm->rtm_protocol;
4165         cfg->fc_type = rtm->rtm_type;
4166
4167         if (rtm->rtm_type == RTN_UNREACHABLE ||
4168             rtm->rtm_type == RTN_BLACKHOLE ||
4169             rtm->rtm_type == RTN_PROHIBIT ||
4170             rtm->rtm_type == RTN_THROW)
4171                 cfg->fc_flags |= RTF_REJECT;
4172
4173         if (rtm->rtm_type == RTN_LOCAL)
4174                 cfg->fc_flags |= RTF_LOCAL;
4175
4176         if (rtm->rtm_flags & RTM_F_CLONED)
4177                 cfg->fc_flags |= RTF_CACHE;
4178
4179         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4180
4181         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4182         cfg->fc_nlinfo.nlh = nlh;
4183         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4184
4185         if (tb[RTA_GATEWAY]) {
4186                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4187                 cfg->fc_flags |= RTF_GATEWAY;
4188         }
4189
4190         if (tb[RTA_DST]) {
4191                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4192
4193                 if (nla_len(tb[RTA_DST]) < plen)
4194                         goto errout;
4195
4196                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4197         }
4198
4199         if (tb[RTA_SRC]) {
4200                 int plen = (rtm->rtm_src_len + 7) >> 3;
4201
4202                 if (nla_len(tb[RTA_SRC]) < plen)
4203                         goto errout;
4204
4205                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4206         }
4207
4208         if (tb[RTA_PREFSRC])
4209                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4210
4211         if (tb[RTA_OIF])
4212                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4213
4214         if (tb[RTA_PRIORITY])
4215                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4216
4217         if (tb[RTA_METRICS]) {
4218                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4219                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4220         }
4221
4222         if (tb[RTA_TABLE])
4223                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4224
4225         if (tb[RTA_MULTIPATH]) {
4226                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4227                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4228
4229                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4230                                                      cfg->fc_mp_len, extack);
4231                 if (err < 0)
4232                         goto errout;
4233         }
4234
4235         if (tb[RTA_PREF]) {
4236                 pref = nla_get_u8(tb[RTA_PREF]);
4237                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4238                     pref != ICMPV6_ROUTER_PREF_HIGH)
4239                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4240                 cfg->fc_flags |= RTF_PREF(pref);
4241         }
4242
4243         if (tb[RTA_ENCAP])
4244                 cfg->fc_encap = tb[RTA_ENCAP];
4245
4246         if (tb[RTA_ENCAP_TYPE]) {
4247                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4248
4249                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4250                 if (err < 0)
4251                         goto errout;
4252         }
4253
4254         if (tb[RTA_EXPIRES]) {
4255                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4256
4257                 if (addrconf_finite_timeout(timeout)) {
4258                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4259                         cfg->fc_flags |= RTF_EXPIRES;
4260                 }
4261         }
4262
4263         err = 0;
4264 errout:
4265         return err;
4266 }
4267
/* One element of the temporary nexthop list (rt6_nh_list) that is built
 * while a multipath route request is processed.
 */
4268 struct rt6_nh {
4269         struct fib6_info *fib6_info;	/* route entry created for this nexthop */
4270         struct fib6_config r_cfg;	/* copy of the config this nexthop was built from */
4271         struct list_head next;	/* linkage in rt6_nh_list */
4272 };
4273
4274 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4275 {
4276         struct rt6_nh *nh;
4277
4278         list_for_each_entry(nh, rt6_nh_list, next) {
4279                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4280                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4281                         nh->r_cfg.fc_ifindex);
4282         }
4283 }
4284
4285 static int ip6_route_info_append(struct net *net,
4286                                  struct list_head *rt6_nh_list,
4287                                  struct fib6_info *rt,
4288                                  struct fib6_config *r_cfg)
4289 {
4290         struct rt6_nh *nh;
4291         int err = -EEXIST;
4292
4293         list_for_each_entry(nh, rt6_nh_list, next) {
4294                 /* check if fib6_info already exists */
4295                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4296                         return err;
4297         }
4298
4299         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4300         if (!nh)
4301                 return -ENOMEM;
4302         nh->fib6_info = rt;
4303         err = ip6_convert_metrics(net, rt, r_cfg);
4304         if (err) {
4305                 kfree(nh);
4306                 return err;
4307         }
4308         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4309         list_add_tail(&nh->next, rt6_nh_list);
4310
4311         return 0;
4312 }
4313
4314 static void ip6_route_mpath_notify(struct fib6_info *rt,
4315                                    struct fib6_info *rt_last,
4316                                    struct nl_info *info,
4317                                    __u16 nlflags)
4318 {
4319         /* if this is an APPEND route, then rt points to the first route
4320          * inserted and rt_last points to last route inserted. Userspace
4321          * wants a consistent dump of the route which starts at the first
4322          * nexthop. Since sibling routes are always added at the end of
4323          * the list, find the first sibling of the last route appended
4324          */
4325         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4326                 rt = list_first_entry(&rt_last->fib6_siblings,
4327                                       struct fib6_info,
4328                                       fib6_siblings);
4329         }
4330
4331         if (rt)
4332                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4333 }
4334
4335 static int ip6_route_multipath_add(struct fib6_config *cfg,
4336                                    struct netlink_ext_ack *extack)
4337 {
4338         struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4339         struct nl_info *info = &cfg->fc_nlinfo;
4340         struct fib6_config r_cfg;
4341         struct rtnexthop *rtnh;
4342         struct fib6_info *rt;
4343         struct rt6_nh *err_nh;
4344         struct rt6_nh *nh, *nh_safe;
4345         __u16 nlflags;
4346         int remaining;
4347         int attrlen;
4348         int err = 1;
4349         int nhn = 0;
4350         int replace = (cfg->fc_nlinfo.nlh &&
4351                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4352         LIST_HEAD(rt6_nh_list);
4353
4354         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4355         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4356                 nlflags |= NLM_F_APPEND;
4357
4358         remaining = cfg->fc_mp_len;
4359         rtnh = (struct rtnexthop *)cfg->fc_mp;
4360
4361         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4362          * fib6_info structs per nexthop
4363          */
4364         while (rtnh_ok(rtnh, remaining)) {
4365                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4366                 if (rtnh->rtnh_ifindex)
4367                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4368
4369                 attrlen = rtnh_attrlen(rtnh);
4370                 if (attrlen > 0) {
4371                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4372
4373                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4374                         if (nla) {
4375                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4376                                 r_cfg.fc_flags |= RTF_GATEWAY;
4377                         }
4378                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4379                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4380                         if (nla)
4381                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4382                 }
4383
4384                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4385                 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4386                 if (IS_ERR(rt)) {
4387                         err = PTR_ERR(rt);
4388                         rt = NULL;
4389                         goto cleanup;
4390                 }
4391
4392                 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4393
4394                 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4395                                             rt, &r_cfg);
4396                 if (err) {
4397                         fib6_info_release(rt);
4398                         goto cleanup;
4399                 }
4400
4401                 rtnh = rtnh_next(rtnh, &remaining);
4402         }
4403
4404         /* for add and replace send one notification with all nexthops.
4405          * Skip the notification in fib6_add_rt2node and send one with
4406          * the full route when done
4407          */
4408         info->skip_notify = 1;
4409
4410         err_nh = NULL;
4411         list_for_each_entry(nh, &rt6_nh_list, next) {
4412                 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4413                 fib6_info_release(nh->fib6_info);
4414
4415                 if (!err) {
4416                         /* save reference to last route successfully inserted */
4417                         rt_last = nh->fib6_info;
4418
4419                         /* save reference to first route for notification */
4420                         if (!rt_notif)
4421                                 rt_notif = nh->fib6_info;
4422                 }
4423
4424                 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4425                 nh->fib6_info = NULL;
4426                 if (err) {
4427                         if (replace && nhn)
4428                                 ip6_print_replace_route_err(&rt6_nh_list);
4429                         err_nh = nh;
4430                         goto add_errout;
4431                 }
4432
4433                 /* Because each route is added like a single route we remove
4434                  * these flags after the first nexthop: if there is a collision,
4435                  * we have already failed to add the first nexthop:
4436                  * fib6_add_rt2node() has rejected it; when replacing, old
4437                  * nexthops have been replaced by first new, the rest should
4438                  * be added to it.
4439                  */
4440                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4441                                                      NLM_F_REPLACE);
4442                 cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND;
4443                 nhn++;
4444         }
4445
4446         /* success ... tell user about new route */
4447         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4448         goto cleanup;
4449
4450 add_errout:
4451         /* send notification for routes that were added so that
4452          * the delete notifications sent by ip6_route_del are
4453          * coherent
4454          */
4455         if (rt_notif)
4456                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4457
4458         /* Delete routes that were already added */
4459         list_for_each_entry(nh, &rt6_nh_list, next) {
4460                 if (err_nh == nh)
4461                         break;
4462                 ip6_route_del(&nh->r_cfg, extack);
4463         }
4464
4465 cleanup:
4466         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4467                 if (nh->fib6_info)
4468                         fib6_info_release(nh->fib6_info);
4469                 list_del(&nh->next);
4470                 kfree(nh);
4471         }
4472
4473         return err;
4474 }
4475
4476 static int ip6_route_multipath_del(struct fib6_config *cfg,
4477                                    struct netlink_ext_ack *extack)
4478 {
4479         struct fib6_config r_cfg;
4480         struct rtnexthop *rtnh;
4481         int remaining;
4482         int attrlen;
4483         int err = 1, last_err = 0;
4484
4485         remaining = cfg->fc_mp_len;
4486         rtnh = (struct rtnexthop *)cfg->fc_mp;
4487
4488         /* Parse a Multipath Entry */
4489         while (rtnh_ok(rtnh, remaining)) {
4490                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4491                 if (rtnh->rtnh_ifindex)
4492                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4493
4494                 attrlen = rtnh_attrlen(rtnh);
4495                 if (attrlen > 0) {
4496                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4497
4498                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4499                         if (nla) {
4500                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4501                                 r_cfg.fc_flags |= RTF_GATEWAY;
4502                         }
4503                 }
4504                 err = ip6_route_del(&r_cfg, extack);
4505                 if (err)
4506                         last_err = err;
4507
4508                 rtnh = rtnh_next(rtnh, &remaining);
4509         }
4510
4511         return last_err;
4512 }
4513
4514 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4515                               struct netlink_ext_ack *extack)
4516 {
4517         struct fib6_config cfg;
4518         int err;
4519
4520         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4521         if (err < 0)
4522                 return err;
4523
4524         if (cfg.fc_mp)
4525                 return ip6_route_multipath_del(&cfg, extack);
4526         else {
4527                 cfg.fc_delete_all_nh = 1;
4528                 return ip6_route_del(&cfg, extack);
4529         }
4530 }
4531
4532 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4533                               struct netlink_ext_ack *extack)
4534 {
4535         struct fib6_config cfg;
4536         int err;
4537
4538         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4539         if (err < 0)
4540                 return err;
4541
4542         if (cfg.fc_mp)
4543                 return ip6_route_multipath_add(&cfg, extack);
4544         else
4545                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4546 }
4547
4548 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4549 {
4550         int nexthop_len = 0;
4551
4552         if (rt->fib6_nsiblings) {
4553                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4554                             + NLA_ALIGN(sizeof(struct rtnexthop))
4555                             + nla_total_size(16) /* RTA_GATEWAY */
4556                             + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4557
4558                 nexthop_len *= rt->fib6_nsiblings;
4559         }
4560
4561         return NLMSG_ALIGN(sizeof(struct rtmsg))
4562                + nla_total_size(16) /* RTA_SRC */
4563                + nla_total_size(16) /* RTA_DST */
4564                + nla_total_size(16) /* RTA_GATEWAY */
4565                + nla_total_size(16) /* RTA_PREFSRC */
4566                + nla_total_size(4) /* RTA_TABLE */
4567                + nla_total_size(4) /* RTA_IIF */
4568                + nla_total_size(4) /* RTA_OIF */
4569                + nla_total_size(4) /* RTA_PRIORITY */
4570                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4571                + nla_total_size(sizeof(struct rta_cacheinfo))
4572                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4573                + nla_total_size(1) /* RTA_PREF */
4574                + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4575                + nexthop_len;
4576 }
4577
4578 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4579                             unsigned int *flags, bool skip_oif)
4580 {
4581         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4582                 *flags |= RTNH_F_DEAD;
4583
4584         if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4585                 *flags |= RTNH_F_LINKDOWN;
4586
4587                 rcu_read_lock();
4588                 if (fib6_ignore_linkdown(rt))
4589                         *flags |= RTNH_F_DEAD;
4590                 rcu_read_unlock();
4591         }
4592
4593         if (rt->fib6_flags & RTF_GATEWAY) {
4594                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4595                         goto nla_put_failure;
4596         }
4597
4598         *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4599         if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4600                 *flags |= RTNH_F_OFFLOAD;
4601
4602         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4603         if (!skip_oif && rt->fib6_nh.nh_dev &&
4604             nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4605                 goto nla_put_failure;
4606
4607         if (rt->fib6_nh.nh_lwtstate &&
4608             lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4609                 goto nla_put_failure;
4610
4611         return 0;
4612
4613 nla_put_failure:
4614         return -EMSGSIZE;
4615 }
4616
4617 /* add multipath next hop */
4618 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4619 {
4620         const struct net_device *dev = rt->fib6_nh.nh_dev;
4621         struct rtnexthop *rtnh;
4622         unsigned int flags = 0;
4623
4624         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4625         if (!rtnh)
4626                 goto nla_put_failure;
4627
4628         rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4629         rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4630
4631         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4632                 goto nla_put_failure;
4633
4634         rtnh->rtnh_flags = flags;
4635
4636         /* length of rtnetlink header + attributes */
4637         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4638
4639         return 0;
4640
4641 nla_put_failure:
4642         return -EMSGSIZE;
4643 }
4644
4645 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4646                          struct fib6_info *rt, struct dst_entry *dst,
4647                          struct in6_addr *dest, struct in6_addr *src,
4648                          int iif, int type, u32 portid, u32 seq,
4649                          unsigned int flags)
4650 {
4651         struct rtmsg *rtm;
4652         struct nlmsghdr *nlh;
4653         long expires = 0;
4654         u32 *pmetrics;
4655         u32 table;
4656
4657         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4658         if (!nlh)
4659                 return -EMSGSIZE;
4660
4661         rtm = nlmsg_data(nlh);
4662         rtm->rtm_family = AF_INET6;
4663         rtm->rtm_dst_len = rt->fib6_dst.plen;
4664         rtm->rtm_src_len = rt->fib6_src.plen;
4665         rtm->rtm_tos = 0;
4666         if (rt->fib6_table)
4667                 table = rt->fib6_table->tb6_id;
4668         else
4669                 table = RT6_TABLE_UNSPEC;
4670         rtm->rtm_table = table;
4671         if (nla_put_u32(skb, RTA_TABLE, table))
4672                 goto nla_put_failure;
4673
4674         rtm->rtm_type = rt->fib6_type;
4675         rtm->rtm_flags = 0;
4676         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4677         rtm->rtm_protocol = rt->fib6_protocol;
4678
4679         if (rt->fib6_flags & RTF_CACHE)
4680                 rtm->rtm_flags |= RTM_F_CLONED;
4681
4682         if (dest) {
4683                 if (nla_put_in6_addr(skb, RTA_DST, dest))
4684                         goto nla_put_failure;
4685                 rtm->rtm_dst_len = 128;
4686         } else if (rtm->rtm_dst_len)
4687                 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
4688                         goto nla_put_failure;
4689 #ifdef CONFIG_IPV6_SUBTREES
4690         if (src) {
4691                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4692                         goto nla_put_failure;
4693                 rtm->rtm_src_len = 128;
4694         } else if (rtm->rtm_src_len &&
4695                    nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
4696                 goto nla_put_failure;
4697 #endif
4698         if (iif) {
4699 #ifdef CONFIG_IPV6_MROUTE
4700                 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
4701                         int err = ip6mr_get_route(net, skb, rtm, portid);
4702
4703                         if (err == 0)
4704                                 return 0;
4705                         if (err < 0)
4706                                 goto nla_put_failure;
4707                 } else
4708 #endif
4709                         if (nla_put_u32(skb, RTA_IIF, iif))
4710                                 goto nla_put_failure;
4711         } else if (dest) {
4712                 struct in6_addr saddr_buf;
4713                 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4714                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4715                         goto nla_put_failure;
4716         }
4717
4718         if (rt->fib6_prefsrc.plen) {
4719                 struct in6_addr saddr_buf;
4720                 saddr_buf = rt->fib6_prefsrc.addr;
4721                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4722                         goto nla_put_failure;
4723         }
4724
4725         pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4726         if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4727                 goto nla_put_failure;
4728
4729         if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4730                 goto nla_put_failure;
4731
4732         /* For multipath routes, walk the siblings list and add
4733          * each as a nexthop within RTA_MULTIPATH.
4734          */
4735         if (rt->fib6_nsiblings) {
4736                 struct fib6_info *sibling, *next_sibling;
4737                 struct nlattr *mp;
4738
4739                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4740                 if (!mp)
4741                         goto nla_put_failure;
4742
4743                 if (rt6_add_nexthop(skb, rt) < 0)
4744                         goto nla_put_failure;
4745
4746                 list_for_each_entry_safe(sibling, next_sibling,
4747                                          &rt->fib6_siblings, fib6_siblings) {
4748                         if (rt6_add_nexthop(skb, sibling) < 0)
4749                                 goto nla_put_failure;
4750                 }
4751
4752                 nla_nest_end(skb, mp);
4753         } else {
4754                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4755                         goto nla_put_failure;
4756         }
4757
4758         if (rt->fib6_flags & RTF_EXPIRES) {
4759                 expires = dst ? dst->expires : rt->expires;
4760                 expires -= jiffies;
4761         }
4762
4763         if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4764                 goto nla_put_failure;
4765
4766         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
4767                 goto nla_put_failure;
4768
4769
4770         nlmsg_end(skb, nlh);
4771         return 0;
4772
4773 nla_put_failure:
4774         nlmsg_cancel(skb, nlh);
4775         return -EMSGSIZE;
4776 }
4777
4778 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4779 {
4780         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4781         struct net *net = arg->net;
4782
4783         if (rt == net->ipv6.fib6_null_entry)
4784                 return 0;
4785
4786         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4787                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4788
4789                 /* user wants prefix routes only */
4790                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4791                     !(rt->fib6_flags & RTF_PREFIX_RT)) {
4792                         /* success since this is not a prefix route */
4793                         return 1;
4794                 }
4795         }
4796
4797         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4798                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4799                              arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4800 }
4801
4802 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4803                               struct netlink_ext_ack *extack)
4804 {
4805         struct net *net = sock_net(in_skb->sk);
4806         struct nlattr *tb[RTA_MAX+1];
4807         int err, iif = 0, oif = 0;
4808         struct fib6_info *from;
4809         struct dst_entry *dst;
4810         struct rt6_info *rt;
4811         struct sk_buff *skb;
4812         struct rtmsg *rtm;
4813         struct flowi6 fl6;
4814         bool fibmatch;
4815
4816         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4817                           extack);
4818         if (err < 0)
4819                 goto errout;
4820
4821         err = -EINVAL;
4822         memset(&fl6, 0, sizeof(fl6));
4823         rtm = nlmsg_data(nlh);
4824         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4825         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4826
4827         if (tb[RTA_SRC]) {
4828                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4829                         goto errout;
4830
4831                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4832         }
4833
4834         if (tb[RTA_DST]) {
4835                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4836                         goto errout;
4837
4838                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4839         }
4840
4841         if (tb[RTA_IIF])
4842                 iif = nla_get_u32(tb[RTA_IIF]);
4843
4844         if (tb[RTA_OIF])
4845                 oif = nla_get_u32(tb[RTA_OIF]);
4846
4847         if (tb[RTA_MARK])
4848                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4849
4850         if (tb[RTA_UID])
4851                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4852                                            nla_get_u32(tb[RTA_UID]));
4853         else
4854                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4855
4856         if (tb[RTA_SPORT])
4857                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4858
4859         if (tb[RTA_DPORT])
4860                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4861
4862         if (tb[RTA_IP_PROTO]) {
4863                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4864                                                   &fl6.flowi6_proto, extack);
4865                 if (err)
4866                         goto errout;
4867         }
4868
4869         if (iif) {
4870                 struct net_device *dev;
4871                 int flags = 0;
4872
4873                 rcu_read_lock();
4874
4875                 dev = dev_get_by_index_rcu(net, iif);
4876                 if (!dev) {
4877                         rcu_read_unlock();
4878                         err = -ENODEV;
4879                         goto errout;
4880                 }
4881
4882                 fl6.flowi6_iif = iif;
4883
4884                 if (!ipv6_addr_any(&fl6.saddr))
4885                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4886
4887                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4888
4889                 rcu_read_unlock();
4890         } else {
4891                 fl6.flowi6_oif = oif;
4892
4893                 dst = ip6_route_output(net, NULL, &fl6);
4894         }
4895
4896
4897         rt = container_of(dst, struct rt6_info, dst);
4898         if (rt->dst.error) {
4899                 err = rt->dst.error;
4900                 ip6_rt_put(rt);
4901                 goto errout;
4902         }
4903
4904         if (rt == net->ipv6.ip6_null_entry) {
4905                 err = rt->dst.error;
4906                 ip6_rt_put(rt);
4907                 goto errout;
4908         }
4909
4910         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4911         if (!skb) {
4912                 ip6_rt_put(rt);
4913                 err = -ENOBUFS;
4914                 goto errout;
4915         }
4916
4917         skb_dst_set(skb, &rt->dst);
4918
4919         rcu_read_lock();
4920         from = rcu_dereference(rt->from);
4921
4922         if (fibmatch)
4923                 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4924                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4925                                     nlh->nlmsg_seq, 0);
4926         else
4927                 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4928                                     &fl6.saddr, iif, RTM_NEWROUTE,
4929                                     NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4930                                     0);
4931         rcu_read_unlock();
4932
4933         if (err < 0) {
4934                 kfree_skb(skb);
4935                 goto errout;
4936         }
4937
4938         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4939 errout:
4940         return err;
4941 }
4942
4943 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4944                      unsigned int nlm_flags)
4945 {
4946         struct sk_buff *skb;
4947         struct net *net = info->nl_net;
4948         u32 seq;
4949         int err;
4950
4951         err = -ENOBUFS;
4952         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4953
4954         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4955         if (!skb)
4956                 goto errout;
4957
4958         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4959                             event, info->portid, seq, nlm_flags);
4960         if (err < 0) {
4961                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4962                 WARN_ON(err == -EMSGSIZE);
4963                 kfree_skb(skb);
4964                 goto errout;
4965         }
4966         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4967                     info->nlh, gfp_any());
4968         return;
4969 errout:
4970         if (err < 0)
4971                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4972 }
4973
4974 static int ip6_route_dev_notify(struct notifier_block *this,
4975                                 unsigned long event, void *ptr)
4976 {
4977         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4978         struct net *net = dev_net(dev);
4979
4980         if (!(dev->flags & IFF_LOOPBACK))
4981                 return NOTIFY_OK;
4982
4983         if (event == NETDEV_REGISTER) {
4984                 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
4985                 net->ipv6.ip6_null_entry->dst.dev = dev;
4986                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4987 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4988                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4989                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4990                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4991                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4992 #endif
4993          } else if (event == NETDEV_UNREGISTER &&
4994                     dev->reg_state != NETREG_UNREGISTERED) {
4995                 /* NETDEV_UNREGISTER could be fired for multiple times by
4996                  * netdev_wait_allrefs(). Make sure we only call this once.
4997                  */
4998                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4999 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5000                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5001                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5002 #endif
5003         }
5004
5005         return NOTIFY_OK;
5006 }
5007
5008 /*
5009  *      /proc
5010  */
5011
5012 #ifdef CONFIG_PROC_FS
5013 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5014 {
5015         struct net *net = (struct net *)seq->private;
5016         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5017                    net->ipv6.rt6_stats->fib_nodes,
5018                    net->ipv6.rt6_stats->fib_route_nodes,
5019                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5020                    net->ipv6.rt6_stats->fib_rt_entries,
5021                    net->ipv6.rt6_stats->fib_rt_cache,
5022                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5023                    net->ipv6.rt6_stats->fib_discarded_routes);
5024
5025         return 0;
5026 }
5027 #endif  /* CONFIG_PROC_FS */
5028
5029 #ifdef CONFIG_SYSCTL
5030
5031 static
5032 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5033                               void __user *buffer, size_t *lenp, loff_t *ppos)
5034 {
5035         struct net *net;
5036         int delay;
5037         if (!write)
5038                 return -EINVAL;
5039
5040         net = (struct net *)ctl->extra1;
5041         delay = net->ipv6.sysctl.flush_delay;
5042         proc_dointvec(ctl, write, buffer, lenp, ppos);
5043         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5044         return 0;
5045 }
5046
5047 struct ctl_table ipv6_route_table_template[] = {
5048         {
5049                 .procname       =       "flush",
5050                 .data           =       &init_net.ipv6.sysctl.flush_delay,
5051                 .maxlen         =       sizeof(int),
5052                 .mode           =       0200,
5053                 .proc_handler   =       ipv6_sysctl_rtcache_flush
5054         },
5055         {
5056                 .procname       =       "gc_thresh",
5057                 .data           =       &ip6_dst_ops_template.gc_thresh,
5058                 .maxlen         =       sizeof(int),
5059                 .mode           =       0644,
5060                 .proc_handler   =       proc_dointvec,
5061         },
5062         {
5063                 .procname       =       "max_size",
5064                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
5065                 .maxlen         =       sizeof(int),
5066                 .mode           =       0644,
5067                 .proc_handler   =       proc_dointvec,
5068         },
5069         {
5070                 .procname       =       "gc_min_interval",
5071                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5072                 .maxlen         =       sizeof(int),
5073                 .mode           =       0644,
5074                 .proc_handler   =       proc_dointvec_jiffies,
5075         },
5076         {
5077                 .procname       =       "gc_timeout",
5078                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5079                 .maxlen         =       sizeof(int),
5080                 .mode           =       0644,
5081                 .proc_handler   =       proc_dointvec_jiffies,
5082         },
5083         {
5084                 .procname       =       "gc_interval",
5085                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5086                 .maxlen         =       sizeof(int),
5087                 .mode           =       0644,
5088                 .proc_handler   =       proc_dointvec_jiffies,
5089         },
5090         {
5091                 .procname       =       "gc_elasticity",
5092                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5093                 .maxlen         =       sizeof(int),
5094                 .mode           =       0644,
5095                 .proc_handler   =       proc_dointvec,
5096         },
5097         {
5098                 .procname       =       "mtu_expires",
5099                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5100                 .maxlen         =       sizeof(int),
5101                 .mode           =       0644,
5102                 .proc_handler   =       proc_dointvec_jiffies,
5103         },
5104         {
5105                 .procname       =       "min_adv_mss",
5106                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5107                 .maxlen         =       sizeof(int),
5108                 .mode           =       0644,
5109                 .proc_handler   =       proc_dointvec,
5110         },
5111         {
5112                 .procname       =       "gc_min_interval_ms",
5113                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5114                 .maxlen         =       sizeof(int),
5115                 .mode           =       0644,
5116                 .proc_handler   =       proc_dointvec_ms_jiffies,
5117         },
5118         { }
5119 };
5120
5121 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5122 {
5123         struct ctl_table *table;
5124
5125         table = kmemdup(ipv6_route_table_template,
5126                         sizeof(ipv6_route_table_template),
5127                         GFP_KERNEL);
5128
5129         if (table) {
5130                 table[0].data = &net->ipv6.sysctl.flush_delay;
5131                 table[0].extra1 = net;
5132                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5133                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5134                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5135                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5136                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5137                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5138                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5139                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5140                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5141
5142                 /* Don't export sysctls to unprivileged users */
5143                 if (net->user_ns != &init_user_ns)
5144                         table[0].procname = NULL;
5145         }
5146
5147         return table;
5148 }
5149 #endif
5150
5151 static int __net_init ip6_route_net_init(struct net *net)
5152 {
5153         int ret = -ENOMEM;
5154
5155         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5156                sizeof(net->ipv6.ip6_dst_ops));
5157
5158         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5159                 goto out_ip6_dst_ops;
5160
5161         net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5162                                             sizeof(*net->ipv6.fib6_null_entry),
5163                                             GFP_KERNEL);
5164         if (!net->ipv6.fib6_null_entry)
5165                 goto out_ip6_dst_entries;
5166
5167         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5168                                            sizeof(*net->ipv6.ip6_null_entry),
5169                                            GFP_KERNEL);
5170         if (!net->ipv6.ip6_null_entry)
5171                 goto out_fib6_null_entry;
5172         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5173         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5174                          ip6_template_metrics, true);
5175
5176 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5177         net->ipv6.fib6_has_custom_rules = false;
5178         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5179                                                sizeof(*net->ipv6.ip6_prohibit_entry),
5180                                                GFP_KERNEL);
5181         if (!net->ipv6.ip6_prohibit_entry)
5182                 goto out_ip6_null_entry;
5183         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5184         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5185                          ip6_template_metrics, true);
5186
5187         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5188                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
5189                                                GFP_KERNEL);
5190         if (!net->ipv6.ip6_blk_hole_entry)
5191                 goto out_ip6_prohibit_entry;
5192         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5193         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5194                          ip6_template_metrics, true);
5195 #endif
5196
5197         net->ipv6.sysctl.flush_delay = 0;
5198         net->ipv6.sysctl.ip6_rt_max_size = 4096;
5199         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5200         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5201         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5202         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5203         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5204         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5205
5206         net->ipv6.ip6_rt_gc_expire = 30*HZ;
5207
5208         ret = 0;
5209 out:
5210         return ret;
5211
5212 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5213 out_ip6_prohibit_entry:
5214         kfree(net->ipv6.ip6_prohibit_entry);
5215 out_ip6_null_entry:
5216         kfree(net->ipv6.ip6_null_entry);
5217 #endif
5218 out_fib6_null_entry:
5219         kfree(net->ipv6.fib6_null_entry);
5220 out_ip6_dst_entries:
5221         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5222 out_ip6_dst_ops:
5223         goto out;
5224 }
5225
5226 static void __net_exit ip6_route_net_exit(struct net *net)
5227 {
5228         kfree(net->ipv6.fib6_null_entry);
5229         kfree(net->ipv6.ip6_null_entry);
5230 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5231         kfree(net->ipv6.ip6_prohibit_entry);
5232         kfree(net->ipv6.ip6_blk_hole_entry);
5233 #endif
5234         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5235 }
5236
5237 static int __net_init ip6_route_net_init_late(struct net *net)
5238 {
5239 #ifdef CONFIG_PROC_FS
5240         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5241                         sizeof(struct ipv6_route_iter));
5242         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5243                         rt6_stats_seq_show, NULL);
5244 #endif
5245         return 0;
5246 }
5247
5248 static void __net_exit ip6_route_net_exit_late(struct net *net)
5249 {
5250 #ifdef CONFIG_PROC_FS
5251         remove_proc_entry("ipv6_route", net->proc_net);
5252         remove_proc_entry("rt6_stats", net->proc_net);
5253 #endif
5254 }
5255
5256 static struct pernet_operations ip6_route_net_ops = {
5257         .init = ip6_route_net_init,
5258         .exit = ip6_route_net_exit,
5259 };
5260
5261 static int __net_init ipv6_inetpeer_init(struct net *net)
5262 {
5263         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5264
5265         if (!bp)
5266                 return -ENOMEM;
5267         inet_peer_base_init(bp);
5268         net->ipv6.peers = bp;
5269         return 0;
5270 }
5271
5272 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5273 {
5274         struct inet_peer_base *bp = net->ipv6.peers;
5275
5276         net->ipv6.peers = NULL;
5277         inetpeer_invalidate_tree(bp);
5278         kfree(bp);
5279 }
5280
5281 static struct pernet_operations ipv6_inetpeer_ops = {
5282         .init   =       ipv6_inetpeer_init,
5283         .exit   =       ipv6_inetpeer_exit,
5284 };
5285
5286 static struct pernet_operations ip6_route_net_late_ops = {
5287         .init = ip6_route_net_init_late,
5288         .exit = ip6_route_net_exit_late,
5289 };
5290
5291 static struct notifier_block ip6_route_dev_notifier = {
5292         .notifier_call = ip6_route_dev_notify,
5293         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5294 };
5295
5296 void __init ip6_route_init_special_entries(void)
5297 {
5298         /* Registering of the loopback is done before this portion of code,
5299          * the loopback reference in rt6_info will not be taken, do it
5300          * manually for init_net */
5301         init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5302         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5303         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5304   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5305         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5306         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5307         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5308         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5309   #endif
5310 }
5311
5312 int __init ip6_route_init(void)
5313 {
5314         int ret;
5315         int cpu;
5316
5317         ret = -ENOMEM;
5318         ip6_dst_ops_template.kmem_cachep =
5319                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5320                                   SLAB_HWCACHE_ALIGN, NULL);
5321         if (!ip6_dst_ops_template.kmem_cachep)
5322                 goto out;
5323
5324         ret = dst_entries_init(&ip6_dst_blackhole_ops);
5325         if (ret)
5326                 goto out_kmem_cache;
5327
5328         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5329         if (ret)
5330                 goto out_dst_entries;
5331
5332         ret = register_pernet_subsys(&ip6_route_net_ops);
5333         if (ret)
5334                 goto out_register_inetpeer;
5335
5336         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5337
5338         ret = fib6_init();
5339         if (ret)
5340                 goto out_register_subsys;
5341
5342         ret = xfrm6_init();
5343         if (ret)
5344                 goto out_fib6_init;
5345
5346         ret = fib6_rules_init();
5347         if (ret)
5348                 goto xfrm6_init;
5349
5350         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5351         if (ret)
5352                 goto fib6_rules_init;
5353
5354         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5355                                    inet6_rtm_newroute, NULL, 0);
5356         if (ret < 0)
5357                 goto out_register_late_subsys;
5358
5359         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5360                                    inet6_rtm_delroute, NULL, 0);
5361         if (ret < 0)
5362                 goto out_register_late_subsys;
5363
5364         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5365                                    inet6_rtm_getroute, NULL,
5366                                    RTNL_FLAG_DOIT_UNLOCKED);
5367         if (ret < 0)
5368                 goto out_register_late_subsys;
5369
5370         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5371         if (ret)
5372                 goto out_register_late_subsys;
5373
5374         for_each_possible_cpu(cpu) {
5375                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5376
5377                 INIT_LIST_HEAD(&ul->head);
5378                 spin_lock_init(&ul->lock);
5379         }
5380
5381 out:
5382         return ret;
5383
5384 out_register_late_subsys:
5385         rtnl_unregister_all(PF_INET6);
5386         unregister_pernet_subsys(&ip6_route_net_late_ops);
5387 fib6_rules_init:
5388         fib6_rules_cleanup();
5389 xfrm6_init:
5390         xfrm6_fini();
5391 out_fib6_init:
5392         fib6_gc_cleanup();
5393 out_register_subsys:
5394         unregister_pernet_subsys(&ip6_route_net_ops);
5395 out_register_inetpeer:
5396         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5397 out_dst_entries:
5398         dst_entries_destroy(&ip6_dst_blackhole_ops);
5399 out_kmem_cache:
5400         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5401         goto out;
5402 }
5403
/*
 * Tear down what ip6_route_init() set up: the netdevice notifier, the
 * per-namespace operations, fib6 rules, xfrm6, fib6 GC state, the
 * blackhole dst entry counter and finally the dst slab cache.
 *
 * NOTE(review): unlike the error path of ip6_route_init(), this function
 * does not call rtnl_unregister_all(PF_INET6), and it unregisters
 * ipv6_inetpeer_ops before ip6_route_net_ops (the init error path does
 * the opposite) - presumably intentional; confirm before reordering.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	/* The cache is shared with ip6_dst_blackhole_ops; destroy it last. */
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}