ipv6: fix incorrect bitwise operator used on rt6i_flags
[linux-2.6-microblaze.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Result of the neighbour reachability check in rt6_check_neigh():
 * negative values reject a route with increasing severity, positive
 * accepts it (see how find_match()/rt6_score_route() consume these).
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route unusable, skip outright */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour in NUD_FAILED state */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour; request round-robin */
	RT6_NUD_SUCCEED = 1		/* next hop (probably) reachable */
};
80
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114                                            const struct in6_addr *prefix, int prefixlen,
115                                            const struct in6_addr *gwaddr,
116                                            struct net_device *dev,
117                                            unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev);
122 #endif
123
/* Per-CPU list of routes that live outside the FIB tree ("uncached"),
 * kept so they can be re-pointed at loopback when their device goes
 * away (see rt6_uncached_list_flush_dev()).
 */
struct uncached_list {
	spinlock_t		lock;	/* protects head */
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
/* Link @rt onto the current CPU's uncached list, remembering which
 * list was used so rt6_uncached_list_del() can unlink from the right
 * one even if it runs on another CPU.
 */
static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
141
/* Unlink @rt from the per-CPU uncached list it was added to, if any,
 * and drop the per-netns uncached-route statistics counter.
 */
static void rt6_uncached_list_del(struct rt6_info *rt)
{
	/* Empty list_head means the route was never list_add()ed. */
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
154
/* Re-point every uncached route that still references @dev at the
 * loopback device, so no dangling device/inet6_dev pointers remain
 * (presumably called on device teardown — the uncached routes are
 * not reachable via the FIB and cannot be flushed there).
 * Nothing to do when @dev itself is the loopback device.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* Move the inet6_dev reference over to loopback. */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* Move the net_device reference over to loopback. */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
186
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189         return dst_metrics_write_ptr(rt->dst.from);
190 }
191
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195
196         if (rt->rt6i_flags & RTF_PCPU)
197                 return rt6_pcpu_cow_metrics(rt);
198         else if (rt->rt6i_flags & RTF_CACHE)
199                 return NULL;
200         else
201                 return dst_cow_metrics_generic(dst, old);
202 }
203
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205                                              struct sk_buff *skb,
206                                              const void *daddr)
207 {
208         struct in6_addr *p = &rt->rt6i_gateway;
209
210         if (!ipv6_addr_any(p))
211                 return (const void *) p;
212         else if (skb)
213                 return &ipv6_hdr(skb)->daddr;
214         return daddr;
215 }
216
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218                                           struct sk_buff *skb,
219                                           const void *daddr)
220 {
221         struct rt6_info *rt = (struct rt6_info *) dst;
222         struct neighbour *n;
223
224         daddr = choose_neigh_daddr(rt, skb, daddr);
225         n = __ipv6_neigh_lookup(dst->dev, daddr);
226         if (n)
227                 return n;
228         return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230
/* dst_ops->confirm_neigh hook: mark the next-hop neighbour entry for
 * @dst as recently confirmed.  Skipped for devices that do not run
 * neighbour discovery and for multicast destinations.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
245
/* Operations for ordinary IPv6 dst entries; copied into each netns'
 * ipv6.ip6_dst_ops at init time (hence "template").
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
264
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269         return mtu ? : dst->dev->mtu;
270 }
271
/* Blackholed dsts ignore PMTU updates — intentionally a no-op. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
276
/* Blackholed dsts ignore redirects — intentionally a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
281
/* Operations for blackholed dsts: PMTU updates and redirects are
 * no-ops, everything else behaves like a regular IPv6 dst.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};
293
/* Metrics used by the template routes below; hop limit 0 means
 * "unspecified" (presumably falls back to the sysctl default —
 * TODO confirm against ip6_default_advmss()/dst users).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
297
/* Template for the per-netns null route: matched traffic is discarded
 * and the lookup error is -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
/* Template for the "prohibit" route (policy routing): traffic is
 * rejected with -EACCES and ICMPv6 administratively-prohibited
 * handling via ip6_pkt_prohibit*().
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
329
/* Template for the "blackhole" route (policy routing): traffic is
 * silently discarded (dst_discard*) and the lookup error is -EINVAL.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
344
345 #endif
346
/* Initialize the rt6_info-specific part of a freshly allocated route.
 * Zeroes everything that follows the embedded dst_entry (relies on
 * ->dst being the first member of struct rt6_info — confirm against
 * the struct layout), then sets up the list heads.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
355
/* Allocate a dst with ip6_dst_ops, initialize the rt6_info part and
 * account it in the per-netns allocation statistics.  Returns NULL on
 * allocation failure.
 */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
371
/* Public route allocator: like __ip6_dst_alloc() but also sets up the
 * per-cpu route-clone cache (rt6i_pcpu).  The whole allocation fails
 * (and the dst is released) if the per-cpu array cannot be allocated.
 */
struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (!rt->rt6i_pcpu) {
			dst_release_immediate(&rt->dst);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
389
/* dst_ops->destroy hook: release everything an rt6_info owns —
 * metrics, the per-cpu clone array, its uncached-list membership, the
 * inet6_dev reference, the PMTU/redirect exception bucket, and the
 * reference on the route it was cloned from (dst->from).
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	/* Teardown context: no concurrent readers, plain deref is safe. */
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	/* Clear the pointer before dropping the reference. */
	dst->from = NULL;
	dst_release(from);
}
415
/* dst_ops->ifdown hook: when @dev goes down, migrate the route's
 * inet6_dev reference to the loopback device so it never dangles.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
/* True iff @rt has expired, either directly (RTF_EXPIRES with a past
 * expiry) or transitively: a clone is expired when it was obsoleted
 * or its parent (dst.from) has expired.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->dst.from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
		       rt6_check_expired((struct rt6_info *)rt->dst.from);
	}
	return false;
}
452
/* Select one route of an ECMP group.  @match is the first route of the
 * group, its siblings hang off match->rt6i_siblings.  The flow hash
 * picks an index in [0, nsiblings]; index 0 keeps @match itself.  If
 * the chosen sibling does not score as usable, fall back to @match.
 */
static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;
	int route_choosen;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);

	route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
	/* Don't change the route, if route_choosen == 0
	 * (siblings does not include ourself)
	 */
	if (route_choosen)
		list_for_each_entry_safe(sibling, next_sibling,
				&match->rt6i_siblings, rt6i_siblings) {
			route_choosen--;
			if (route_choosen == 0) {
				/* Negative score: keep the original @match. */
				if (rt6_score_route(sibling, oif, strict) < 0)
					break;
				match = sibling;
				break;
			}
		}
	return match;
}
483
484 /*
485  *      Route lookup. rcu_read_lock() should be held.
486  */
487
/*
 * Pick from the list of routes starting at @rt the one matching the
 * requested device.  With an @oif, prefer an exact device match;
 * loopback routes bound to another interface are remembered as a
 * fallback.  Without an @oif, match on the source address instead.
 * Returns @rt unchanged when there is nothing to constrain on, or the
 * null entry when RT6_LOOKUP_F_IFACE demands a device match that
 * cannot be satisfied.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* No device and no source address: nothing to filter on. */
	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;	/* exact device match */
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					/* Keep an existing oif-bound local. */
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			/* Match the route whose device owns @saddr. */
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
534
535 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred router-probe request: records the target address and holds
 * a reference on the outgoing device until the work item runs.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
541
/* Workqueue handler for rt6_probe(): send a neighbour solicitation to
 * the target's solicited-node multicast address, then drop the device
 * reference taken when the work was queued and free the request.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
553
/* Schedule a reachability probe for @rt's gateway if its neighbour
 * entry is missing or stale.  The actual NS transmit is deferred to a
 * workqueue since this runs in lookup context.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;	/* reachable, nothing to probe */

		work = NULL;
		write_lock(&neigh->lock);
		/* Re-check under the lock and rate-limit by the per-device
		 * router probe interval.
		 */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* No neighbour entry at all: probe unconditionally. */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		/* Reference released by rt6_probe_deferred(). */
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
600 #else
/* Router probing is a no-op without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
604 #endif
605
606 /*
607  * Default Router Selection (RFC 2461 6.3.6)
608  */
609 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
610 {
611         struct net_device *dev = rt->dst.dev;
612         if (!oif || dev->ifindex == oif)
613                 return 2;
614         if ((dev->flags & IFF_LOOPBACK) &&
615             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
616                 return 1;
617         return 0;
618 }
619
/* Classify the reachability of @rt's next hop for route scoring.
 * Routes without a gateway (or marked RTF_NONEXTHOP) trivially
 * succeed.  See enum rt6_nud_state for the return codes.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		/* With router preference, anything not yet FAILED still
		 * counts as reachable; FAILED asks for a probe instead.
		 */
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		/* No neighbour entry: succeed (and probe later) when
		 * router preference is on, otherwise round-robin.
		 */
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
650
/* Compute a preference score for @rt (higher is better), or a
 * negative RT6_NUD_* code when the route must be rejected.  Bits 0-1
 * hold the device match from rt6_check_dev(); with router preference
 * enabled, the decoded preference is shifted in above them.
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
669
/* Compare @rt against the best candidate seen so far (@match with
 * score *@mpri) and return whichever wins.  *@do_rr is set when the
 * winning route asked for round-robin (no neighbour entry).
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	/* Optionally skip routes whose device has no carrier. */
	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
707
/* Find the best route among those of a node with the given @metric.
 * Scans in round-robin order: from @rr_head to the end of the list,
 * then from @leaf (the list head) back up to @rr_head.  @cont records
 * where the next metric level starts; if nothing matched at this
 * metric, scoring continues from there.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* First half: rr_head .. end of this metric level. */
	for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* Second half: leaf .. rr_head (wrap-around). */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->dst.rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* Nothing usable at this metric: try the remaining routes. */
	for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
746
/* Select the best route from fib6_node @fn, honoring the round-robin
 * pointer (fn->rr_ptr) and advancing it when the winning route asked
 * for round-robin.  Caller holds rcu_read_lock().  Returns the null
 * entry when @fn carries no usable route.
 */
static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
	struct rt6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf)
		return net->ipv6.ip6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.ip6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->rt6i_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.ip6_null_entry;
}
796
797 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
798 {
799         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
800 }
801
802 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option received in a Router
 * Advertisement (RFC 4191) from gateway @gwaddr on @dev: validate the
 * option, then add, refresh or delete the corresponding RTF_ROUTEINFO
 * route according to its lifetime and preference.  Returns 0 on
 * success or -EINVAL on a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* prefixes longer than 64 bits need the full option body */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* A zero-length prefix is handled as a default route. */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		/* Existing route: refresh the advertised preference. */
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
876 #endif
877
878 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
879                                         struct in6_addr *saddr)
880 {
881         struct fib6_node *pn, *sn;
882         while (1) {
883                 if (fn->fn_flags & RTN_TL_ROOT)
884                         return NULL;
885                 pn = rcu_dereference(fn->parent);
886                 sn = FIB6_SUBTREE(pn);
887                 if (sn && sn != fn)
888                         fn = fib6_lookup(sn, NULL, saddr);
889                 else
890                         fn = pn;
891                 if (fn->fn_flags & RTN_RTINFO)
892                         return fn;
893         }
894 }
895
896 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
897                           bool null_fallback)
898 {
899         struct rt6_info *rt = *prt;
900
901         if (dst_hold_safe(&rt->dst))
902                 return true;
903         if (null_fallback) {
904                 rt = net->ipv6.ip6_null_entry;
905                 dst_hold(&rt->dst);
906         } else {
907                 rt = NULL;
908         }
909         *prt = rt;
910         return false;
911 }
912
913 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
914                                              struct fib6_table *table,
915                                              struct flowi6 *fl6, int flags)
916 {
917         struct rt6_info *rt, *rt_cache;
918         struct fib6_node *fn;
919
920         rcu_read_lock();
921         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
922 restart:
923         rt = rcu_dereference(fn->leaf);
924         if (!rt) {
925                 rt = net->ipv6.ip6_null_entry;
926         } else {
927                 rt = rt6_device_match(net, rt, &fl6->saddr,
928                                       fl6->flowi6_oif, flags);
929                 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
930                         rt = rt6_multipath_select(rt, fl6,
931                                                   fl6->flowi6_oif, flags);
932         }
933         if (rt == net->ipv6.ip6_null_entry) {
934                 fn = fib6_backtrack(fn, &fl6->saddr);
935                 if (fn)
936                         goto restart;
937         }
938         /* Search through exception table */
939         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
940         if (rt_cache)
941                 rt = rt_cache;
942
943         if (ip6_hold_safe(net, &rt, true))
944                 dst_use_noref(&rt->dst, jiffies);
945
946         rcu_read_unlock();
947
948         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
949
950         return rt;
951
952 }
953
954 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
955                                     int flags)
956 {
957         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
958 }
959 EXPORT_SYMBOL_GPL(ip6_route_lookup);
960
961 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
962                             const struct in6_addr *saddr, int oif, int strict)
963 {
964         struct flowi6 fl6 = {
965                 .flowi6_oif = oif,
966                 .daddr = *daddr,
967         };
968         struct dst_entry *dst;
969         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
970
971         if (saddr) {
972                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
973                 flags |= RT6_LOOKUP_F_HAS_SADDR;
974         }
975
976         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
977         if (dst->error == 0)
978                 return (struct rt6_info *) dst;
979
980         dst_release(dst);
981
982         return NULL;
983 }
984 EXPORT_SYMBOL(rt6_lookup);
985
986 /* ip6_ins_rt is called with FREE table->tb6_lock.
987  * It takes new route entry, the addition fails by any reason the
988  * route is released.
989  * Caller must hold dst before calling it.
990  */
991
992 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
993                         struct mx6_config *mxc,
994                         struct netlink_ext_ack *extack)
995 {
996         int err;
997         struct fib6_table *table;
998
999         table = rt->rt6i_table;
1000         spin_lock_bh(&table->tb6_lock);
1001         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1002         spin_unlock_bh(&table->tb6_lock);
1003
1004         return err;
1005 }
1006
1007 int ip6_ins_rt(struct rt6_info *rt)
1008 {
1009         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1010         struct mx6_config mxc = { .mx = NULL, };
1011
1012         /* Hold dst to account for the reference from the fib6 tree */
1013         dst_hold(&rt->dst);
1014         return __ip6_ins_rt(rt, &info, &mxc, NULL);
1015 }
1016
1017 /* called with rcu_lock held */
1018 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1019 {
1020         struct net_device *dev = rt->dst.dev;
1021
1022         if (rt->rt6i_flags & RTF_LOCAL) {
1023                 /* for copies of local routes, dst->dev needs to be the
1024                  * device if it is a master device, the master device if
1025                  * device is enslaved, and the loopback as the default
1026                  */
1027                 if (netif_is_l3_slave(dev) &&
1028                     !rt6_need_strict(&rt->rt6i_dst.addr))
1029                         dev = l3mdev_master_dev_rcu(dev);
1030                 else if (!netif_is_l3_master(dev))
1031                         dev = dev_net(dev)->loopback_dev;
1032                 /* last case is netif_is_l3_master(dev) is true in which
1033                  * case we want dev returned to be dev
1034                  */
1035         }
1036
1037         return dev;
1038 }
1039
/* Allocate and initialise an RTF_CACHE clone of @ort for the exact
 * destination @daddr (and, with subtrees, source @saddr).  The clone
 * is a /128 host route; returns NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	/* Cache/pcpu entries are themselves clones; copy from their origin */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	/* pin the clone to the exact destination */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* on-link origin covering more than this host: anycast */
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1082
/* Allocate a per-cpu (RTF_PCPU) copy of @rt.  Returns NULL on
 * allocation failure.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1099
1100 /* It should be called with rcu_read_lock() acquired */
1101 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1102 {
1103         struct rt6_info *pcpu_rt, **p;
1104
1105         p = this_cpu_ptr(rt->rt6i_pcpu);
1106         pcpu_rt = *p;
1107
1108         if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1109                 rt6_dst_from_metrics_check(pcpu_rt);
1110
1111         return pcpu_rt;
1112 }
1113
/* Create, hold and publish this cpu's pcpu copy of @rt.  Falls back to
 * the (held) null entry when allocation fails.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	/* take the ref before publishing via cmpxchg */
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	/* the slot must have been empty: only this cpu populates it */
	BUG_ON(prev);

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}
1134
/* Exception (RTF_CACHE) hash table implementation.
 * This lock serializes all writers of every route's exception bucket.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1138
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	rt6_ex->rt6i->rt6i_node = NULL;
	/* unlink first; readers under RCU may still see the entry until
	 * the grace period ends, hence kfree_rcu below
	 */
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}
1159
1160 /* Remove oldest rt6_ex in bucket and free the memory
1161  * Caller must hold rt6_exception_lock
1162  */
1163 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1164 {
1165         struct rt6_exception *rt6_ex, *oldest = NULL;
1166
1167         if (!bucket)
1168                 return;
1169
1170         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1171                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1172                         oldest = rt6_ex;
1173         }
1174         rt6_remove_exception(bucket, oldest);
1175 }
1176
1177 static u32 rt6_exception_hash(const struct in6_addr *dst,
1178                               const struct in6_addr *src)
1179 {
1180         static u32 seed __read_mostly;
1181         u32 val;
1182
1183         net_get_random_once(&seed, sizeof(seed));
1184         val = jhash(dst, sizeof(*dst), seed);
1185
1186 #ifdef CONFIG_IPV6_SUBTREES
1187         if (src)
1188                 val = jhash(src, sizeof(*src), val);
1189 #endif
1190         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1191 }
1192
1193 /* Helper function to find the cached rt in the hash table
1194  * and update bucket pointer to point to the bucket for this
1195  * (daddr, saddr) pair
1196  * Caller must hold rt6_exception_lock
1197  */
1198 static struct rt6_exception *
1199 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1200                               const struct in6_addr *daddr,
1201                               const struct in6_addr *saddr)
1202 {
1203         struct rt6_exception *rt6_ex;
1204         u32 hval;
1205
1206         if (!(*bucket) || !daddr)
1207                 return NULL;
1208
1209         hval = rt6_exception_hash(daddr, saddr);
1210         *bucket += hval;
1211
1212         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1213                 struct rt6_info *rt6 = rt6_ex->rt6i;
1214                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1215
1216 #ifdef CONFIG_IPV6_SUBTREES
1217                 if (matched && saddr)
1218                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1219 #endif
1220                 if (matched)
1221                         return rt6_ex;
1222         }
1223         return NULL;
1224 }
1225
1226 /* Helper function to find the cached rt in the hash table
1227  * and update bucket pointer to point to the bucket for this
1228  * (daddr, saddr) pair
1229  * Caller must hold rcu_read_lock()
1230  */
1231 static struct rt6_exception *
1232 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1233                          const struct in6_addr *daddr,
1234                          const struct in6_addr *saddr)
1235 {
1236         struct rt6_exception *rt6_ex;
1237         u32 hval;
1238
1239         WARN_ON_ONCE(!rcu_read_lock_held());
1240
1241         if (!(*bucket) || !daddr)
1242                 return NULL;
1243
1244         hval = rt6_exception_hash(daddr, saddr);
1245         *bucket += hval;
1246
1247         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1248                 struct rt6_info *rt6 = rt6_ex->rt6i;
1249                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1250
1251 #ifdef CONFIG_IPV6_SUBTREES
1252                 if (matched && saddr)
1253                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1254 #endif
1255                 if (matched)
1256                         return rt6_ex;
1257         }
1258         return NULL;
1259 }
1260
/* Insert the RTF_CACHE route @nrt into the exception table of its
 * origin route @ort, replacing any existing entry for the same
 * (daddr[, saddr]) key and evicting the oldest entry when the bucket
 * overflows.  Returns 0 on success or a negative errno.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct net *net = dev_net(ort->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	/* ort is being deleted; rt6_flush_exceptions() set this flag */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		/* first exception for ort: allocate the bucket array */
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any existing exception for the same key */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err)
		fib6_update_sernum(ort);

	return err;
}
1348
1349 void rt6_flush_exceptions(struct rt6_info *rt)
1350 {
1351         struct rt6_exception_bucket *bucket;
1352         struct rt6_exception *rt6_ex;
1353         struct hlist_node *tmp;
1354         int i;
1355
1356         spin_lock_bh(&rt6_exception_lock);
1357         /* Prevent rt6_insert_exception() to recreate the bucket list */
1358         rt->exception_bucket_flushed = 1;
1359
1360         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1361                                     lockdep_is_held(&rt6_exception_lock));
1362         if (!bucket)
1363                 goto out;
1364
1365         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1366                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1367                         rt6_remove_exception(bucket, rt6_ex);
1368                 WARN_ON_ONCE(bucket->depth);
1369                 bucket++;
1370         }
1371
1372 out:
1373         spin_unlock_bh(&rt6_exception_lock);
1374 }
1375
1376 /* Find cached rt in the hash table inside passed in rt
1377  * Caller has to hold rcu_read_lock()
1378  */
1379 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1380                                            struct in6_addr *daddr,
1381                                            struct in6_addr *saddr)
1382 {
1383         struct rt6_exception_bucket *bucket;
1384         struct in6_addr *src_key = NULL;
1385         struct rt6_exception *rt6_ex;
1386         struct rt6_info *res = NULL;
1387
1388         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1389
1390 #ifdef CONFIG_IPV6_SUBTREES
1391         /* rt6i_src.plen != 0 indicates rt is in subtree
1392          * and exception table is indexed by a hash of
1393          * both rt6i_dst and rt6i_src.
1394          * Otherwise, the exception table is indexed by
1395          * a hash of only rt6i_dst.
1396          */
1397         if (rt->rt6i_src.plen)
1398                 src_key = saddr;
1399 #endif
1400         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1401
1402         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1403                 res = rt6_ex->rt6i;
1404
1405         return res;
1406 }
1407
1408 /* Remove the passed in cached rt from the hash table that contains it */
1409 int rt6_remove_exception_rt(struct rt6_info *rt)
1410 {
1411         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1412         struct rt6_exception_bucket *bucket;
1413         struct in6_addr *src_key = NULL;
1414         struct rt6_exception *rt6_ex;
1415         int err;
1416
1417         if (!from ||
1418             !(rt->rt6i_flags & RTF_CACHE))
1419                 return -EINVAL;
1420
1421         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1422                 return -ENOENT;
1423
1424         spin_lock_bh(&rt6_exception_lock);
1425         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1426                                     lockdep_is_held(&rt6_exception_lock));
1427 #ifdef CONFIG_IPV6_SUBTREES
1428         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1429          * and exception table is indexed by a hash of
1430          * both rt6i_dst and rt6i_src.
1431          * Otherwise, the exception table is indexed by
1432          * a hash of only rt6i_dst.
1433          */
1434         if (from->rt6i_src.plen)
1435                 src_key = &rt->rt6i_src.addr;
1436 #endif
1437         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1438                                                &rt->rt6i_dst.addr,
1439                                                src_key);
1440         if (rt6_ex) {
1441                 rt6_remove_exception(bucket, rt6_ex);
1442                 err = 0;
1443         } else {
1444                 err = -ENOENT;
1445         }
1446
1447         spin_unlock_bh(&rt6_exception_lock);
1448         return err;
1449 }
1450
1451 /* Find rt6_ex which contains the passed in rt cache and
1452  * refresh its stamp
1453  */
1454 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1455 {
1456         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1457         struct rt6_exception_bucket *bucket;
1458         struct in6_addr *src_key = NULL;
1459         struct rt6_exception *rt6_ex;
1460
1461         if (!from ||
1462             !(rt->rt6i_flags & RTF_CACHE))
1463                 return;
1464
1465         rcu_read_lock();
1466         bucket = rcu_dereference(from->rt6i_exception_bucket);
1467
1468 #ifdef CONFIG_IPV6_SUBTREES
1469         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1470          * and exception table is indexed by a hash of
1471          * both rt6i_dst and rt6i_src.
1472          * Otherwise, the exception table is indexed by
1473          * a hash of only rt6i_dst.
1474          */
1475         if (from->rt6i_src.plen)
1476                 src_key = &rt->rt6i_src.addr;
1477 #endif
1478         rt6_ex = __rt6_find_exception_rcu(&bucket,
1479                                           &rt->rt6i_dst.addr,
1480                                           src_key);
1481         if (rt6_ex)
1482                 rt6_ex->stamp = jiffies;
1483
1484         rcu_read_unlock();
1485 }
1486
1487 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1488 {
1489         struct rt6_exception_bucket *bucket;
1490         struct rt6_exception *rt6_ex;
1491         int i;
1492
1493         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1494                                         lockdep_is_held(&rt6_exception_lock));
1495
1496         if (bucket) {
1497                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1498                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1499                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1500                         }
1501                         bucket++;
1502                 }
1503         }
1504 }
1505
1506 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1507 {
1508         struct rt6_exception_bucket *bucket;
1509         struct rt6_exception *rt6_ex;
1510         int i;
1511
1512         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1513                                         lockdep_is_held(&rt6_exception_lock));
1514
1515         if (bucket) {
1516                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1517                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1518                                 struct rt6_info *entry = rt6_ex->rt6i;
1519                                 /* For RTF_CACHE with rt6i_pmtu == 0
1520                                  * (i.e. a redirected route),
1521                                  * the metrics of its rt->dst.from has already
1522                                  * been updated.
1523                                  */
1524                                 if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1525                                         entry->rt6i_pmtu = mtu;
1526                         }
1527                         bucket++;
1528                 }
1529         }
1530 }
1531
1532 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1533
1534 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1535                                         struct in6_addr *gateway)
1536 {
1537         struct rt6_exception_bucket *bucket;
1538         struct rt6_exception *rt6_ex;
1539         struct hlist_node *tmp;
1540         int i;
1541
1542         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1543                 return;
1544
1545         spin_lock_bh(&rt6_exception_lock);
1546         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1547                                      lockdep_is_held(&rt6_exception_lock));
1548
1549         if (bucket) {
1550                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1551                         hlist_for_each_entry_safe(rt6_ex, tmp,
1552                                                   &bucket->chain, hlist) {
1553                                 struct rt6_info *entry = rt6_ex->rt6i;
1554
1555                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1556                                     RTF_CACHE_GATEWAY &&
1557                                     ipv6_addr_equal(gateway,
1558                                                     &entry->rt6i_gateway)) {
1559                                         rt6_remove_exception(bucket, rt6_ex);
1560                                 }
1561                         }
1562                         bucket++;
1563                 }
1564         }
1565
1566         spin_unlock_bh(&rt6_exception_lock);
1567 }
1568
/* Garbage-collect a single exception entry: drop it when it is both
 * unreferenced and unused past the GC timeout, or when it is a gateway
 * route whose neighbour is no longer advertising itself as a router.
 * Otherwise count it as "more" work for the GC.
 * Caller must hold rt6_exception_lock.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* __refcnt == 1 means only the exception table holds it */
	if (atomic_read(&rt->dst.__refcnt) == 1 &&
	    time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
		RT6_TRACE("aging clone %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	} else if (rt->rt6i_flags & RTF_GATEWAY) {
		/* NOTE: bitwise AND is required here to test the flag;
		 * '|' would be unconditionally true for any route.
		 */
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
		if (neigh) {
			neigh_flags = neigh->flags;
			neigh_release(neigh);
		}
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}
	gc_args->more++;
}
1599
1600 void rt6_age_exceptions(struct rt6_info *rt,
1601                         struct fib6_gc_args *gc_args,
1602                         unsigned long now)
1603 {
1604         struct rt6_exception_bucket *bucket;
1605         struct rt6_exception *rt6_ex;
1606         struct hlist_node *tmp;
1607         int i;
1608
1609         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1610                 return;
1611
1612         spin_lock_bh(&rt6_exception_lock);
1613         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1614                                     lockdep_is_held(&rt6_exception_lock));
1615
1616         if (bucket) {
1617                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1618                         hlist_for_each_entry_safe(rt6_ex, tmp,
1619                                                   &bucket->chain, hlist) {
1620                                 rt6_age_examine_exception(bucket, rt6_ex,
1621                                                           gc_args, now);
1622                         }
1623                         bucket++;
1624                 }
1625         }
1626         spin_unlock_bh(&rt6_exception_lock);
1627 }
1628
1629 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1630                                int oif, struct flowi6 *fl6, int flags)
1631 {
1632         struct fib6_node *fn, *saved_fn;
1633         struct rt6_info *rt, *rt_cache;
1634         int strict = 0;
1635
1636         strict |= flags & RT6_LOOKUP_F_IFACE;
1637         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1638         if (net->ipv6.devconf_all->forwarding == 0)
1639                 strict |= RT6_LOOKUP_F_REACHABLE;
1640
1641         rcu_read_lock();
1642
1643         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1644         saved_fn = fn;
1645
1646         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1647                 oif = 0;
1648
1649 redo_rt6_select:
1650         rt = rt6_select(net, fn, oif, strict);
1651         if (rt->rt6i_nsiblings)
1652                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1653         if (rt == net->ipv6.ip6_null_entry) {
1654                 fn = fib6_backtrack(fn, &fl6->saddr);
1655                 if (fn)
1656                         goto redo_rt6_select;
1657                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1658                         /* also consider unreachable route */
1659                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1660                         fn = saved_fn;
1661                         goto redo_rt6_select;
1662                 }
1663         }
1664
1665         /*Search through exception table */
1666         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1667         if (rt_cache)
1668                 rt = rt_cache;
1669
1670         if (rt == net->ipv6.ip6_null_entry) {
1671                 rcu_read_unlock();
1672                 dst_hold(&rt->dst);
1673                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1674                 return rt;
1675         } else if (rt->rt6i_flags & RTF_CACHE) {
1676                 if (ip6_hold_safe(net, &rt, true)) {
1677                         dst_use_noref(&rt->dst, jiffies);
1678                         rt6_dst_from_metrics_check(rt);
1679                 }
1680                 rcu_read_unlock();
1681                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1682                 return rt;
1683         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1684                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1685                 /* Create a RTF_CACHE clone which will not be
1686                  * owned by the fib6 tree.  It is for the special case where
1687                  * the daddr in the skb during the neighbor look-up is different
1688                  * from the fl6->daddr used to look-up route here.
1689                  */
1690
1691                 struct rt6_info *uncached_rt;
1692
1693                 if (ip6_hold_safe(net, &rt, true)) {
1694                         dst_use_noref(&rt->dst, jiffies);
1695                 } else {
1696                         rcu_read_unlock();
1697                         uncached_rt = rt;
1698                         goto uncached_rt_out;
1699                 }
1700                 rcu_read_unlock();
1701
1702                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1703                 dst_release(&rt->dst);
1704
1705                 if (uncached_rt) {
1706                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1707                          * No need for another dst_hold()
1708                          */
1709                         rt6_uncached_list_add(uncached_rt);
1710                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1711                 } else {
1712                         uncached_rt = net->ipv6.ip6_null_entry;
1713                         dst_hold(&uncached_rt->dst);
1714                 }
1715
1716 uncached_rt_out:
1717                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1718                 return uncached_rt;
1719
1720         } else {
1721                 /* Get a percpu copy */
1722
1723                 struct rt6_info *pcpu_rt;
1724
1725                 dst_use_noref(&rt->dst, jiffies);
1726                 local_bh_disable();
1727                 pcpu_rt = rt6_get_pcpu_route(rt);
1728
1729                 if (!pcpu_rt) {
1730                         /* atomic_inc_not_zero() is needed when using rcu */
1731                         if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1732                                 /* No dst_hold() on rt is needed because grabbing
1733                                  * rt->rt6i_ref makes sure rt can't be released.
1734                                  */
1735                                 pcpu_rt = rt6_make_pcpu_route(rt);
1736                                 rt6_release(rt);
1737                         } else {
1738                                 /* rt is already removed from tree */
1739                                 pcpu_rt = net->ipv6.ip6_null_entry;
1740                                 dst_hold(&pcpu_rt->dst);
1741                         }
1742                 }
1743                 local_bh_enable();
1744                 rcu_read_unlock();
1745                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1746                 return pcpu_rt;
1747         }
1748 }
1749 EXPORT_SYMBOL_GPL(ip6_pol_route);
1750
1751 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1752                                             struct flowi6 *fl6, int flags)
1753 {
1754         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1755 }
1756
1757 struct dst_entry *ip6_route_input_lookup(struct net *net,
1758                                          struct net_device *dev,
1759                                          struct flowi6 *fl6, int flags)
1760 {
1761         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1762                 flags |= RT6_LOOKUP_F_IFACE;
1763
1764         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1765 }
1766 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1767
1768 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1769                                   struct flow_keys *keys)
1770 {
1771         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1772         const struct ipv6hdr *key_iph = outer_iph;
1773         const struct ipv6hdr *inner_iph;
1774         const struct icmp6hdr *icmph;
1775         struct ipv6hdr _inner_iph;
1776
1777         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1778                 goto out;
1779
1780         icmph = icmp6_hdr(skb);
1781         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1782             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1783             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1784             icmph->icmp6_type != ICMPV6_PARAMPROB)
1785                 goto out;
1786
1787         inner_iph = skb_header_pointer(skb,
1788                                        skb_transport_offset(skb) + sizeof(*icmph),
1789                                        sizeof(_inner_iph), &_inner_iph);
1790         if (!inner_iph)
1791                 goto out;
1792
1793         key_iph = inner_iph;
1794 out:
1795         memset(keys, 0, sizeof(*keys));
1796         keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1797         keys->addrs.v6addrs.src = key_iph->saddr;
1798         keys->addrs.v6addrs.dst = key_iph->daddr;
1799         keys->tags.flow_label = ip6_flowinfo(key_iph);
1800         keys->basic.ip_proto = key_iph->nexthdr;
1801 }
1802
1803 /* if skb is set it will be used and fl6 can be NULL */
1804 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1805 {
1806         struct flow_keys hash_keys;
1807
1808         if (skb) {
1809                 ip6_multipath_l3_keys(skb, &hash_keys);
1810                 return flow_hash_from_keys(&hash_keys);
1811         }
1812
1813         return get_hash_from_flowi6(fl6);
1814 }
1815
1816 void ip6_route_input(struct sk_buff *skb)
1817 {
1818         const struct ipv6hdr *iph = ipv6_hdr(skb);
1819         struct net *net = dev_net(skb->dev);
1820         int flags = RT6_LOOKUP_F_HAS_SADDR;
1821         struct ip_tunnel_info *tun_info;
1822         struct flowi6 fl6 = {
1823                 .flowi6_iif = skb->dev->ifindex,
1824                 .daddr = iph->daddr,
1825                 .saddr = iph->saddr,
1826                 .flowlabel = ip6_flowinfo(iph),
1827                 .flowi6_mark = skb->mark,
1828                 .flowi6_proto = iph->nexthdr,
1829         };
1830
1831         tun_info = skb_tunnel_info(skb);
1832         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1833                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1834         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1835                 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1836         skb_dst_drop(skb);
1837         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1838 }
1839
1840 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1841                                              struct flowi6 *fl6, int flags)
1842 {
1843         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1844 }
1845
1846 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1847                                          struct flowi6 *fl6, int flags)
1848 {
1849         bool any_src;
1850
1851         if (rt6_need_strict(&fl6->daddr)) {
1852                 struct dst_entry *dst;
1853
1854                 dst = l3mdev_link_scope_lookup(net, fl6);
1855                 if (dst)
1856                         return dst;
1857         }
1858
1859         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1860
1861         any_src = ipv6_addr_any(&fl6->saddr);
1862         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1863             (fl6->flowi6_oif && any_src))
1864                 flags |= RT6_LOOKUP_F_IFACE;
1865
1866         if (!any_src)
1867                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1868         else if (sk)
1869                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1870
1871         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1872 }
1873 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1874
/* Build a blackhole clone of @dst_orig: a dst that preserves the original
 * addressing and metrics but silently discards every packet in both
 * directions.  Consumes the caller's reference on @dst_orig.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	/* DST_OBSOLETE_DEAD: this dst never validates via dst_check() */
	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* drop traffic in both directions */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		/* in6_dev_get() takes a reference on the loopback idev */
		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* RTF_PCPU is per-copy internal state; never inherit it */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
1908
1909 /*
1910  *      Destination cache support functions
1911  */
1912
1913 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1914 {
1915         if (rt->dst.from &&
1916             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1917                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1918 }
1919
1920 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1921 {
1922         u32 rt_cookie = 0;
1923
1924         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1925                 return NULL;
1926
1927         if (rt6_check_expired(rt))
1928                 return NULL;
1929
1930         return &rt->dst;
1931 }
1932
1933 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1934 {
1935         if (!__rt6_check_expired(rt) &&
1936             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1937             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1938                 return &rt->dst;
1939         else
1940                 return NULL;
1941 }
1942
1943 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1944 {
1945         struct rt6_info *rt;
1946
1947         rt = (struct rt6_info *) dst;
1948
1949         /* All IPV6 dsts are created with ->obsolete set to the value
1950          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1951          * into this function always.
1952          */
1953
1954         rt6_dst_from_metrics_check(rt);
1955
1956         if (rt->rt6i_flags & RTF_PCPU ||
1957             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1958                 return rt6_dst_from_check(rt, cookie);
1959         else
1960                 return rt6_check(rt, cookie);
1961 }
1962
1963 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1964 {
1965         struct rt6_info *rt = (struct rt6_info *) dst;
1966
1967         if (rt) {
1968                 if (rt->rt6i_flags & RTF_CACHE) {
1969                         if (rt6_check_expired(rt)) {
1970                                 ip6_del_rt(rt);
1971                                 dst = NULL;
1972                         }
1973                 } else {
1974                         dst_release(dst);
1975                         dst = NULL;
1976                 }
1977         }
1978         return dst;
1979 }
1980
/* dst_ops.link_failure hook: report the destination unreachable to the
 * sender and make sure the failing route is not reused.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* unlink the cached clone; ip6_del_rt() consumes
			 * the reference taken by dst_hold_safe()
			 */
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			/* poison the node's serial number so cached dsts
			 * through this node fail their cookie check and
			 * get re-looked-up
			 */
			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}
2003
2004 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2005 {
2006         struct net *net = dev_net(rt->dst.dev);
2007
2008         rt->rt6i_flags |= RTF_MODIFIED;
2009         rt->rt6i_pmtu = mtu;
2010         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2011 }
2012
2013 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2014 {
2015         return !(rt->rt6i_flags & RTF_CACHE) &&
2016                 (rt->rt6i_flags & RTF_PCPU ||
2017                  rcu_access_pointer(rt->rt6i_node));
2018 }
2019
/* Core PMTU update: apply @mtu to the route behind @dst.
 *
 * The addresses used for neighbor confirmation and exception creation
 * come from @iph when present, else from @sk, else none.  Routes that
 * may not carry cached state in place get a RTF_CACHE exception clone
 * instead of an in-place update.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	/* a PMTU message proves forward reachability of the nexthop */
	dst_confirm_neigh(dst, daddr);
	/* never go below the IPv6 minimum MTU; ignore non-shrinking updates */
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct rt6_info *nrt6;

		/* record the MTU in a new exception clone; on insert
		 * failure the clone is freed immediately
		 */
		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, rt6))
				dst_release_immediate(&nrt6->dst);
		}
	}
}
2063
2064 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2065                                struct sk_buff *skb, u32 mtu)
2066 {
2067         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2068 }
2069
2070 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2071                      int oif, u32 mark, kuid_t uid)
2072 {
2073         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2074         struct dst_entry *dst;
2075         struct flowi6 fl6;
2076
2077         memset(&fl6, 0, sizeof(fl6));
2078         fl6.flowi6_oif = oif;
2079         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2080         fl6.daddr = iph->daddr;
2081         fl6.saddr = iph->saddr;
2082         fl6.flowlabel = ip6_flowinfo(iph);
2083         fl6.flowi6_uid = uid;
2084
2085         dst = ip6_route_output(net, NULL, &fl6);
2086         if (!dst->error)
2087                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2088         dst_release(dst);
2089 }
2090 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2091
/* Socket-aware PMTU update: apply the new MTU, then refresh the socket's
 * cached dst if it no longer validates.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	/* nothing more to do if the cached dst is absent or still valid */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	/* re-route under the socket lock; skip v4-mapped destinations,
	 * which are handled by the IPv4 path
	 */
	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2110
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* must be first: lookups cast flowi6 <-> ip6rd_flowi */
	struct in6_addr gateway;	/* router that sent the redirect */
};
2116
/* Lookup callback used when processing an ICMPv6 redirect: find the
 * route the redirect applies to, requiring that it actually goes through
 * the router that sent the redirect.  Returns a held rt6_info (possibly
 * the null entry).
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				rt = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	/* rt is NULL when the node held no routes at all */
	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* no match at this node: climb the tree and retry */
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	/* take a reference before leaving the RCU section */
	ip6_hold_safe(net, &rt, true);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	return rt;
};
2189
2190 static struct dst_entry *ip6_route_redirect(struct net *net,
2191                                         const struct flowi6 *fl6,
2192                                         const struct in6_addr *gateway)
2193 {
2194         int flags = RT6_LOOKUP_F_HAS_SADDR;
2195         struct ip6rd_flowi rdfl;
2196
2197         rdfl.fl6 = *fl6;
2198         rdfl.gateway = *gateway;
2199
2200         return fib6_rule_lookup(net, &rdfl.fl6,
2201                                 flags, __ip6_route_redirect);
2202 }
2203
2204 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2205                   kuid_t uid)
2206 {
2207         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2208         struct dst_entry *dst;
2209         struct flowi6 fl6;
2210
2211         memset(&fl6, 0, sizeof(fl6));
2212         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2213         fl6.flowi6_oif = oif;
2214         fl6.flowi6_mark = mark;
2215         fl6.daddr = iph->daddr;
2216         fl6.saddr = iph->saddr;
2217         fl6.flowlabel = ip6_flowinfo(iph);
2218         fl6.flowi6_uid = uid;
2219
2220         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2221         rt6_do_redirect(dst, NULL, skb);
2222         dst_release(dst);
2223 }
2224 EXPORT_SYMBOL_GPL(ip6_redirect);
2225
2226 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2227                             u32 mark)
2228 {
2229         const struct ipv6hdr *iph = ipv6_hdr(skb);
2230         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2231         struct dst_entry *dst;
2232         struct flowi6 fl6;
2233
2234         memset(&fl6, 0, sizeof(fl6));
2235         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2236         fl6.flowi6_oif = oif;
2237         fl6.flowi6_mark = mark;
2238         fl6.daddr = msg->dest;
2239         fl6.saddr = iph->daddr;
2240         fl6.flowi6_uid = sock_net_uid(net, NULL);
2241
2242         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2243         rt6_do_redirect(dst, NULL, skb);
2244         dst_release(dst);
2245 }
2246
2247 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2248 {
2249         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2250                      sk->sk_uid);
2251 }
2252 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2253
2254 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2255 {
2256         struct net_device *dev = dst->dev;
2257         unsigned int mtu = dst_mtu(dst);
2258         struct net *net = dev_net(dev);
2259
2260         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2261
2262         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2263                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2264
2265         /*
2266          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2267          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2268          * IPV6_MAXPLEN is also valid and means: "any MSS,
2269          * rely only on pmtu discovery"
2270          */
2271         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2272                 mtu = IPV6_MAXPLEN;
2273         return mtu;
2274 }
2275
2276 static unsigned int ip6_mtu(const struct dst_entry *dst)
2277 {
2278         const struct rt6_info *rt = (const struct rt6_info *)dst;
2279         unsigned int mtu = rt->rt6i_pmtu;
2280         struct inet6_dev *idev;
2281
2282         if (mtu)
2283                 goto out;
2284
2285         mtu = dst_metric_raw(dst, RTAX_MTU);
2286         if (mtu)
2287                 goto out;
2288
2289         mtu = IPV6_MIN_MTU;
2290
2291         rcu_read_lock();
2292         idev = __in6_dev_get(dst->dev);
2293         if (idev)
2294                 mtu = idev->cnf.mtu6;
2295         rcu_read_unlock();
2296
2297 out:
2298         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2299
2300         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2301 }
2302
/* Allocate a throw-away host route for sending an ICMPv6 packet out of
 * @dev, bypassing the fib tree.  Returns an ERR_PTR on failure, else a
 * dst (possibly transformed by xfrm).
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		/* drop the reference taken by in6_dev_get() above */
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.output  = ip6_output;
	/* direct host route: the gateway is the destination itself */
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;	/* takes over the idev reference */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_ifdown() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2340
/* dst_ops.gc hook: run fib6 garbage collection when the dst count grows
 * too large or the minimum GC interval has elapsed.  Returns non-zero
 * when the table is still over rt_max_size (allocation should fail).
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	/* skip GC while under both the rate limit and the size limit */
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* each consecutive GC pass uses a more aggressive expiry window */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* exponentially decay the aggressiveness back toward normal */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2365
2366 static int ip6_convert_metrics(struct mx6_config *mxc,
2367                                const struct fib6_config *cfg)
2368 {
2369         bool ecn_ca = false;
2370         struct nlattr *nla;
2371         int remaining;
2372         u32 *mp;
2373
2374         if (!cfg->fc_mx)
2375                 return 0;
2376
2377         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2378         if (unlikely(!mp))
2379                 return -ENOMEM;
2380
2381         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2382                 int type = nla_type(nla);
2383                 u32 val;
2384
2385                 if (!type)
2386                         continue;
2387                 if (unlikely(type > RTAX_MAX))
2388                         goto err;
2389
2390                 if (type == RTAX_CC_ALGO) {
2391                         char tmp[TCP_CA_NAME_MAX];
2392
2393                         nla_strlcpy(tmp, nla, sizeof(tmp));
2394                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
2395                         if (val == TCP_CA_UNSPEC)
2396                                 goto err;
2397                 } else {
2398                         val = nla_get_u32(nla);
2399                 }
2400                 if (type == RTAX_HOPLIMIT && val > 255)
2401                         val = 255;
2402                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2403                         goto err;
2404
2405                 mp[type - 1] = val;
2406                 __set_bit(type - 1, mxc->mx_valid);
2407         }
2408
2409         if (ecn_ca) {
2410                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2411                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2412         }
2413
2414         mxc->mx = mp;
2415         return 0;
2416  err:
2417         kfree(mp);
2418         return -EINVAL;
2419 }
2420
2421 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2422                                             struct fib6_config *cfg,
2423                                             const struct in6_addr *gw_addr)
2424 {
2425         struct flowi6 fl6 = {
2426                 .flowi6_oif = cfg->fc_ifindex,
2427                 .daddr = *gw_addr,
2428                 .saddr = cfg->fc_prefsrc,
2429         };
2430         struct fib6_table *table;
2431         struct rt6_info *rt;
2432         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2433
2434         table = fib6_get_table(net, cfg->fc_table);
2435         if (!table)
2436                 return NULL;
2437
2438         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2439                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2440
2441         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2442
2443         /* if table lookup failed, fall back to full lookup */
2444         if (rt == net->ipv6.ip6_null_entry) {
2445                 ip6_rt_put(rt);
2446                 rt = NULL;
2447         }
2448
2449         return rt;
2450 }
2451
2452 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2453                                               struct netlink_ext_ack *extack)
2454 {
2455         struct net *net = cfg->fc_nlinfo.nl_net;
2456         struct rt6_info *rt = NULL;
2457         struct net_device *dev = NULL;
2458         struct inet6_dev *idev = NULL;
2459         struct fib6_table *table;
2460         int addr_type;
2461         int err = -EINVAL;
2462
2463         /* RTF_PCPU is an internal flag; can not be set by userspace */
2464         if (cfg->fc_flags & RTF_PCPU) {
2465                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2466                 goto out;
2467         }
2468
2469         if (cfg->fc_dst_len > 128) {
2470                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2471                 goto out;
2472         }
2473         if (cfg->fc_src_len > 128) {
2474                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2475                 goto out;
2476         }
2477 #ifndef CONFIG_IPV6_SUBTREES
2478         if (cfg->fc_src_len) {
2479                 NL_SET_ERR_MSG(extack,
2480                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2481                 goto out;
2482         }
2483 #endif
2484         if (cfg->fc_ifindex) {
2485                 err = -ENODEV;
2486                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2487                 if (!dev)
2488                         goto out;
2489                 idev = in6_dev_get(dev);
2490                 if (!idev)
2491                         goto out;
2492         }
2493
2494         if (cfg->fc_metric == 0)
2495                 cfg->fc_metric = IP6_RT_PRIO_USER;
2496
2497         err = -ENOBUFS;
2498         if (cfg->fc_nlinfo.nlh &&
2499             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2500                 table = fib6_get_table(net, cfg->fc_table);
2501                 if (!table) {
2502                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2503                         table = fib6_new_table(net, cfg->fc_table);
2504                 }
2505         } else {
2506                 table = fib6_new_table(net, cfg->fc_table);
2507         }
2508
2509         if (!table)
2510                 goto out;
2511
2512         rt = ip6_dst_alloc(net, NULL,
2513                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2514
2515         if (!rt) {
2516                 err = -ENOMEM;
2517                 goto out;
2518         }
2519
2520         if (cfg->fc_flags & RTF_EXPIRES)
2521                 rt6_set_expires(rt, jiffies +
2522                                 clock_t_to_jiffies(cfg->fc_expires));
2523         else
2524                 rt6_clean_expires(rt);
2525
2526         if (cfg->fc_protocol == RTPROT_UNSPEC)
2527                 cfg->fc_protocol = RTPROT_BOOT;
2528         rt->rt6i_protocol = cfg->fc_protocol;
2529
2530         addr_type = ipv6_addr_type(&cfg->fc_dst);
2531
2532         if (addr_type & IPV6_ADDR_MULTICAST)
2533                 rt->dst.input = ip6_mc_input;
2534         else if (cfg->fc_flags & RTF_LOCAL)
2535                 rt->dst.input = ip6_input;
2536         else
2537                 rt->dst.input = ip6_forward;
2538
2539         rt->dst.output = ip6_output;
2540
2541         if (cfg->fc_encap) {
2542                 struct lwtunnel_state *lwtstate;
2543
2544                 err = lwtunnel_build_state(cfg->fc_encap_type,
2545                                            cfg->fc_encap, AF_INET6, cfg,
2546                                            &lwtstate, extack);
2547                 if (err)
2548                         goto out;
2549                 rt->dst.lwtstate = lwtstate_get(lwtstate);
2550                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2551                         rt->dst.lwtstate->orig_output = rt->dst.output;
2552                         rt->dst.output = lwtunnel_output;
2553                 }
2554                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2555                         rt->dst.lwtstate->orig_input = rt->dst.input;
2556                         rt->dst.input = lwtunnel_input;
2557                 }
2558         }
2559
2560         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2561         rt->rt6i_dst.plen = cfg->fc_dst_len;
2562         if (rt->rt6i_dst.plen == 128)
2563                 rt->dst.flags |= DST_HOST;
2564
2565 #ifdef CONFIG_IPV6_SUBTREES
2566         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2567         rt->rt6i_src.plen = cfg->fc_src_len;
2568 #endif
2569
2570         rt->rt6i_metric = cfg->fc_metric;
2571
2572         /* We cannot add true routes via loopback here,
2573            they would result in kernel looping; promote them to reject routes
2574          */
2575         if ((cfg->fc_flags & RTF_REJECT) ||
2576             (dev && (dev->flags & IFF_LOOPBACK) &&
2577              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2578              !(cfg->fc_flags & RTF_LOCAL))) {
2579                 /* hold loopback dev/idev if we haven't done so. */
2580                 if (dev != net->loopback_dev) {
2581                         if (dev) {
2582                                 dev_put(dev);
2583                                 in6_dev_put(idev);
2584                         }
2585                         dev = net->loopback_dev;
2586                         dev_hold(dev);
2587                         idev = in6_dev_get(dev);
2588                         if (!idev) {
2589                                 err = -ENODEV;
2590                                 goto out;
2591                         }
2592                 }
2593                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2594                 switch (cfg->fc_type) {
2595                 case RTN_BLACKHOLE:
2596                         rt->dst.error = -EINVAL;
2597                         rt->dst.output = dst_discard_out;
2598                         rt->dst.input = dst_discard;
2599                         break;
2600                 case RTN_PROHIBIT:
2601                         rt->dst.error = -EACCES;
2602                         rt->dst.output = ip6_pkt_prohibit_out;
2603                         rt->dst.input = ip6_pkt_prohibit;
2604                         break;
2605                 case RTN_THROW:
2606                 case RTN_UNREACHABLE:
2607                 default:
2608                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2609                                         : (cfg->fc_type == RTN_UNREACHABLE)
2610                                         ? -EHOSTUNREACH : -ENETUNREACH;
2611                         rt->dst.output = ip6_pkt_discard_out;
2612                         rt->dst.input = ip6_pkt_discard;
2613                         break;
2614                 }
2615                 goto install_route;
2616         }
2617
2618         if (cfg->fc_flags & RTF_GATEWAY) {
2619                 const struct in6_addr *gw_addr;
2620                 int gwa_type;
2621
2622                 gw_addr = &cfg->fc_gateway;
2623                 gwa_type = ipv6_addr_type(gw_addr);
2624
2625                 /* if gw_addr is local we will fail to detect this in case
2626                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
2627                  * will return already-added prefix route via interface that
2628                  * prefix route was assigned to, which might be non-loopback.
2629                  */
2630                 err = -EINVAL;
2631                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2632                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2633                                             dev : NULL, 0, 0)) {
2634                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2635                         goto out;
2636                 }
2637                 rt->rt6i_gateway = *gw_addr;
2638
2639                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2640                         struct rt6_info *grt = NULL;
2641
2642                         /* IPv6 strictly inhibits using not link-local
2643                            addresses as nexthop address.
2644                            Otherwise, router will not able to send redirects.
2645                            It is very good, but in some (rare!) circumstances
2646                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2647                            some exceptions. --ANK
2648                            We allow IPv4-mapped nexthops to support RFC4798-type
2649                            addressing
2650                          */
2651                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2652                                           IPV6_ADDR_MAPPED))) {
2653                                 NL_SET_ERR_MSG(extack,
2654                                                "Invalid gateway address");
2655                                 goto out;
2656                         }
2657
2658                         if (cfg->fc_table) {
2659                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2660
2661                                 if (grt) {
2662                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2663                                             (dev && dev != grt->dst.dev)) {
2664                                                 ip6_rt_put(grt);
2665                                                 grt = NULL;
2666                                         }
2667                                 }
2668                         }
2669
2670                         if (!grt)
2671                                 grt = rt6_lookup(net, gw_addr, NULL,
2672                                                  cfg->fc_ifindex, 1);
2673
2674                         err = -EHOSTUNREACH;
2675                         if (!grt)
2676                                 goto out;
2677                         if (dev) {
2678                                 if (dev != grt->dst.dev) {
2679                                         ip6_rt_put(grt);
2680                                         goto out;
2681                                 }
2682                         } else {
2683                                 dev = grt->dst.dev;
2684                                 idev = grt->rt6i_idev;
2685                                 dev_hold(dev);
2686                                 in6_dev_hold(grt->rt6i_idev);
2687                         }
2688                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2689                                 err = 0;
2690                         ip6_rt_put(grt);
2691
2692                         if (err)
2693                                 goto out;
2694                 }
2695                 err = -EINVAL;
2696                 if (!dev) {
2697                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2698                         goto out;
2699                 } else if (dev->flags & IFF_LOOPBACK) {
2700                         NL_SET_ERR_MSG(extack,
2701                                        "Egress device can not be loopback device for this route");
2702                         goto out;
2703                 }
2704         }
2705
2706         err = -ENODEV;
2707         if (!dev)
2708                 goto out;
2709
2710         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2711                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2712                         NL_SET_ERR_MSG(extack, "Invalid source address");
2713                         err = -EINVAL;
2714                         goto out;
2715                 }
2716                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2717                 rt->rt6i_prefsrc.plen = 128;
2718         } else
2719                 rt->rt6i_prefsrc.plen = 0;
2720
2721         rt->rt6i_flags = cfg->fc_flags;
2722
2723 install_route:
2724         rt->dst.dev = dev;
2725         rt->rt6i_idev = idev;
2726         rt->rt6i_table = table;
2727
2728         cfg->fc_nlinfo.nl_net = dev_net(dev);
2729
2730         return rt;
2731 out:
2732         if (dev)
2733                 dev_put(dev);
2734         if (idev)
2735                 in6_dev_put(idev);
2736         if (rt)
2737                 dst_release_immediate(&rt->dst);
2738
2739         return ERR_PTR(err);
2740 }
2741
/* Create a route from @cfg and insert it into the FIB.
 *
 * Builds the rt6_info via ip6_route_info_create(), converts the netlink
 * metrics into an mx6_config and inserts the route with __ip6_ins_rt().
 * Returns 0 on success or a negative errno; on failure before insertion
 * the half-constructed route is released here.
 */
int ip6_route_add(struct fib6_config *cfg,
		  struct netlink_ext_ack *extack)
{
	struct mx6_config mxc = { .mx = NULL, };
	struct rt6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg, extack);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;	/* nothing to release in the error path */
		goto out;
	}

	err = ip6_convert_metrics(&mxc, cfg);
	if (err)
		goto out;

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);

	/* __ip6_ins_rt() consumed rt; only the metrics buffer is ours */
	kfree(mxc.mx);

	return err;
out:
	if (rt)
		dst_release_immediate(&rt->dst);

	return err;
}
2771
/* Remove a single route from its FIB table.
 *
 * Refuses to delete the per-net null entry.  fib6_del() runs under the
 * table lock; the caller's reference on @rt is always consumed.
 * Returns 0 or a negative errno.
 */
static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
{
	int err;
	struct fib6_table *table;
	struct net *net = dev_net(rt->dst.dev);

	if (rt == net->ipv6.ip6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	/* drop the reference the caller held on rt */
	ip6_rt_put(rt);
	return err;
}
2792
2793 int ip6_del_rt(struct rt6_info *rt)
2794 {
2795         struct nl_info info = {
2796                 .nl_net = dev_net(rt->dst.dev),
2797         };
2798         return __ip6_del_rt(rt, &info);
2799 }
2800
/* Delete @rt and, when cfg->fc_delete_all_nh is set, all of its ECMP
 * siblings within a single table-lock section.
 *
 * A combined RTM_DELROUTE notification covering every hop is built
 * before deletion; if that succeeds, per-route notifications are
 * suppressed via info->skip_notify and the combined message is sent
 * once the routes are gone.  The caller's reference on @rt is always
 * consumed.  Returns 0 or a negative errno from fib6_del().
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.ip6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				/* fall back to per-route notifications */
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
2852
/* Delete the route(s) matching @cfg from the configured table.
 *
 * With RTF_CACHE the candidate is resolved through the cached-clone
 * (exception) table via rt6_find_cached_rt().  A candidate must also
 * match any ifindex, gateway, metric and protocol given in @cfg.
 * When a gateway was specified only that single hop is removed;
 * otherwise siblings may be removed too (__ip6_del_rt_siblings()).
 * Returns 0 on success, -ESRCH if nothing matched.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_table *table;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (!rt_cache)
					continue;
				rt = rt_cache;
			}
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
			/* lost a race with route teardown: give up */
			if (!dst_hold_safe(&rt->dst))
				break;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
2909
/* Handle a received ICMPv6 Redirect for the path cached in @dst.
 *
 * Validates the message (length, non-multicast destination, link-local
 * unicast target unless on-link, parsable ND options), confirms the
 * current neighbour, updates the neighbour cache from the target
 * link-layer address option, then installs a cached clone of @dst
 * pointing at the new first hop and announces it via the netevent
 * notifier chain.  Silently drops anything that fails validation.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* dest == target means the destination itself is on-link */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* routers don't accept redirects; neither do opted-out hosts */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * takes care of it
	 */
	if (rt6_insert_exception(nrt, rt)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	neigh_release(neigh);
}
3027
3028 /*
3029  *      Misc support functions
3030  */
3031
/* Chain @rt to @from so it shares (and pins) @from's metrics. */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	/* a route that is itself derived must not become a "from" */
	BUG_ON(from->dst.from);

	rt->rt6i_flags &= ~RTF_EXPIRES;
	/* hold @from for as long as rt->dst.from points at it */
	dst_hold(&from->dst);
	rt->dst.from = &from->dst;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
3041
/* Initialise clone @rt from its parent route @ort, taking the needed
 * references (idev, lwtstate, and the parent itself via rt6_set_from()).
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	/* copy flags first: rt6_set_from() below clears RTF_EXPIRES */
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
3063
3064 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an RA-learned route-information route: it must match @prefix,
 * be installed via @dev toward @gwaddr, and carry both RTF_ROUTEINFO
 * and RTF_GATEWAY.  Returns a held route or NULL.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->dst.dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		/* take a reference before leaving the RCU section */
		ip6_hold_safe(NULL, &rt, false);
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3099
3100 static struct rt6_info *rt6_add_route_info(struct net *net,
3101                                            const struct in6_addr *prefix, int prefixlen,
3102                                            const struct in6_addr *gwaddr,
3103                                            struct net_device *dev,
3104                                            unsigned int pref)
3105 {
3106         struct fib6_config cfg = {
3107                 .fc_metric      = IP6_RT_PRIO_USER,
3108                 .fc_ifindex     = dev->ifindex,
3109                 .fc_dst_len     = prefixlen,
3110                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3111                                   RTF_UP | RTF_PREF(pref),
3112                 .fc_protocol = RTPROT_RA,
3113                 .fc_nlinfo.portid = 0,
3114                 .fc_nlinfo.nlh = NULL,
3115                 .fc_nlinfo.nl_net = net,
3116         };
3117
3118         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3119         cfg.fc_dst = *prefix;
3120         cfg.fc_gateway = *gwaddr;
3121
3122         /* We should treat it as a default route if prefix length is 0. */
3123         if (!prefixlen)
3124                 cfg.fc_flags |= RTF_DEFAULT;
3125
3126         ip6_route_add(&cfg, NULL);
3127
3128         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3129 }
3130 #endif
3131
/* Find the RA-installed default route (RTF_ADDRCONF|RTF_DEFAULT)
 * through @dev with gateway @addr in the device's default-route table.
 * Returns a held route or NULL.
 */
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->dst.dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	/* rt is NULL when the walk fell off the end without a match */
	if (rt)
		ip6_hold_safe(NULL, &rt, false);
	rcu_read_unlock();
	return rt;
}
3154
/* Install a default route learned from a router advertisement and, on
 * success, mark the table as holding an RA default router so it is
 * visited by rt6_purge_dflt_routers().  Returns the (held) route as
 * re-looked-up by rt6_get_dflt_router(), or NULL.
 */
struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = dev_net(dev),
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, NULL)) {
		struct fib6_table *table;

		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(gwaddr, dev);
}
3183
/* Remove from @table every route flagged RTF_DEFAULT or RTF_ADDRCONF,
 * unless the route's interface is configured to keep RA routes
 * (accept_ra == 2).
 *
 * ip6_del_rt() cannot run under rcu_read_lock(), so each hit drops the
 * read lock, deletes the route (when the reference was won) and
 * restarts the walk from the top of the tree.
 */
static void __rt6_purge_dflt_routers(struct fib6_table *table)
{
	struct rt6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			if (dst_hold_safe(&rt->dst)) {
				rcu_read_unlock();
				ip6_del_rt(rt);
			} else {
				/* route already being freed; just restart */
				rcu_read_unlock();
			}
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3206
3207 void rt6_purge_dflt_routers(struct net *net)
3208 {
3209         struct fib6_table *table;
3210         struct hlist_head *head;
3211         unsigned int h;
3212
3213         rcu_read_lock();
3214
3215         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3216                 head = &net->ipv6.fib_table_hash[h];
3217                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3218                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3219                                 __rt6_purge_dflt_routers(table);
3220                 }
3221         }
3222
3223         rcu_read_unlock();
3224 }
3225
3226 static void rtmsg_to_fib6_config(struct net *net,
3227                                  struct in6_rtmsg *rtmsg,
3228                                  struct fib6_config *cfg)
3229 {
3230         memset(cfg, 0, sizeof(*cfg));
3231
3232         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3233                          : RT6_TABLE_MAIN;
3234         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3235         cfg->fc_metric = rtmsg->rtmsg_metric;
3236         cfg->fc_expires = rtmsg->rtmsg_info;
3237         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3238         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3239         cfg->fc_flags = rtmsg->rtmsg_flags;
3240
3241         cfg->fc_nlinfo.nl_net = net;
3242
3243         cfg->fc_dst = rtmsg->rtmsg_dst;
3244         cfg->fc_src = rtmsg->rtmsg_src;
3245         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3246 }
3247
/* Handle the legacy SIOCADDRT/SIOCDELRT route ioctls.
 *
 * Requires CAP_NET_ADMIN in the netns's user namespace, copies the
 * userspace in6_rtmsg, translates it to a fib6_config and performs the
 * add/delete under RTNL.  Any other cmd returns -EINVAL.
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		/* copy_from_user() returns the number of bytes NOT copied */
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg, NULL);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg, NULL);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}
3284
3285 /*
3286  *      Drop the packet on the floor
3287  */
3288
/* Account a packet that had no route, send an ICMPv6 destination
 * unreachable with @code, and free the skb.
 *
 * Input packets with an unspecified (::) destination are counted as
 * IPSTATS_MIB_INADDRERRORS instead of the no-route counter.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3311
/* dst.input handler for discard routes: drop with "no route" ICMP error. */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3316
3317 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3318 {
3319         skb->dev = skb_dst(skb)->dev;
3320         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3321 }
3322
/* dst.input handler for prohibit routes: drop with "administratively
 * prohibited" ICMP error.
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3327
3328 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3329 {
3330         skb->dev = skb_dst(skb)->dev;
3331         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3332 }
3333
3334 /*
3335  *      Allocate a dst for local (unicast / anycast) address.
3336  */
3337
/* Build a host route (/128) delivering @addr locally on @idev's device.
 *
 * The route is flagged RTF_ANYCAST or RTF_LOCAL according to @anycast
 * and placed in the device's local table.  Takes a reference on @idev.
 * Returns the route or ERR_PTR(-ENOMEM).  Note the caller is expected
 * to insert the returned route into the FIB.
 */
struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    bool anycast)
{
	u32 tb_id;
	struct net *net = dev_net(idev->dev);
	struct net_device *dev = idev->dev;
	struct rt6_info *rt;

	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
	if (!rt)
		return ERR_PTR(-ENOMEM);

	in6_dev_hold(idev);

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;	/* deliver locally */
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;

	rt->rt6i_protocol = RTPROT_KERNEL;
	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
		rt->rt6i_flags |= RTF_LOCAL;

	rt->rt6i_gateway  = *addr;
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	rt->rt6i_table = fib6_get_table(net, tb_id);

	return rt;
}
3373
3374 /* remove deleted ip from prefsrc entries */
/* Argument bundle for fib6_remove_prefsrc() walks. */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to routes on this device (NULL = any) */
	struct net *net;	/* namespace whose tables are walked */
	struct in6_addr *addr;	/* preferred source address being removed */
};
3380
3381 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3382 {
3383         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3384         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3385         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3386
3387         if (((void *)rt->dst.dev == dev || !dev) &&
3388             rt != net->ipv6.ip6_null_entry &&
3389             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3390                 spin_lock_bh(&rt6_exception_lock);
3391                 /* remove prefsrc entry */
3392                 rt->rt6i_prefsrc.plen = 0;
3393                 /* need to update cache as well */
3394                 rt6_exceptions_remove_prefsrc(rt);
3395                 spin_unlock_bh(&rt6_exception_lock);
3396         }
3397         return 0;
3398 }
3399
/* Walk every FIB table in @ifp's namespace and scrub @ifp's address from
 * the prefsrc of routes that reference it (the address is going away).
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
3410
3411 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3412
/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	/* Only select routes carrying *all* RTF_RA_ROUTER bits
	 * (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY), i.e. default routes
	 * learnt from a Router Advertisement — hence the "== RTF_RA_ROUTER"
	 * comparison instead of a plain non-zero test of the masked value.
	 * The -1 return flags the entry for removal by the fib walker.
	 */
	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}
3431
/* Called when @gateway stops acting as a router: walk all FIB tables and
 * let fib6_clean_tohost() drop RA-learnt default routes through it and
 * purge matching cached exception routes.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3436
/* Argument bundle for fib6_ifdown() walks. */
struct arg_dev_net {
	struct net_device *dev;	/* device going down (NULL = match all) */
	struct net *net;	/* namespace being cleaned */
};
3441
/* called with write lock held for table with rt */
static int fib6_ifdown(struct rt6_info *rt, void *arg)
{
	const struct arg_dev_net *adn = arg;
	const struct net_device *dev = adn->dev;

	/* Select (-1 => remove) routes on @dev, or every route when @dev is
	 * NULL, skipping the namespace null entry.  A multipath member
	 * (rt6i_nsiblings != 0) is spared when the admin opted to merely
	 * ignore routes on link-down devices — unless the device is really
	 * being unregistered, in which case it must go regardless.
	 */
	if ((rt->dst.dev == dev || !dev) &&
	    rt != adn->net->ipv6.ip6_null_entry &&
	    (rt->rt6i_nsiblings == 0 ||
	     (dev && netdev_unregistering(dev)) ||
	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
		return -1;

	return 0;
}
3457
/* Purge FIB routes referencing @dev (all devices when @dev is NULL) and,
 * for a real device, also flush its entries from the uncached dst list.
 */
void rt6_ifdown(struct net *net, struct net_device *dev)
{
	struct arg_dev_net adn = {
		.dev = dev,
		.net = net,
	};

	fib6_clean_all(net, fib6_ifdown, &adn);
	if (dev)
		rt6_uncached_list_flush_dev(net, dev);
}
3469
/* Argument bundle for rt6_mtu_change_route() walks. */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
3474
/* fib6_clean_all() callback: propagate the device MTU change described
 * by @p_arg into the RTAX_MTU metric of matching routes and into their
 * cached exception routes.  Always returns 0 (never removes a route).
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	   PMTU discovery.
	 */
	if (rt->dst.dev == arg->dev &&
	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		/* rt6_exception_lock serializes the metric update with the
		 * exception-table PMTU update below.
		 */
		spin_lock_bh(&rt6_exception_lock);
		if (dst_mtu(&rt->dst) >= arg->mtu ||
		    (dst_mtu(&rt->dst) < arg->mtu &&
		     dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		}
		rt6_exceptions_update_pmtu(rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
3518
/* Propagate a device MTU change to all routes in @dev's namespace. */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
3528
/* Netlink attribute validation policy for IPv6 RTM_*ROUTE requests,
 * consumed by nlmsg_parse() in rtm_to_fib6_config().
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
	[RTA_ENCAP]             = { .type = NLA_NESTED },
	[RTA_EXPIRES]           = { .type = NLA_U32 },
	[RTA_UID]               = { .type = NLA_U32 },
	[RTA_MARK]              = { .type = NLA_U32 },
};
3543
/* Translate an RTM_{NEW,DEL}ROUTE netlink request (@skb/@nlh) into a
 * fib6_config (@cfg).  Returns 0 on success or a negative errno when
 * attribute parsing or validation fails.  Note: fc_mx/fc_mp point into
 * the request message, they are not copies.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* reject-type routes: blackhole/prohibit/throw/unreachable */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* only rtm_dst_len bits of the address need be supplied */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE overrides the (8-bit) rtm_table field */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* unknown router-preference values fall back to medium */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* an infinite timeout simply leaves RTF_EXPIRES unset */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
3667
/* One pending nexthop while building a multipath route. */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route to insert; NULLed once consumed */
	struct fib6_config r_cfg;	/* per-nexthop config (for error unwind) */
	struct mx6_config mxc;		/* converted metrics for this nexthop */
	struct list_head next;		/* link in the rt6_nh_list */
};
3674
/* Warn, one line per nexthop, when a multipath route replace failed
 * part-way: the installed state may now be inconsistent.
 */
static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
{
	struct rt6_nh *nh;

	list_for_each_entry(nh, rt6_nh_list, next) {
		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
			&nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
			nh->r_cfg.fc_ifindex);
	}
}
3685
/* Append @rt to @rt6_nh_list unless an equivalent nexthop is already
 * queued.  On success the new list entry holds @rt (the caller must not
 * release it); on failure (-EEXIST, -ENOMEM, or a metrics-conversion
 * error) @rt is untouched and the caller keeps ownership.
 */
static int ip6_route_info_append(struct list_head *rt6_nh_list,
				 struct rt6_info *rt, struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if rt6_info already exists */
		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->rt6_info = rt;
	err = ip6_convert_metrics(&nh->mxc, r_cfg);
	if (err) {
		kfree(nh);
		return err;
	}
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}
3712
3713 static void ip6_route_mpath_notify(struct rt6_info *rt,
3714                                    struct rt6_info *rt_last,
3715                                    struct nl_info *info,
3716                                    __u16 nlflags)
3717 {
3718         /* if this is an APPEND route, then rt points to the first route
3719          * inserted and rt_last points to last route inserted. Userspace
3720          * wants a consistent dump of the route which starts at the first
3721          * nexthop. Since sibling routes are always added at the end of
3722          * the list, find the first sibling of the last route appended
3723          */
3724         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3725                 rt = list_first_entry(&rt_last->rt6i_siblings,
3726                                       struct rt6_info,
3727                                       rt6i_siblings);
3728         }
3729
3730         if (rt)
3731                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3732 }
3733
/* Install every nexthop of an RTA_MULTIPATH request as sibling routes.
 * All nexthops are parsed and queued on rt6_nh_list first, so a parse
 * failure leaves the FIB untouched; if an insert fails part-way, the
 * nexthops already added are deleted again and notifications stay
 * coherent.  Returns 0 on success or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* per-nexthop config starts as a copy of the route config */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		rt = ip6_route_info_create(&r_cfg, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
		if (err) {
			/* append failed: we still own rt, release it */
			dst_release_immediate(&rt->dst);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->rt6_info;
		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->rt6_info;

		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->rt6_info)
			dst_release_immediate(&nh->rt6_info->dst);
		kfree(nh->mxc.mx);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
3864
/* Delete each nexthop listed in an RTA_MULTIPATH attribute as an
 * individual route.  All nexthops are attempted; the error from the
 * last failing deletion is returned (0 if every deletion succeeded).
 */
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}
3902
3903 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3904                               struct netlink_ext_ack *extack)
3905 {
3906         struct fib6_config cfg;
3907         int err;
3908
3909         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3910         if (err < 0)
3911                 return err;
3912
3913         if (cfg.fc_mp)
3914                 return ip6_route_multipath_del(&cfg, extack);
3915         else {
3916                 cfg.fc_delete_all_nh = 1;
3917                 return ip6_route_del(&cfg, extack);
3918         }
3919 }
3920
3921 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3922                               struct netlink_ext_ack *extack)
3923 {
3924         struct fib6_config cfg;
3925         int err;
3926
3927         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3928         if (err < 0)
3929                 return err;
3930
3931         if (cfg.fc_mp)
3932                 return ip6_route_multipath_add(&cfg, extack);
3933         else
3934                 return ip6_route_add(&cfg, extack);
3935 }
3936
3937 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3938 {
3939         int nexthop_len = 0;
3940
3941         if (rt->rt6i_nsiblings) {
3942                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
3943                             + NLA_ALIGN(sizeof(struct rtnexthop))
3944                             + nla_total_size(16) /* RTA_GATEWAY */
3945                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
3946
3947                 nexthop_len *= rt->rt6i_nsiblings;
3948         }
3949
3950         return NLMSG_ALIGN(sizeof(struct rtmsg))
3951                + nla_total_size(16) /* RTA_SRC */
3952                + nla_total_size(16) /* RTA_DST */
3953                + nla_total_size(16) /* RTA_GATEWAY */
3954                + nla_total_size(16) /* RTA_PREFSRC */
3955                + nla_total_size(4) /* RTA_TABLE */
3956                + nla_total_size(4) /* RTA_IIF */
3957                + nla_total_size(4) /* RTA_OIF */
3958                + nla_total_size(4) /* RTA_PRIORITY */
3959                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3960                + nla_total_size(sizeof(struct rta_cacheinfo))
3961                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3962                + nla_total_size(1) /* RTA_PREF */
3963                + lwtunnel_get_encap_size(rt->dst.lwtstate)
3964                + nexthop_len;
3965 }
3966
/* Emit @rt's nexthop attributes (RTA_GATEWAY, RTA_OIF, encap) into @skb
 * and merge RTNH_F_LINKDOWN/DEAD/OFFLOAD state into *@flags.  @skip_oif
 * suppresses RTA_OIF for multipath encoding, where the rtnexthop struct
 * already carries the ifindex.  Returns 0 or -EMSGSIZE.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
		*flags |= RTNH_F_LINKDOWN;
		/* report dead only when link-down routes are being ignored */
		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			*flags |= RTNH_F_DEAD;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	}

	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;

	if (rt->dst.lwtstate &&
	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
3998
/* add multipath next hop */
static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
{
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	/* reserve the rtnexthop header first; its length is only known
	 * once the nested attributes have been written
	 */
	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = 0;
	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;

	/* skip_oif=true: the ifindex above replaces RTA_OIF */
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4025
/* Fill one routing netlink message for @rt into @skb.  @dst and @src,
 * when non-NULL, are the concrete addresses the answer is about and
 * force full-length (/128) RTA_DST/RTA_SRC attributes; @iif, when set,
 * is reported as the input interface.  Returns 0 on success or
 * -EMSGSIZE, cancelling the partially built message.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	/* map the dst error of reject routes back to the route type
	 * userspace configured
	 */
	if (rt->rt6i_flags & RTF_REJECT) {
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_flags & RTF_ANYCAST)
		rtm->rtm_type = RTN_ANYCAST;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are resolved by the mroute code;
		 * 0 means ip6mr filled the message itself
		 */
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* report the per-route PMTU (if any) instead of the raw metric */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4179
4180 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4181 {
4182         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4183         struct net *net = arg->net;
4184
4185         if (rt == net->ipv6.ip6_null_entry)
4186                 return 0;
4187
4188         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4189                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4190
4191                 /* user wants prefix routes only */
4192                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4193                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4194                         /* success since this is not a prefix route */
4195                         return 1;
4196                 }
4197         }
4198
4199         return rt6_fill_node(net,
4200                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4201                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4202                      NLM_F_MULTI);
4203 }
4204
/* RTM_GETROUTE handler: resolve a single IPv6 route for userspace.
 *
 * Builds a flow from the netlink attributes, performs either an
 * input-path lookup (RTA_IIF present) or an output lookup, and unicasts
 * the resulting route back to the requester.  With RTM_F_FIB_MATCH set
 * the raw FIB entry is returned instead of the resolved dst.
 * Returns 0 on success or a negative errno.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                              struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	/* Undersized address attributes are rejected with -EINVAL. */
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	/* Without an explicit RTA_UID, forwarded lookups (iif set) use
	 * INVALID_UID; locally originated ones use the caller's uid.
	 */
	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		/* RCU protects the device between index lookup and the
		 * route lookup that uses it.
		 */
		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		if (!fibmatch)
			dst = ip6_route_input_lookup(net, dev, &fl6, flags);
		else
			dst = ip6_route_lookup(net, &fl6, 0);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		if (!fibmatch)
			dst = ip6_route_output(net, NULL, &fl6);
		else
			dst = ip6_route_lookup(net, &fl6, 0);
	}


	/* From here on we hold a reference on rt; every error path must
	 * drop it via ip6_rt_put().
	 */
	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* skb now owns the rt reference; kfree_skb() releases it. */
	skb_dst_set(skb, &rt->dst);
	if (fibmatch)
		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
4330
4331 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4332                      unsigned int nlm_flags)
4333 {
4334         struct sk_buff *skb;
4335         struct net *net = info->nl_net;
4336         u32 seq;
4337         int err;
4338
4339         err = -ENOBUFS;
4340         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4341
4342         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4343         if (!skb)
4344                 goto errout;
4345
4346         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4347                                 event, info->portid, seq, nlm_flags);
4348         if (err < 0) {
4349                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4350                 WARN_ON(err == -EMSGSIZE);
4351                 kfree_skb(skb);
4352                 goto errout;
4353         }
4354         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4355                     info->nlh, gfp_any());
4356         return;
4357 errout:
4358         if (err < 0)
4359                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4360 }
4361
4362 static int ip6_route_dev_notify(struct notifier_block *this,
4363                                 unsigned long event, void *ptr)
4364 {
4365         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4366         struct net *net = dev_net(dev);
4367
4368         if (!(dev->flags & IFF_LOOPBACK))
4369                 return NOTIFY_OK;
4370
4371         if (event == NETDEV_REGISTER) {
4372                 net->ipv6.ip6_null_entry->dst.dev = dev;
4373                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4374 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4375                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4376                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4377                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4378                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4379 #endif
4380          } else if (event == NETDEV_UNREGISTER &&
4381                     dev->reg_state != NETREG_UNREGISTERED) {
4382                 /* NETDEV_UNREGISTER could be fired for multiple times by
4383                  * netdev_wait_allrefs(). Make sure we only call this once.
4384                  */
4385                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4386 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4387                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4388                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4389 #endif
4390         }
4391
4392         return NOTIFY_OK;
4393 }
4394
4395 /*
4396  *      /proc
4397  */
4398
4399 #ifdef CONFIG_PROC_FS
4400
/* /proc/net/ipv6_route: seq_file dump of the routing table.
 * NOTE(review): ipv6_route_open is defined earlier in this file,
 * outside this excerpt.
 */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
4408
4409 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4410 {
4411         struct net *net = (struct net *)seq->private;
4412         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4413                    net->ipv6.rt6_stats->fib_nodes,
4414                    net->ipv6.rt6_stats->fib_route_nodes,
4415                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4416                    net->ipv6.rt6_stats->fib_rt_entries,
4417                    net->ipv6.rt6_stats->fib_rt_cache,
4418                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4419                    net->ipv6.rt6_stats->fib_discarded_routes);
4420
4421         return 0;
4422 }
4423
/* Open handler for /proc/net/rt6_stats: single-shot, netns-aware. */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
4428
/* /proc/net/rt6_stats file operations (paired with single_open_net in
 * rt6_stats_seq_open, hence single_release_net here).
 */
static const struct file_operations rt6_stats_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = rt6_stats_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release_net,
};
4436 #endif  /* CONFIG_PROC_FS */
4437
4438 #ifdef CONFIG_SYSCTL
4439
4440 static
4441 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4442                               void __user *buffer, size_t *lenp, loff_t *ppos)
4443 {
4444         struct net *net;
4445         int delay;
4446         if (!write)
4447                 return -EINVAL;
4448
4449         net = (struct net *)ctl->extra1;
4450         delay = net->ipv6.sysctl.flush_delay;
4451         proc_dointvec(ctl, write, buffer, lenp, ppos);
4452         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4453         return 0;
4454 }
4455
/* Template for the per-namespace net.ipv6.route.* sysctl table.
 * ipv6_route_sysctl_init() kmemdup()s this and patches the .data
 * pointers by index, so the entry ORDER here must stay in sync with
 * the table[0]..table[9] assignments in that function.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		/* Write-only trigger; see ipv6_sysctl_rtcache_flush(). */
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* Millisecond view of the same gc_min_interval value. */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
4529
4530 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4531 {
4532         struct ctl_table *table;
4533
4534         table = kmemdup(ipv6_route_table_template,
4535                         sizeof(ipv6_route_table_template),
4536                         GFP_KERNEL);
4537
4538         if (table) {
4539                 table[0].data = &net->ipv6.sysctl.flush_delay;
4540                 table[0].extra1 = net;
4541                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4542                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4543                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4544                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4545                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4546                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4547                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4548                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4549                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4550
4551                 /* Don't export sysctls to unprivileged users */
4552                 if (net->user_ns != &init_user_ns)
4553                         table[0].procname = NULL;
4554         }
4555
4556         return table;
4557 }
4558 #endif
4559
/* Per-namespace route subsystem setup: clone the template dst_ops and
 * the special null (plus, with multiple tables, prohibit/blackhole)
 * route entries, then seed the routing sysctl defaults.  Failures
 * unwind through the goto chain at the bottom.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* Each netns gets private copies of the template routes; their
	 * dst.path points back at the entry itself.
	 */
	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default sysctl values; tunable via net.ipv6.route.*. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwind: free in reverse order of allocation. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
4632
4633 static void __net_exit ip6_route_net_exit(struct net *net)
4634 {
4635         kfree(net->ipv6.ip6_null_entry);
4636 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4637         kfree(net->ipv6.ip6_prohibit_entry);
4638         kfree(net->ipv6.ip6_blk_hole_entry);
4639 #endif
4640         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4641 }
4642
4643 static int __net_init ip6_route_net_init_late(struct net *net)
4644 {
4645 #ifdef CONFIG_PROC_FS
4646         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4647         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4648 #endif
4649         return 0;
4650 }
4651
/* Late per-netns teardown: remove the /proc/net entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
4659
/* Core per-netns setup/teardown for the IPv6 routing subsystem. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
4664
4665 static int __net_init ipv6_inetpeer_init(struct net *net)
4666 {
4667         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4668
4669         if (!bp)
4670                 return -ENOMEM;
4671         inet_peer_base_init(bp);
4672         net->ipv6.peers = bp;
4673         return 0;
4674 }
4675
/* Tear down this namespace's IPv6 inetpeer base.
 * NOTE(review): net->ipv6.peers is cleared before the tree is
 * invalidated — presumably so nothing dereferences a dying base;
 * preserve this ordering.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
4684
/* Per-netns lifetime of the IPv6 inetpeer base. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
4689
/* Late-stage per-netns ops: /proc entries only. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
4694
/* Loopback register/unregister hook for the special routes.  Priority
 * is 10 below addrconf's — NOTE(review): presumably so this runs after
 * addrconf's notifier; confirm against ADDRCONF_NOTIFY_PRIORITY users.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
4699
4700 void __init ip6_route_init_special_entries(void)
4701 {
4702         /* Registering of the loopback is done before this portion of code,
4703          * the loopback reference in rt6_info will not be taken, do it
4704          * manually for init_net */
4705         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4706         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4707   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4708         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4709         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4710         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4711         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4712   #endif
4713 }
4714
/* Boot-time initialisation of the IPv6 routing subsystem: dst slab
 * cache, pernet subsystems, FIB, xfrm, policy rules, rtnetlink handlers
 * and the netdevice notifier.  On any failure, everything registered so
 * far is unwound in reverse order via the goto chain.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts share the same slab as regular rt6_info. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
			    RTNL_FLAG_DOIT_UNLOCKED))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Per-CPU lists of uncached (DST_NOCACHE) routes. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwind: reverse order of the registrations above. */
out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
4796
/* Full teardown of the IPv6 routing subsystem, in reverse order of
 * ip6_route_init().
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}