cb3366d5e165519d7ae098ef525873324a1f376c
[linux-2.6-microblaze.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
66
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 enum rt6_nud_state {
74         RT6_NUD_FAIL_HARD = -3,
75         RT6_NUD_FAIL_PROBE = -2,
76         RT6_NUD_FAIL_DO_RR = -1,
77         RT6_NUD_SUCCEED = 1
78 };
79
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void             ip6_dst_destroy(struct dst_entry *);
86 static void             ip6_dst_ifdown(struct dst_entry *,
87                                        struct net_device *dev, int how);
88 static int               ip6_dst_gc(struct dst_ops *ops);
89
90 static int              ip6_pkt_discard(struct sk_buff *skb);
91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96                                            struct sk_buff *skb, u32 mtu);
97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98                                         struct sk_buff *skb);
99 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101
102 #ifdef CONFIG_IPV6_ROUTE_INFO
103 static struct rt6_info *rt6_add_route_info(struct net *net,
104                                            const struct in6_addr *prefix, int prefixlen,
105                                            const struct in6_addr *gwaddr,
106                                            struct net_device *dev,
107                                            unsigned int pref);
108 static struct rt6_info *rt6_get_route_info(struct net *net,
109                                            const struct in6_addr *prefix, int prefixlen,
110                                            const struct in6_addr *gwaddr,
111                                            struct net_device *dev);
112 #endif
113
114 struct uncached_list {
115         spinlock_t              lock;
116         struct list_head        head;
117 };
118
119 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
120
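/* Uncached routes (DST_NOCACHE, e.g. the RTF_CACHE clones created in
 * ip6_pol_route() that are never linked into the fib6 tree) are kept on a
 * per-cpu list so that rt6_uncached_list_flush_dev() can re-point them at
 * the loopback device when their egress device is unregistered, instead of
 * leaving them with a dangling dev/idev reference.
 */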
121 static void rt6_uncached_list_add(struct rt6_info *rt)
122 {
123         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
124
125         rt->dst.flags |= DST_NOCACHE;
126         rt->rt6i_uncached_list = ul;
127
128         spin_lock_bh(&ul->lock);
129         list_add_tail(&rt->rt6i_uncached, &ul->head);
130         spin_unlock_bh(&ul->lock);
131 }
132
133 static void rt6_uncached_list_del(struct rt6_info *rt)
134 {
135         if (!list_empty(&rt->rt6i_uncached)) {
136                 struct uncached_list *ul = rt->rt6i_uncached_list;
137
138                 spin_lock_bh(&ul->lock);
139                 list_del(&rt->rt6i_uncached);
140                 spin_unlock_bh(&ul->lock);
141         }
142 }
143
144 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
145 {
146         struct net_device *loopback_dev = net->loopback_dev;
147         int cpu;
148
149         if (dev == loopback_dev)
150                 return;
151
152         for_each_possible_cpu(cpu) {
153                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
154                 struct rt6_info *rt;
155
156                 spin_lock_bh(&ul->lock);
157                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
158                         struct inet6_dev *rt_idev = rt->rt6i_idev;
159                         struct net_device *rt_dev = rt->dst.dev;
160
161                         if (rt_idev->dev == dev) {
162                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
163                                 in6_dev_put(rt_idev);
164                         }
165
166                         if (rt_dev == dev) {
167                                 rt->dst.dev = loopback_dev;
168                                 dev_hold(rt->dst.dev);
169                                 dev_put(rt_dev);
170                         }
171                 }
172                 spin_unlock_bh(&ul->lock);
173         }
174 }
175
176 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
177 {
178         return dst_metrics_write_ptr(rt->dst.from);
179 }
180
181 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
182 {
183         struct rt6_info *rt = (struct rt6_info *)dst;
184
185         if (rt->rt6i_flags & RTF_PCPU)
186                 return rt6_pcpu_cow_metrics(rt);
187         else if (rt->rt6i_flags & RTF_CACHE)
188                 return NULL;
189         else
190                 return dst_cow_metrics_generic(dst, old);
191 }
192
193 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
194                                              struct sk_buff *skb,
195                                              const void *daddr)
196 {
197         struct in6_addr *p = &rt->rt6i_gateway;
198
199         if (!ipv6_addr_any(p))
200                 return (const void *) p;
201         else if (skb)
202                 return &ipv6_hdr(skb)->daddr;
203         return daddr;
204 }
205
206 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
207                                           struct sk_buff *skb,
208                                           const void *daddr)
209 {
210         struct rt6_info *rt = (struct rt6_info *) dst;
211         struct neighbour *n;
212
213         daddr = choose_neigh_daddr(rt, skb, daddr);
214         n = __ipv6_neigh_lookup(dst->dev, daddr);
215         if (n)
216                 return n;
217         return neigh_create(&nd_tbl, daddr, dst->dev);
218 }
219
220 static struct dst_ops ip6_dst_ops_template = {
221         .family                 =       AF_INET6,
222         .gc                     =       ip6_dst_gc,
223         .gc_thresh              =       1024,
224         .check                  =       ip6_dst_check,
225         .default_advmss         =       ip6_default_advmss,
226         .mtu                    =       ip6_mtu,
227         .cow_metrics            =       ipv6_cow_metrics,
228         .destroy                =       ip6_dst_destroy,
229         .ifdown                 =       ip6_dst_ifdown,
230         .negative_advice        =       ip6_negative_advice,
231         .link_failure           =       ip6_link_failure,
232         .update_pmtu            =       ip6_rt_update_pmtu,
233         .redirect               =       rt6_do_redirect,
234         .local_out              =       __ip6_local_out,
235         .neigh_lookup           =       ip6_neigh_lookup,
236 };
237
238 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
239 {
240         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
241
242         return mtu ? : dst->dev->mtu;
243 }
244
245 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
246                                          struct sk_buff *skb, u32 mtu)
247 {
248 }
249
250 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
251                                       struct sk_buff *skb)
252 {
253 }
254
255 static struct dst_ops ip6_dst_blackhole_ops = {
256         .family                 =       AF_INET6,
257         .destroy                =       ip6_dst_destroy,
258         .check                  =       ip6_dst_check,
259         .mtu                    =       ip6_blackhole_mtu,
260         .default_advmss         =       ip6_default_advmss,
261         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
262         .redirect               =       ip6_rt_blackhole_redirect,
263         .cow_metrics            =       dst_cow_metrics_generic,
264         .neigh_lookup           =       ip6_neigh_lookup,
265 };
266
267 static const u32 ip6_template_metrics[RTAX_MAX] = {
268         [RTAX_HOPLIMIT - 1] = 0,
269 };
270
271 static const struct rt6_info ip6_null_entry_template = {
272         .dst = {
273                 .__refcnt       = ATOMIC_INIT(1),
274                 .__use          = 1,
275                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
276                 .error          = -ENETUNREACH,
277                 .input          = ip6_pkt_discard,
278                 .output         = ip6_pkt_discard_out,
279         },
280         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
281         .rt6i_protocol  = RTPROT_KERNEL,
282         .rt6i_metric    = ~(u32) 0,
283         .rt6i_ref       = ATOMIC_INIT(1),
284 };
285
286 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
287
288 static const struct rt6_info ip6_prohibit_entry_template = {
289         .dst = {
290                 .__refcnt       = ATOMIC_INIT(1),
291                 .__use          = 1,
292                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
293                 .error          = -EACCES,
294                 .input          = ip6_pkt_prohibit,
295                 .output         = ip6_pkt_prohibit_out,
296         },
297         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
298         .rt6i_protocol  = RTPROT_KERNEL,
299         .rt6i_metric    = ~(u32) 0,
300         .rt6i_ref       = ATOMIC_INIT(1),
301 };
302
303 static const struct rt6_info ip6_blk_hole_entry_template = {
304         .dst = {
305                 .__refcnt       = ATOMIC_INIT(1),
306                 .__use          = 1,
307                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
308                 .error          = -EINVAL,
309                 .input          = dst_discard,
310                 .output         = dst_discard_out,
311         },
312         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
313         .rt6i_protocol  = RTPROT_KERNEL,
314         .rt6i_metric    = ~(u32) 0,
315         .rt6i_ref       = ATOMIC_INIT(1),
316 };
317
318 #endif
319
320 static void rt6_info_init(struct rt6_info *rt)
321 {
322         struct dst_entry *dst = &rt->dst;
323
324         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
325         INIT_LIST_HEAD(&rt->rt6i_siblings);
326         INIT_LIST_HEAD(&rt->rt6i_uncached);
327 }
328
329 /* allocate dst with ip6_dst_ops */
330 static struct rt6_info *__ip6_dst_alloc(struct net *net,
331                                         struct net_device *dev,
332                                         int flags)
333 {
334         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
335                                         0, DST_OBSOLETE_FORCE_CHK, flags);
336
337         if (rt)
338                 rt6_info_init(rt);
339
340         return rt;
341 }
342
343 struct rt6_info *ip6_dst_alloc(struct net *net,
344                                struct net_device *dev,
345                                int flags)
346 {
347         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
348
349         if (rt) {
350                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
351                 if (rt->rt6i_pcpu) {
352                         int cpu;
353
354                         for_each_possible_cpu(cpu) {
355                                 struct rt6_info **p;
356
357                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
358                                 /* no one shares rt */
359                                 *p =  NULL;
360                         }
361                 } else {
362                         dst_destroy((struct dst_entry *)rt);
363                         return NULL;
364                 }
365         }
366
367         return rt;
368 }
369 EXPORT_SYMBOL(ip6_dst_alloc);
370
371 static void ip6_dst_destroy(struct dst_entry *dst)
372 {
373         struct rt6_info *rt = (struct rt6_info *)dst;
374         struct dst_entry *from = dst->from;
375         struct inet6_dev *idev;
376
377         dst_destroy_metrics_generic(dst);
378         free_percpu(rt->rt6i_pcpu);
379         rt6_uncached_list_del(rt);
380
381         idev = rt->rt6i_idev;
382         if (idev) {
383                 rt->rt6i_idev = NULL;
384                 in6_dev_put(idev);
385         }
386
387         dst->from = NULL;
388         dst_release(from);
389 }
390
391 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
392                            int how)
393 {
394         struct rt6_info *rt = (struct rt6_info *)dst;
395         struct inet6_dev *idev = rt->rt6i_idev;
396         struct net_device *loopback_dev =
397                 dev_net(dev)->loopback_dev;
398
399         if (dev != loopback_dev) {
400                 if (idev && idev->dev == dev) {
401                         struct inet6_dev *loopback_idev =
402                                 in6_dev_get(loopback_dev);
403                         if (loopback_idev) {
404                                 rt->rt6i_idev = loopback_idev;
405                                 in6_dev_put(idev);
406                         }
407                 }
408         }
409 }
410
411 static bool __rt6_check_expired(const struct rt6_info *rt)
412 {
413         if (rt->rt6i_flags & RTF_EXPIRES)
414                 return time_after(jiffies, rt->dst.expires);
415         else
416                 return false;
417 }
418
419 static bool rt6_check_expired(const struct rt6_info *rt)
420 {
421         if (rt->rt6i_flags & RTF_EXPIRES) {
422                 if (time_after(jiffies, rt->dst.expires))
423                         return true;
424         } else if (rt->dst.from) {
425                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
426         }
427         return false;
428 }
429
430 /* Multipath route selection:
431  *   Hash-based function using packet header and flowlabel.
432  * Adapted from fib_info_hashfn()
433  */
434 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
435                                const struct flowi6 *fl6)
436 {
437         return get_hash_from_flowi6(fl6) % candidate_count;
438 }
439
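/* Example: with rt6i_nsiblings == 2 there are three equal-cost nexthops in
 * total, so the flow hash is reduced modulo 3.  A result of 0 keeps the
 * route that matched the lookup, 1 or 2 walks that many entries down the
 * sibling list.  Because the value is derived from the flow key, all
 * packets of a given flow keep selecting the same nexthop.
 */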
440 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
441                                              struct flowi6 *fl6, int oif,
442                                              int strict)
443 {
444         struct rt6_info *sibling, *next_sibling;
445         int route_chosen;
446
447         route_chosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
448         /* Don't change the route if route_chosen == 0
449          * (the sibling list does not include the matched route itself)
450          */
451         if (route_chosen)
452                 list_for_each_entry_safe(sibling, next_sibling,
453                                 &match->rt6i_siblings, rt6i_siblings) {
454                         route_chosen--;
455                         if (route_chosen == 0) {
456                                 if (rt6_score_route(sibling, oif, strict) < 0)
457                                         break;
458                                 match = sibling;
459                                 break;
460                         }
461                 }
462         return match;
463 }
464
465 /*
466  *      Route lookup. The relevant table->tb6_lock is assumed to be held by the caller.
467  */
468
469 static inline struct rt6_info *rt6_device_match(struct net *net,
470                                                     struct rt6_info *rt,
471                                                     const struct in6_addr *saddr,
472                                                     int oif,
473                                                     int flags)
474 {
475         struct rt6_info *local = NULL;
476         struct rt6_info *sprt;
477
478         if (!oif && ipv6_addr_any(saddr))
479                 goto out;
480
481         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
482                 struct net_device *dev = sprt->dst.dev;
483
484                 if (oif) {
485                         if (dev->ifindex == oif)
486                                 return sprt;
487                         if (dev->flags & IFF_LOOPBACK) {
488                                 if (!sprt->rt6i_idev ||
489                                     sprt->rt6i_idev->dev->ifindex != oif) {
490                                         if (flags & RT6_LOOKUP_F_IFACE)
491                                                 continue;
492                                         if (local &&
493                                             local->rt6i_idev->dev->ifindex == oif)
494                                                 continue;
495                                 }
496                                 local = sprt;
497                         }
498                 } else {
499                         if (ipv6_chk_addr(net, saddr, dev,
500                                           flags & RT6_LOOKUP_F_IFACE))
501                                 return sprt;
502                 }
503         }
504
505         if (oif) {
506                 if (local)
507                         return local;
508
509                 if (flags & RT6_LOOKUP_F_IFACE)
510                         return net->ipv6.ip6_null_entry;
511         }
512 out:
513         return rt;
514 }
515
516 #ifdef CONFIG_IPV6_ROUTER_PREF
517 struct __rt6_probe_work {
518         struct work_struct work;
519         struct in6_addr target;
520         struct net_device *dev;
521 };
522
523 static void rt6_probe_deferred(struct work_struct *w)
524 {
525         struct in6_addr mcaddr;
526         struct __rt6_probe_work *work =
527                 container_of(w, struct __rt6_probe_work, work);
528
529         addrconf_addr_solict_mult(&work->target, &mcaddr);
530         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
531         dev_put(work->dev);
532         kfree(work);
533 }
534
535 static void rt6_probe(struct rt6_info *rt)
536 {
537         struct __rt6_probe_work *work;
538         struct neighbour *neigh;
539         /*
540          * This may not be the ideal place for it, but we need to verify
541          * that the next-hop router is actually reachable; aka Router
542          * Reachability Probing (RFC 4191).
543          *
544          * Router Reachability Probe MUST be rate-limited
545          * to no more than one per minute.
546          */
547         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
548                 return;
549         rcu_read_lock_bh();
550         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
551         if (neigh) {
552                 if (neigh->nud_state & NUD_VALID)
553                         goto out;
554
555                 work = NULL;
556                 write_lock(&neigh->lock);
557                 if (!(neigh->nud_state & NUD_VALID) &&
558                     time_after(jiffies,
559                                neigh->updated +
560                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
561                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
562                         if (work)
563                                 __neigh_set_probe_once(neigh);
564                 }
565                 write_unlock(&neigh->lock);
566         } else {
567                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
568         }
569
570         if (work) {
571                 INIT_WORK(&work->work, rt6_probe_deferred);
572                 work->target = rt->rt6i_gateway;
573                 dev_hold(rt->dst.dev);
574                 work->dev = rt->dst.dev;
575                 schedule_work(&work->work);
576         }
577
578 out:
579         rcu_read_unlock_bh();
580 }
581 #else
582 static inline void rt6_probe(struct rt6_info *rt)
583 {
584 }
585 #endif
586
587 /*
588  * Default Router Selection (RFC 2461 6.3.6)
589  */
590 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
591 {
592         struct net_device *dev = rt->dst.dev;
593         if (!oif || dev->ifindex == oif)
594                 return 2;
595         if ((dev->flags & IFF_LOOPBACK) &&
596             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
597                 return 1;
598         return 0;
599 }
600
601 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
602 {
603         struct neighbour *neigh;
604         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
605
606         if (rt->rt6i_flags & RTF_NONEXTHOP ||
607             !(rt->rt6i_flags & RTF_GATEWAY))
608                 return RT6_NUD_SUCCEED;
609
610         rcu_read_lock_bh();
611         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
612         if (neigh) {
613                 read_lock(&neigh->lock);
614                 if (neigh->nud_state & NUD_VALID)
615                         ret = RT6_NUD_SUCCEED;
616 #ifdef CONFIG_IPV6_ROUTER_PREF
617                 else if (!(neigh->nud_state & NUD_FAILED))
618                         ret = RT6_NUD_SUCCEED;
619                 else
620                         ret = RT6_NUD_FAIL_PROBE;
621 #endif
622                 read_unlock(&neigh->lock);
623         } else {
624                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
625                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
626         }
627         rcu_read_unlock_bh();
628
629         return ret;
630 }
631
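/* rt6_score_route() composition: rt6_check_dev() contributes 2 when the
 * route's device matches oif (or no oif was given) and 1 for a loopback
 * route whose idev matches oif; the decoded router preference from RAs is
 * folded in above that (<< 2), so with an equal interface score a
 * high-preference router wins.  Negative return values are the
 * rt6_nud_state failure codes from rt6_check_neigh().
 */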
632 static int rt6_score_route(struct rt6_info *rt, int oif,
633                            int strict)
634 {
635         int m;
636
637         m = rt6_check_dev(rt, oif);
638         if (!m && (strict & RT6_LOOKUP_F_IFACE))
639                 return RT6_NUD_FAIL_HARD;
640 #ifdef CONFIG_IPV6_ROUTER_PREF
641         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
642 #endif
643         if (strict & RT6_LOOKUP_F_REACHABLE) {
644                 int n = rt6_check_neigh(rt);
645                 if (n < 0)
646                         return n;
647         }
648         return m;
649 }
650
651 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
652                                    int *mpri, struct rt6_info *match,
653                                    bool *do_rr)
654 {
655         int m;
656         bool match_do_rr = false;
657         struct inet6_dev *idev = rt->rt6i_idev;
658         struct net_device *dev = rt->dst.dev;
659
660         if (dev && !netif_carrier_ok(dev) &&
661             idev->cnf.ignore_routes_with_linkdown &&
662             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
663                 goto out;
664
665         if (rt6_check_expired(rt))
666                 goto out;
667
668         m = rt6_score_route(rt, oif, strict);
669         if (m == RT6_NUD_FAIL_DO_RR) {
670                 match_do_rr = true;
671                 m = 0; /* lowest valid score */
672         } else if (m == RT6_NUD_FAIL_HARD) {
673                 goto out;
674         }
675
676         if (strict & RT6_LOOKUP_F_REACHABLE)
677                 rt6_probe(rt);
678
679         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
680         if (m > *mpri) {
681                 *do_rr = match_do_rr;
682                 *mpri = m;
683                 match = rt;
684         }
685 out:
686         return match;
687 }
688
689 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
690                                      struct rt6_info *rr_head,
691                                      u32 metric, int oif, int strict,
692                                      bool *do_rr)
693 {
694         struct rt6_info *rt, *match, *cont;
695         int mpri = -1;
696
697         match = NULL;
698         cont = NULL;
699         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
700                 if (rt->rt6i_metric != metric) {
701                         cont = rt;
702                         break;
703                 }
704
705                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
706         }
707
708         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
709                 if (rt->rt6i_metric != metric) {
710                         cont = rt;
711                         break;
712                 }
713
714                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
715         }
716
717         if (match || !cont)
718                 return match;
719
720         for (rt = cont; rt; rt = rt->dst.rt6_next)
721                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
722
723         return match;
724 }
725
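/* rt6_select() scans the routes that share fn->rr_ptr's metric, starting
 * at fn->rr_ptr.  When the best candidate only scored RT6_NUD_FAIL_DO_RR
 * (neighbour state unknown), rr_ptr is advanced to the next route of the
 * same metric (wrapping to fn->leaf), which implements the round-robin
 * over default routers described in the changelog at the top of this file.
 */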
726 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
727 {
728         struct rt6_info *match, *rt0;
729         struct net *net;
730         bool do_rr = false;
731
732         rt0 = fn->rr_ptr;
733         if (!rt0)
734                 fn->rr_ptr = rt0 = fn->leaf;
735
736         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
737                              &do_rr);
738
739         if (do_rr) {
740                 struct rt6_info *next = rt0->dst.rt6_next;
741
742                 /* no entries matched; do round-robin */
743                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
744                         next = fn->leaf;
745
746                 if (next != rt0)
747                         fn->rr_ptr = next;
748         }
749
750         net = dev_net(rt0->dst.dev);
751         return match ? match : net->ipv6.ip6_null_entry;
752 }
753
754 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
755 {
756         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
757 }
758
759 #ifdef CONFIG_IPV6_ROUTE_INFO
760 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
761                   const struct in6_addr *gwaddr)
762 {
763         struct net *net = dev_net(dev);
764         struct route_info *rinfo = (struct route_info *) opt;
765         struct in6_addr prefix_buf, *prefix;
766         unsigned int pref;
767         unsigned long lifetime;
768         struct rt6_info *rt;
769
770         if (len < sizeof(struct route_info)) {
771                 return -EINVAL;
772         }
773
774         /* Sanity check for prefix_len and length */
775         if (rinfo->length > 3) {
776                 return -EINVAL;
777         } else if (rinfo->prefix_len > 128) {
778                 return -EINVAL;
779         } else if (rinfo->prefix_len > 64) {
780                 if (rinfo->length < 2) {
781                         return -EINVAL;
782                 }
783         } else if (rinfo->prefix_len > 0) {
784                 if (rinfo->length < 1) {
785                         return -EINVAL;
786                 }
787         }
788
789         pref = rinfo->route_pref;
790         if (pref == ICMPV6_ROUTER_PREF_INVALID)
791                 return -EINVAL;
792
793         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
794
795         if (rinfo->length == 3)
796                 prefix = (struct in6_addr *)rinfo->prefix;
797         else {
798                 /* safe: ipv6_addr_prefix() copies only prefix_len bits and zeroes the rest */
799                 ipv6_addr_prefix(&prefix_buf,
800                                  (struct in6_addr *)rinfo->prefix,
801                                  rinfo->prefix_len);
802                 prefix = &prefix_buf;
803         }
804
805         if (rinfo->prefix_len == 0)
806                 rt = rt6_get_dflt_router(gwaddr, dev);
807         else
808                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
809                                         gwaddr, dev);
810
811         if (rt && !lifetime) {
812                 ip6_del_rt(rt);
813                 rt = NULL;
814         }
815
816         if (!rt && lifetime)
817                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
818                                         dev, pref);
819         else if (rt)
820                 rt->rt6i_flags = RTF_ROUTEINFO |
821                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
822
823         if (rt) {
824                 if (!addrconf_finite_timeout(lifetime))
825                         rt6_clean_expires(rt);
826                 else
827                         rt6_set_expires(rt, jiffies + HZ * lifetime);
828
829                 ip6_rt_put(rt);
830         }
831         return 0;
832 }
833 #endif
834
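/* fib6_backtrack() walks back up towards the table root after a dead-end
 * lookup: if the parent node owns a source-address subtree that we did not
 * come from, the subtree is searched with saddr, otherwise the walk simply
 * moves to the parent.  It stops at the first ancestor that actually
 * carries routes (RTN_RTINFO) and returns NULL once the tree root is hit.
 */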
835 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
836                                         struct in6_addr *saddr)
837 {
838         struct fib6_node *pn;
839         while (1) {
840                 if (fn->fn_flags & RTN_TL_ROOT)
841                         return NULL;
842                 pn = fn->parent;
843                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
844                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
845                 else
846                         fn = pn;
847                 if (fn->fn_flags & RTN_RTINFO)
848                         return fn;
849         }
850 }
851
852 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
853                                              struct fib6_table *table,
854                                              struct flowi6 *fl6, int flags)
855 {
856         struct fib6_node *fn;
857         struct rt6_info *rt;
858
859         read_lock_bh(&table->tb6_lock);
860         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
861 restart:
862         rt = fn->leaf;
863         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
864         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
865                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
866         if (rt == net->ipv6.ip6_null_entry) {
867                 fn = fib6_backtrack(fn, &fl6->saddr);
868                 if (fn)
869                         goto restart;
870         }
871         dst_use(&rt->dst, jiffies);
872         read_unlock_bh(&table->tb6_lock);
873
874         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
875
876         return rt;
877
878 }
879
880 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
881                                     int flags)
882 {
883         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
884 }
885 EXPORT_SYMBOL_GPL(ip6_route_lookup);
886
887 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
888                             const struct in6_addr *saddr, int oif, int strict)
889 {
890         struct flowi6 fl6 = {
891                 .flowi6_oif = oif,
892                 .daddr = *daddr,
893         };
894         struct dst_entry *dst;
895         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
896
897         if (saddr) {
898                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
899                 flags |= RT6_LOOKUP_F_HAS_SADDR;
900         }
901
902         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
903         if (dst->error == 0)
904                 return (struct rt6_info *) dst;
905
906         dst_release(dst);
907
908         return NULL;
909 }
910 EXPORT_SYMBOL(rt6_lookup);
911
912 /* ip6_ins_rt is called with table->tb6_lock NOT held; it takes the lock
913    itself. It consumes the new route entry: if the addition fails for any
914    reason the route is freed, and even on success it may be destroyed
915    unless the caller holds its own reference.
916  */
917
918 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
919                         struct mx6_config *mxc)
920 {
921         int err;
922         struct fib6_table *table;
923
924         table = rt->rt6i_table;
925         write_lock_bh(&table->tb6_lock);
926         err = fib6_add(&table->tb6_root, rt, info, mxc);
927         write_unlock_bh(&table->tb6_lock);
928
929         return err;
930 }
931
932 int ip6_ins_rt(struct rt6_info *rt)
933 {
934         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
935         struct mx6_config mxc = { .mx = NULL, };
936
937         return __ip6_ins_rt(rt, &info, &mxc);
938 }
939
940 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
941                                            const struct in6_addr *daddr,
942                                            const struct in6_addr *saddr)
943 {
944         struct rt6_info *rt;
945
946         /*
947          *      Clone the route.
948          */
949
950         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
951                 ort = (struct rt6_info *)ort->dst.from;
952
953         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
954
955         if (!rt)
956                 return NULL;
957
958         ip6_rt_copy_init(rt, ort);
959         rt->rt6i_flags |= RTF_CACHE;
960         rt->rt6i_metric = 0;
961         rt->dst.flags |= DST_HOST;
962         rt->rt6i_dst.addr = *daddr;
963         rt->rt6i_dst.plen = 128;
964
965         if (!rt6_is_gw_or_nonexthop(ort)) {
966                 if (ort->rt6i_dst.plen != 128 &&
967                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
968                         rt->rt6i_flags |= RTF_ANYCAST;
969 #ifdef CONFIG_IPV6_SUBTREES
970                 if (rt->rt6i_src.plen && saddr) {
971                         rt->rt6i_src.addr = *saddr;
972                         rt->rt6i_src.plen = 128;
973                 }
974 #endif
975         }
976
977         return rt;
978 }
979
980 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
981 {
982         struct rt6_info *pcpu_rt;
983
984         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
985                                   rt->dst.dev, rt->dst.flags);
986
987         if (!pcpu_rt)
988                 return NULL;
989         ip6_rt_copy_init(pcpu_rt, rt);
990         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
991         pcpu_rt->rt6i_flags |= RTF_PCPU;
992         return pcpu_rt;
993 }
994
995 /* It should be called with read_lock_bh(&tb6_lock) acquired */
996 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
997 {
998         struct rt6_info *pcpu_rt, **p;
999
1000         p = this_cpu_ptr(rt->rt6i_pcpu);
1001         pcpu_rt = *p;
1002
1003         if (pcpu_rt) {
1004                 dst_hold(&pcpu_rt->dst);
1005                 rt6_dst_from_metrics_check(pcpu_rt);
1006         }
1007         return pcpu_rt;
1008 }
1009
1010 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1011 {
1012         struct fib6_table *table = rt->rt6i_table;
1013         struct rt6_info *pcpu_rt, *prev, **p;
1014
1015         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1016         if (!pcpu_rt) {
1017                 struct net *net = dev_net(rt->dst.dev);
1018
1019                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1020                 return net->ipv6.ip6_null_entry;
1021         }
1022
1023         read_lock_bh(&table->tb6_lock);
1024         if (rt->rt6i_pcpu) {
1025                 p = this_cpu_ptr(rt->rt6i_pcpu);
1026                 prev = cmpxchg(p, NULL, pcpu_rt);
1027                 if (prev) {
1028                         /* If someone did it before us, return prev instead */
1029                         dst_destroy(&pcpu_rt->dst);
1030                         pcpu_rt = prev;
1031                 }
1032         } else {
1033                 /* rt has been removed from the fib6 tree
1034                  * before we have a chance to acquire the read_lock.
1035                  * In this case, don't bother to create a pcpu rt
1036                  * since rt is going away anyway.  The next
1037                  * dst_check() will trigger a re-lookup.
1038                  */
1039                 dst_destroy(&pcpu_rt->dst);
1040                 pcpu_rt = rt;
1041         }
1042         dst_hold(&pcpu_rt->dst);
1043         rt6_dst_from_metrics_check(pcpu_rt);
1044         read_unlock_bh(&table->tb6_lock);
1045         return pcpu_rt;
1046 }
1047
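/* ip6_pol_route() hands out three kinds of results: the null entry and
 * RTF_CACHE entries are returned with a plain reference bump; for
 * FLOWI_FLAG_KNOWN_NH lookups on non-gateway routes an uncached RTF_CACHE
 * clone is created and put on the uncached list (the neighbour daddr may
 * differ from fl6->daddr); everything else gets a per-cpu copy via
 * rt6_get_pcpu_route()/rt6_make_pcpu_route().
 */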
1048 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1049                                int oif, struct flowi6 *fl6, int flags)
1050 {
1051         struct fib6_node *fn, *saved_fn;
1052         struct rt6_info *rt;
1053         int strict = 0;
1054
1055         strict |= flags & RT6_LOOKUP_F_IFACE;
1056         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1057         if (net->ipv6.devconf_all->forwarding == 0)
1058                 strict |= RT6_LOOKUP_F_REACHABLE;
1059
1060         read_lock_bh(&table->tb6_lock);
1061
1062         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1063         saved_fn = fn;
1064
1065         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1066                 oif = 0;
1067
1068 redo_rt6_select:
1069         rt = rt6_select(fn, oif, strict);
1070         if (rt->rt6i_nsiblings)
1071                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1072         if (rt == net->ipv6.ip6_null_entry) {
1073                 fn = fib6_backtrack(fn, &fl6->saddr);
1074                 if (fn)
1075                         goto redo_rt6_select;
1076                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1077                         /* also consider unreachable route */
1078                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1079                         fn = saved_fn;
1080                         goto redo_rt6_select;
1081                 }
1082         }
1083
1084
1085         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1086                 dst_use(&rt->dst, jiffies);
1087                 read_unlock_bh(&table->tb6_lock);
1088
1089                 rt6_dst_from_metrics_check(rt);
1090
1091                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1092                 return rt;
1093         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1094                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1095                 /* Create a RTF_CACHE clone which will not be
1096                  * owned by the fib6 tree.  It is for the special case where
1097                  * the daddr in the skb during the neighbor look-up is different
1098                  * from the fl6->daddr used to look up the route here.
1099                  */
1100
1101                 struct rt6_info *uncached_rt;
1102
1103                 dst_use(&rt->dst, jiffies);
1104                 read_unlock_bh(&table->tb6_lock);
1105
1106                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1107                 dst_release(&rt->dst);
1108
1109                 if (uncached_rt)
1110                         rt6_uncached_list_add(uncached_rt);
1111                 else
1112                         uncached_rt = net->ipv6.ip6_null_entry;
1113
1114                 dst_hold(&uncached_rt->dst);
1115
1116                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1117                 return uncached_rt;
1118
1119         } else {
1120                 /* Get a percpu copy */
1121
1122                 struct rt6_info *pcpu_rt;
1123
1124                 rt->dst.lastuse = jiffies;
1125                 rt->dst.__use++;
1126                 pcpu_rt = rt6_get_pcpu_route(rt);
1127
1128                 if (pcpu_rt) {
1129                         read_unlock_bh(&table->tb6_lock);
1130                 } else {
1131                         /* We have to do the read_unlock first
1132                          * because rt6_make_pcpu_route() may trigger
1133                          * ip6_dst_gc() which will take the write_lock.
1134                          */
1135                         dst_hold(&rt->dst);
1136                         read_unlock_bh(&table->tb6_lock);
1137                         pcpu_rt = rt6_make_pcpu_route(rt);
1138                         dst_release(&rt->dst);
1139                 }
1140
1141                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1142                 return pcpu_rt;
1143
1144         }
1145 }
1146 EXPORT_SYMBOL_GPL(ip6_pol_route);
1147
1148 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1149                                             struct flowi6 *fl6, int flags)
1150 {
1151         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1152 }
1153
1154 struct dst_entry *ip6_route_input_lookup(struct net *net,
1155                                          struct net_device *dev,
1156                                          struct flowi6 *fl6, int flags)
1157 {
1158         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1159                 flags |= RT6_LOOKUP_F_IFACE;
1160
1161         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1162 }
1163 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1164
1165 void ip6_route_input(struct sk_buff *skb)
1166 {
1167         const struct ipv6hdr *iph = ipv6_hdr(skb);
1168         struct net *net = dev_net(skb->dev);
1169         int flags = RT6_LOOKUP_F_HAS_SADDR;
1170         struct ip_tunnel_info *tun_info;
1171         struct flowi6 fl6 = {
1172                 .flowi6_iif = skb->dev->ifindex,
1173                 .daddr = iph->daddr,
1174                 .saddr = iph->saddr,
1175                 .flowlabel = ip6_flowinfo(iph),
1176                 .flowi6_mark = skb->mark,
1177                 .flowi6_proto = iph->nexthdr,
1178         };
1179
1180         tun_info = skb_tunnel_info(skb);
1181         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1182                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1183         skb_dst_drop(skb);
1184         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1185 }
1186
1187 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1188                                              struct flowi6 *fl6, int flags)
1189 {
1190         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1191 }
1192
1193 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1194                                          struct flowi6 *fl6, int flags)
1195 {
1196         bool any_src;
1197
1198         if (rt6_need_strict(&fl6->daddr)) {
1199                 struct dst_entry *dst;
1200
1201                 dst = l3mdev_link_scope_lookup(net, fl6);
1202                 if (dst)
1203                         return dst;
1204         }
1205
1206         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1207
1208         any_src = ipv6_addr_any(&fl6->saddr);
1209         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1210             (fl6->flowi6_oif && any_src))
1211                 flags |= RT6_LOOKUP_F_IFACE;
1212
1213         if (!any_src)
1214                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1215         else if (sk)
1216                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1217
1218         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1219 }
1220 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1221
1222 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1223 {
1224         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1225         struct dst_entry *new = NULL;
1226
1227         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1228         if (rt) {
1229                 rt6_info_init(rt);
1230
1231                 new = &rt->dst;
1232                 new->__use = 1;
1233                 new->input = dst_discard;
1234                 new->output = dst_discard_out;
1235
1236                 dst_copy_metrics(new, &ort->dst);
1237                 rt->rt6i_idev = ort->rt6i_idev;
1238                 if (rt->rt6i_idev)
1239                         in6_dev_hold(rt->rt6i_idev);
1240
1241                 rt->rt6i_gateway = ort->rt6i_gateway;
1242                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1243                 rt->rt6i_metric = 0;
1244
1245                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1246 #ifdef CONFIG_IPV6_SUBTREES
1247                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1248 #endif
1249
1250                 dst_free(new);
1251         }
1252
1253         dst_release(dst_orig);
1254         return new ? new : ERR_PTR(-ENOMEM);
1255 }
1256
1257 /*
1258  *      Destination cache support functions
1259  */
1260
1261 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1262 {
1263         if (rt->dst.from &&
1264             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1265                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1266 }
1267
1268 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1269 {
1270         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1271                 return NULL;
1272
1273         if (rt6_check_expired(rt))
1274                 return NULL;
1275
1276         return &rt->dst;
1277 }
1278
1279 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1280 {
1281         if (!__rt6_check_expired(rt) &&
1282             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1283             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1284                 return &rt->dst;
1285         else
1286                 return NULL;
1287 }
1288
1289 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1290 {
1291         struct rt6_info *rt;
1292
1293         rt = (struct rt6_info *) dst;
1294
1295         /* All IPv6 dsts are created with ->obsolete set to the value
1296          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1297          * into this function always.
1298          */
1299
1300         rt6_dst_from_metrics_check(rt);
1301
1302         if (rt->rt6i_flags & RTF_PCPU ||
1303             (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
1304                 return rt6_dst_from_check(rt, cookie);
1305         else
1306                 return rt6_check(rt, cookie);
1307 }
1308
1309 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1310 {
1311         struct rt6_info *rt = (struct rt6_info *) dst;
1312
1313         if (rt) {
1314                 if (rt->rt6i_flags & RTF_CACHE) {
1315                         if (rt6_check_expired(rt)) {
1316                                 ip6_del_rt(rt);
1317                                 dst = NULL;
1318                         }
1319                 } else {
1320                         dst_release(dst);
1321                         dst = NULL;
1322                 }
1323         }
1324         return dst;
1325 }
1326
1327 static void ip6_link_failure(struct sk_buff *skb)
1328 {
1329         struct rt6_info *rt;
1330
1331         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1332
1333         rt = (struct rt6_info *) skb_dst(skb);
1334         if (rt) {
1335                 if (rt->rt6i_flags & RTF_CACHE) {
1336                         dst_hold(&rt->dst);
1337                         ip6_del_rt(rt);
1338                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1339                         rt->rt6i_node->fn_sernum = -1;
1340                 }
1341         }
1342 }
1343
1344 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1345 {
1346         struct net *net = dev_net(rt->dst.dev);
1347
1348         rt->rt6i_flags |= RTF_MODIFIED;
1349         rt->rt6i_pmtu = mtu;
1350         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1351 }
1352
1353 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1354 {
1355         return !(rt->rt6i_flags & RTF_CACHE) &&
1356                 (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
1357 }
1358
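/* PMTU updates: a route that is already an RTF_CACHE clone has rt6i_pmtu
 * updated in place, is marked RTF_MODIFIED and given an ip6_rt_mtu_expires
 * lifetime; for per-cpu copies and routes owned by the fib6 tree a /128
 * RTF_CACHE clone carrying the learned MTU is created and inserted instead,
 * so the shared entry itself is never modified.
 */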
1359 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1360                                  const struct ipv6hdr *iph, u32 mtu)
1361 {
1362         struct rt6_info *rt6 = (struct rt6_info *)dst;
1363
1364         if (rt6->rt6i_flags & RTF_LOCAL)
1365                 return;
1366
1367         if (dst_metric_locked(dst, RTAX_MTU))
1368                 return;
1369
1370         dst_confirm(dst);
1371         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1372         if (mtu >= dst_mtu(dst))
1373                 return;
1374
1375         if (!rt6_cache_allowed_for_pmtu(rt6)) {
1376                 rt6_do_update_pmtu(rt6, mtu);
1377         } else {
1378                 const struct in6_addr *daddr, *saddr;
1379                 struct rt6_info *nrt6;
1380
1381                 if (iph) {
1382                         daddr = &iph->daddr;
1383                         saddr = &iph->saddr;
1384                 } else if (sk) {
1385                         daddr = &sk->sk_v6_daddr;
1386                         saddr = &inet6_sk(sk)->saddr;
1387                 } else {
1388                         return;
1389                 }
1390                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1391                 if (nrt6) {
1392                         rt6_do_update_pmtu(nrt6, mtu);
1393
1394                         /* ip6_ins_rt(nrt6) will bump the
1395                          * rt6->rt6i_node->fn_sernum
1396                          * which will fail the next rt6_check() and
1397                          * invalidate the sk->sk_dst_cache.
1398                          */
1399                         ip6_ins_rt(nrt6);
1400                 }
1401         }
1402 }
1403
1404 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1405                                struct sk_buff *skb, u32 mtu)
1406 {
1407         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1408 }
1409
1410 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1411                      int oif, u32 mark, kuid_t uid)
1412 {
1413         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1414         struct dst_entry *dst;
1415         struct flowi6 fl6;
1416
1417         memset(&fl6, 0, sizeof(fl6));
1418         fl6.flowi6_oif = oif;
1419         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1420         fl6.daddr = iph->daddr;
1421         fl6.saddr = iph->saddr;
1422         fl6.flowlabel = ip6_flowinfo(iph);
1423         fl6.flowi6_uid = uid;
1424
1425         dst = ip6_route_output(net, NULL, &fl6);
1426         if (!dst->error)
1427                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1428         dst_release(dst);
1429 }
1430 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1431
1432 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1433 {
1434         struct dst_entry *dst;
1435
1436         ip6_update_pmtu(skb, sock_net(sk), mtu,
1437                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
1438
1439         dst = __sk_dst_get(sk);
1440         if (!dst || !dst->obsolete ||
1441             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1442                 return;
1443
1444         bh_lock_sock(sk);
1445         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1446                 ip6_datagram_dst_update(sk, false);
1447         bh_unlock_sock(sk);
1448 }
1449 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1450
1451 /* Handle redirects */
1452 struct ip6rd_flowi {
1453         struct flowi6 fl6;
1454         struct in6_addr gateway;
1455 };
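/* The flowi6 must stay the first member: ip6_route_redirect() passes
 * &rdfl.fl6 down the lookup path and __ip6_route_redirect() casts it back
 * to struct ip6rd_flowi to recover the gateway that sent the redirect.
 */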
1456
1457 static struct rt6_info *__ip6_route_redirect(struct net *net,
1458                                              struct fib6_table *table,
1459                                              struct flowi6 *fl6,
1460                                              int flags)
1461 {
1462         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1463         struct rt6_info *rt;
1464         struct fib6_node *fn;
1465
1466         /* Get the "current" route for this destination and
1467          * check if the redirect has come from the appropriate router.
1468          *
1469          * RFC 4861 specifies that redirects should only be
1470          * accepted if they come from the nexthop to the target.
1471          * Due to the way the routes are chosen, this notion
1472          * is a bit fuzzy and one might need to check all possible
1473          * routes.
1474          */
1475
1476         read_lock_bh(&table->tb6_lock);
1477         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1478 restart:
1479         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1480                 if (rt6_check_expired(rt))
1481                         continue;
1482                 if (rt->dst.error)
1483                         break;
1484                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1485                         continue;
1486                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1487                         continue;
1488                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1489                         continue;
1490                 break;
1491         }
1492
1493         if (!rt)
1494                 rt = net->ipv6.ip6_null_entry;
1495         else if (rt->dst.error) {
1496                 rt = net->ipv6.ip6_null_entry;
1497                 goto out;
1498         }
1499
1500         if (rt == net->ipv6.ip6_null_entry) {
1501                 fn = fib6_backtrack(fn, &fl6->saddr);
1502                 if (fn)
1503                         goto restart;
1504         }
1505
1506 out:
1507         dst_hold(&rt->dst);
1508
1509         read_unlock_bh(&table->tb6_lock);
1510
1511         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1512         return rt;
1513 }
1514
1515 static struct dst_entry *ip6_route_redirect(struct net *net,
1516                                         const struct flowi6 *fl6,
1517                                         const struct in6_addr *gateway)
1518 {
1519         int flags = RT6_LOOKUP_F_HAS_SADDR;
1520         struct ip6rd_flowi rdfl;
1521
1522         rdfl.fl6 = *fl6;
1523         rdfl.gateway = *gateway;
1524
1525         return fib6_rule_lookup(net, &rdfl.fl6,
1526                                 flags, __ip6_route_redirect);
1527 }
1528
1529 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
1530                   kuid_t uid)
1531 {
1532         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1533         struct dst_entry *dst;
1534         struct flowi6 fl6;
1535
1536         memset(&fl6, 0, sizeof(fl6));
1537         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1538         fl6.flowi6_oif = oif;
1539         fl6.flowi6_mark = mark;
1540         fl6.daddr = iph->daddr;
1541         fl6.saddr = iph->saddr;
1542         fl6.flowlabel = ip6_flowinfo(iph);
1543         fl6.flowi6_uid = uid;
1544
1545         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1546         rt6_do_redirect(dst, NULL, skb);
1547         dst_release(dst);
1548 }
1549 EXPORT_SYMBOL_GPL(ip6_redirect);
1550
1551 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1552                             u32 mark)
1553 {
1554         const struct ipv6hdr *iph = ipv6_hdr(skb);
1555         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1556         struct dst_entry *dst;
1557         struct flowi6 fl6;
1558
1559         memset(&fl6, 0, sizeof(fl6));
1560         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1561         fl6.flowi6_oif = oif;
1562         fl6.flowi6_mark = mark;
1563         fl6.daddr = msg->dest;
1564         fl6.saddr = iph->daddr;
1565         fl6.flowi6_uid = sock_net_uid(net, NULL);
1566
1567         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1568         rt6_do_redirect(dst, NULL, skb);
1569         dst_release(dst);
1570 }
1571
1572 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1573 {
1574         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
1575                      sk->sk_uid);
1576 }
1577 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1578
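/* Advertised MSS for this dst: the path MTU minus the IPv6 and TCP headers,
 * clamped below by the ip6_rt_min_advmss sysctl and capped at IPV6_MAXPLEN
 * (which tells the peer to rely on PMTU discovery alone).
 */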
1579 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1580 {
1581         struct net_device *dev = dst->dev;
1582         unsigned int mtu = dst_mtu(dst);
1583         struct net *net = dev_net(dev);
1584
1585         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1586
1587         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1588                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1589
1590         /*
1591          * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and the
1592          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1593          * Advertising IPV6_MAXPLEN itself is also valid and means: "any MSS,
1594          * rely only on PMTU discovery"
1595          */
1596         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1597                 mtu = IPV6_MAXPLEN;
1598         return mtu;
1599 }
1600
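/* Effective MTU for this dst, in order of preference: the cached rt6i_pmtu,
 * the RTAX_MTU metric, then the device's IPv6 MTU (mtu6); the result is
 * capped at IP6_MAX_MTU and reduced by any lwtunnel encap headroom.
 */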
1601 static unsigned int ip6_mtu(const struct dst_entry *dst)
1602 {
1603         const struct rt6_info *rt = (const struct rt6_info *)dst;
1604         unsigned int mtu = rt->rt6i_pmtu;
1605         struct inet6_dev *idev;
1606
1607         if (mtu)
1608                 goto out;
1609
1610         mtu = dst_metric_raw(dst, RTAX_MTU);
1611         if (mtu)
1612                 goto out;
1613
1614         mtu = IPV6_MIN_MTU;
1615
1616         rcu_read_lock();
1617         idev = __in6_dev_get(dst->dev);
1618         if (idev)
1619                 mtu = idev->cnf.mtu6;
1620         rcu_read_unlock();
1621
1622 out:
1623         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1624
1625         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1626 }
1627
1628 static struct dst_entry *icmp6_dst_gc_list;
1629 static DEFINE_SPINLOCK(icmp6_dst_lock);
1630
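/* Allocate an uncached host dst for ICMPv6 output.  The entry is chained on
 * icmp6_dst_gc_list and reclaimed by icmp6_dst_gc() once its refcount drops
 * to zero; fib6 garbage collection is kicked so that happens eventually.
 */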
1631 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1632                                   struct flowi6 *fl6)
1633 {
1634         struct dst_entry *dst;
1635         struct rt6_info *rt;
1636         struct inet6_dev *idev = in6_dev_get(dev);
1637         struct net *net = dev_net(dev);
1638
1639         if (unlikely(!idev))
1640                 return ERR_PTR(-ENODEV);
1641
1642         rt = ip6_dst_alloc(net, dev, 0);
1643         if (unlikely(!rt)) {
1644                 in6_dev_put(idev);
1645                 dst = ERR_PTR(-ENOMEM);
1646                 goto out;
1647         }
1648
1649         rt->dst.flags |= DST_HOST;
1650         rt->dst.output  = ip6_output;
1651         atomic_set(&rt->dst.__refcnt, 1);
1652         rt->rt6i_gateway  = fl6->daddr;
1653         rt->rt6i_dst.addr = fl6->daddr;
1654         rt->rt6i_dst.plen = 128;
1655         rt->rt6i_idev     = idev;
1656         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1657
1658         spin_lock_bh(&icmp6_dst_lock);
1659         rt->dst.next = icmp6_dst_gc_list;
1660         icmp6_dst_gc_list = &rt->dst;
1661         spin_unlock_bh(&icmp6_dst_lock);
1662
1663         fib6_force_start_gc(net);
1664
1665         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1666
1667 out:
1668         return dst;
1669 }
1670
1671 int icmp6_dst_gc(void)
1672 {
1673         struct dst_entry *dst, **pprev;
1674         int more = 0;
1675
1676         spin_lock_bh(&icmp6_dst_lock);
1677         pprev = &icmp6_dst_gc_list;
1678
1679         while ((dst = *pprev) != NULL) {
1680                 if (!atomic_read(&dst->__refcnt)) {
1681                         *pprev = dst->next;
1682                         dst_free(dst);
1683                 } else {
1684                         pprev = &dst->next;
1685                         ++more;
1686                 }
1687         }
1688
1689         spin_unlock_bh(&icmp6_dst_lock);
1690
1691         return more;
1692 }
1693
1694 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1695                             void *arg)
1696 {
1697         struct dst_entry *dst, **pprev;
1698
1699         spin_lock_bh(&icmp6_dst_lock);
1700         pprev = &icmp6_dst_gc_list;
1701         while ((dst = *pprev) != NULL) {
1702                 struct rt6_info *rt = (struct rt6_info *) dst;
1703                 if (func(rt, arg)) {
1704                         *pprev = dst->next;
1705                         dst_free(dst);
1706                 } else {
1707                         pprev = &dst->next;
1708                 }
1709         }
1710         spin_unlock_bh(&icmp6_dst_lock);
1711 }
1712
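/* dst_ops garbage collector.  The pass is skipped while we are within
 * ip6_rt_gc_min_interval of the last run and still below ip6_rt_max_size;
 * otherwise fib6_run_gc() runs with an expiry that grows on every pass and
 * decays according to ip6_rt_gc_elasticity.  Returns non-zero while the
 * number of entries still exceeds ip6_rt_max_size.
 */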
1713 static int ip6_dst_gc(struct dst_ops *ops)
1714 {
1715         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1716         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1717         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1718         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1719         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1720         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1721         int entries;
1722
1723         entries = dst_entries_get_fast(ops);
1724         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1725             entries <= rt_max_size)
1726                 goto out;
1727
1728         net->ipv6.ip6_rt_gc_expire++;
1729         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1730         entries = dst_entries_get_slow(ops);
1731         if (entries < ops->gc_thresh)
1732                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1733 out:
1734         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1735         return entries > rt_max_size;
1736 }
1737
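/* Convert the RTA_METRICS attributes from the route config into the RTAX_*
 * metrics array used by the dst layer.  RTAX_CC_ALGO is resolved by name via
 * tcp_ca_get_key_by_name(); an ECN-capable congestion algorithm additionally
 * sets DST_FEATURE_ECN_CA in RTAX_FEATURES.
 */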
1738 static int ip6_convert_metrics(struct mx6_config *mxc,
1739                                const struct fib6_config *cfg)
1740 {
1741         bool ecn_ca = false;
1742         struct nlattr *nla;
1743         int remaining;
1744         u32 *mp;
1745
1746         if (!cfg->fc_mx)
1747                 return 0;
1748
1749         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1750         if (unlikely(!mp))
1751                 return -ENOMEM;
1752
1753         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1754                 int type = nla_type(nla);
1755                 u32 val;
1756
1757                 if (!type)
1758                         continue;
1759                 if (unlikely(type > RTAX_MAX))
1760                         goto err;
1761
1762                 if (type == RTAX_CC_ALGO) {
1763                         char tmp[TCP_CA_NAME_MAX];
1764
1765                         nla_strlcpy(tmp, nla, sizeof(tmp));
1766                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1767                         if (val == TCP_CA_UNSPEC)
1768                                 goto err;
1769                 } else {
1770                         val = nla_get_u32(nla);
1771                 }
1772                 if (type == RTAX_HOPLIMIT && val > 255)
1773                         val = 255;
1774                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1775                         goto err;
1776
1777                 mp[type - 1] = val;
1778                 __set_bit(type - 1, mxc->mx_valid);
1779         }
1780
1781         if (ecn_ca) {
1782                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1783                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1784         }
1785
1786         mxc->mx = mp;
1787         return 0;
1788  err:
1789         kfree(mp);
1790         return -EINVAL;
1791 }
1792
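/* Resolve the nexthop gateway using only the table named in the route
 * config.  NULL (rather than the null entry) is returned on failure so the
 * caller can fall back to a full rt6_lookup().
 */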
1793 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1794                                             struct fib6_config *cfg,
1795                                             const struct in6_addr *gw_addr)
1796 {
1797         struct flowi6 fl6 = {
1798                 .flowi6_oif = cfg->fc_ifindex,
1799                 .daddr = *gw_addr,
1800                 .saddr = cfg->fc_prefsrc,
1801         };
1802         struct fib6_table *table;
1803         struct rt6_info *rt;
1804         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1805
1806         table = fib6_get_table(net, cfg->fc_table);
1807         if (!table)
1808                 return NULL;
1809
1810         if (!ipv6_addr_any(&cfg->fc_prefsrc))
1811                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1812
1813         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1814
1815         /* if table lookup failed, fall back to full lookup */
1816         if (rt == net->ipv6.ip6_null_entry) {
1817                 ip6_rt_put(rt);
1818                 rt = NULL;
1819         }
1820
1821         return rt;
1822 }
1823
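/* Build an rt6_info from a fib6_config without inserting it into the FIB:
 * validate the prefix lengths and output device, set up any lwtunnel encap
 * state, turn loopback/RTF_REJECT requests into the appropriate reject
 * dsts, and verify the gateway for RTF_GATEWAY routes.  The caller is
 * responsible for inserting or freeing the result.
 */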
1824 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
1825 {
1826         struct net *net = cfg->fc_nlinfo.nl_net;
1827         struct rt6_info *rt = NULL;
1828         struct net_device *dev = NULL;
1829         struct inet6_dev *idev = NULL;
1830         struct fib6_table *table;
1831         int addr_type;
1832         int err = -EINVAL;
1833
1834         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1835                 goto out;
1836 #ifndef CONFIG_IPV6_SUBTREES
1837         if (cfg->fc_src_len)
1838                 goto out;
1839 #endif
1840         if (cfg->fc_ifindex) {
1841                 err = -ENODEV;
1842                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1843                 if (!dev)
1844                         goto out;
1845                 idev = in6_dev_get(dev);
1846                 if (!idev)
1847                         goto out;
1848         }
1849
1850         if (cfg->fc_metric == 0)
1851                 cfg->fc_metric = IP6_RT_PRIO_USER;
1852
1853         err = -ENOBUFS;
1854         if (cfg->fc_nlinfo.nlh &&
1855             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1856                 table = fib6_get_table(net, cfg->fc_table);
1857                 if (!table) {
1858                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1859                         table = fib6_new_table(net, cfg->fc_table);
1860                 }
1861         } else {
1862                 table = fib6_new_table(net, cfg->fc_table);
1863         }
1864
1865         if (!table)
1866                 goto out;
1867
1868         rt = ip6_dst_alloc(net, NULL,
1869                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1870
1871         if (!rt) {
1872                 err = -ENOMEM;
1873                 goto out;
1874         }
1875
1876         if (cfg->fc_flags & RTF_EXPIRES)
1877                 rt6_set_expires(rt, jiffies +
1878                                 clock_t_to_jiffies(cfg->fc_expires));
1879         else
1880                 rt6_clean_expires(rt);
1881
1882         if (cfg->fc_protocol == RTPROT_UNSPEC)
1883                 cfg->fc_protocol = RTPROT_BOOT;
1884         rt->rt6i_protocol = cfg->fc_protocol;
1885
1886         addr_type = ipv6_addr_type(&cfg->fc_dst);
1887
1888         if (addr_type & IPV6_ADDR_MULTICAST)
1889                 rt->dst.input = ip6_mc_input;
1890         else if (cfg->fc_flags & RTF_LOCAL)
1891                 rt->dst.input = ip6_input;
1892         else
1893                 rt->dst.input = ip6_forward;
1894
1895         rt->dst.output = ip6_output;
1896
1897         if (cfg->fc_encap) {
1898                 struct lwtunnel_state *lwtstate;
1899
1900                 err = lwtunnel_build_state(cfg->fc_encap_type,
1901                                            cfg->fc_encap, AF_INET6, cfg,
1902                                            &lwtstate);
1903                 if (err)
1904                         goto out;
1905                 rt->dst.lwtstate = lwtstate_get(lwtstate);
1906                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1907                         rt->dst.lwtstate->orig_output = rt->dst.output;
1908                         rt->dst.output = lwtunnel_output;
1909                 }
1910                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1911                         rt->dst.lwtstate->orig_input = rt->dst.input;
1912                         rt->dst.input = lwtunnel_input;
1913                 }
1914         }
1915
1916         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1917         rt->rt6i_dst.plen = cfg->fc_dst_len;
1918         if (rt->rt6i_dst.plen == 128)
1919                 rt->dst.flags |= DST_HOST;
1920
1921 #ifdef CONFIG_IPV6_SUBTREES
1922         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1923         rt->rt6i_src.plen = cfg->fc_src_len;
1924 #endif
1925
1926         rt->rt6i_metric = cfg->fc_metric;
1927
1928         /* We cannot add true routes via loopback here;
1929            they would result in kernel looping, so promote them to reject routes.
1930          */
1931         if ((cfg->fc_flags & RTF_REJECT) ||
1932             (dev && (dev->flags & IFF_LOOPBACK) &&
1933              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1934              !(cfg->fc_flags & RTF_LOCAL))) {
1935                 /* hold loopback dev/idev if we haven't done so. */
1936                 if (dev != net->loopback_dev) {
1937                         if (dev) {
1938                                 dev_put(dev);
1939                                 in6_dev_put(idev);
1940                         }
1941                         dev = net->loopback_dev;
1942                         dev_hold(dev);
1943                         idev = in6_dev_get(dev);
1944                         if (!idev) {
1945                                 err = -ENODEV;
1946                                 goto out;
1947                         }
1948                 }
1949                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1950                 switch (cfg->fc_type) {
1951                 case RTN_BLACKHOLE:
1952                         rt->dst.error = -EINVAL;
1953                         rt->dst.output = dst_discard_out;
1954                         rt->dst.input = dst_discard;
1955                         break;
1956                 case RTN_PROHIBIT:
1957                         rt->dst.error = -EACCES;
1958                         rt->dst.output = ip6_pkt_prohibit_out;
1959                         rt->dst.input = ip6_pkt_prohibit;
1960                         break;
1961                 case RTN_THROW:
1962                 case RTN_UNREACHABLE:
1963                 default:
1964                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1965                                         : (cfg->fc_type == RTN_UNREACHABLE)
1966                                         ? -EHOSTUNREACH : -ENETUNREACH;
1967                         rt->dst.output = ip6_pkt_discard_out;
1968                         rt->dst.input = ip6_pkt_discard;
1969                         break;
1970                 }
1971                 goto install_route;
1972         }
1973
1974         if (cfg->fc_flags & RTF_GATEWAY) {
1975                 const struct in6_addr *gw_addr;
1976                 int gwa_type;
1977
1978                 gw_addr = &cfg->fc_gateway;
1979                 gwa_type = ipv6_addr_type(gw_addr);
1980
1981                 /* if gw_addr is local we may fail to detect that here while the
1982                  * address is still TENTATIVE (DAD in progress): rt6_lookup()
1983                  * will return the already-added prefix route via the interface
1984                  * the prefix route was assigned to, which might be non-loopback.
1985                  */
1986                 err = -EINVAL;
1987                 if (ipv6_chk_addr_and_flags(net, gw_addr,
1988                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
1989                                             dev : NULL, 0, 0))
1990                         goto out;
1991
1992                 rt->rt6i_gateway = *gw_addr;
1993
1994                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1995                         struct rt6_info *grt = NULL;
1996
1997                         /* IPv6 strictly inhibits using non-link-local
1998                            addresses as a nexthop address;
1999                            otherwise the router will not be able to send redirects.
2000                            That is generally desirable, but in some (rare!)
2001                            circumstances (SIT, PtP, NBMA NOARP links) it is handy
2002                            to allow some exceptions. --ANK
2003                            We also allow IPv4-mapped nexthops to support
2004                            RFC 4798-style addressing.
2005                          */
2006                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2007                                           IPV6_ADDR_MAPPED)))
2008                                 goto out;
2009
2010                         if (cfg->fc_table) {
2011                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2012
2013                                 if (grt) {
2014                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2015                                             (dev && dev != grt->dst.dev)) {
2016                                                 ip6_rt_put(grt);
2017                                                 grt = NULL;
2018                                         }
2019                                 }
2020                         }
2021
2022                         if (!grt)
2023                                 grt = rt6_lookup(net, gw_addr, NULL,
2024                                                  cfg->fc_ifindex, 1);
2025
2026                         err = -EHOSTUNREACH;
2027                         if (!grt)
2028                                 goto out;
2029                         if (dev) {
2030                                 if (dev != grt->dst.dev) {
2031                                         ip6_rt_put(grt);
2032                                         goto out;
2033                                 }
2034                         } else {
2035                                 dev = grt->dst.dev;
2036                                 idev = grt->rt6i_idev;
2037                                 dev_hold(dev);
2038                                 in6_dev_hold(grt->rt6i_idev);
2039                         }
2040                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2041                                 err = 0;
2042                         ip6_rt_put(grt);
2043
2044                         if (err)
2045                                 goto out;
2046                 }
2047                 err = -EINVAL;
2048                 if (!dev || (dev->flags & IFF_LOOPBACK))
2049                         goto out;
2050         }
2051
2052         err = -ENODEV;
2053         if (!dev)
2054                 goto out;
2055
2056         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2057                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2058                         err = -EINVAL;
2059                         goto out;
2060                 }
2061                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2062                 rt->rt6i_prefsrc.plen = 128;
2063         } else
2064                 rt->rt6i_prefsrc.plen = 0;
2065
2066         rt->rt6i_flags = cfg->fc_flags;
2067
2068 install_route:
2069         rt->dst.dev = dev;
2070         rt->rt6i_idev = idev;
2071         rt->rt6i_table = table;
2072
2073         cfg->fc_nlinfo.nl_net = dev_net(dev);
2074
2075         return rt;
2076 out:
2077         if (dev)
2078                 dev_put(dev);
2079         if (idev)
2080                 in6_dev_put(idev);
2081         if (rt)
2082                 dst_free(&rt->dst);
2083
2084         return ERR_PTR(err);
2085 }
2086
2087 int ip6_route_add(struct fib6_config *cfg)
2088 {
2089         struct mx6_config mxc = { .mx = NULL, };
2090         struct rt6_info *rt;
2091         int err;
2092
2093         rt = ip6_route_info_create(cfg);
2094         if (IS_ERR(rt)) {
2095                 err = PTR_ERR(rt);
2096                 rt = NULL;
2097                 goto out;
2098         }
2099
2100         err = ip6_convert_metrics(&mxc, cfg);
2101         if (err)
2102                 goto out;
2103
2104         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
2105
2106         kfree(mxc.mx);
2107
2108         return err;
2109 out:
2110         if (rt)
2111                 dst_free(&rt->dst);
2112
2113         return err;
2114 }
2115
2116 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2117 {
2118         int err;
2119         struct fib6_table *table;
2120         struct net *net = dev_net(rt->dst.dev);
2121
2122         if (rt == net->ipv6.ip6_null_entry ||
2123             rt->dst.flags & DST_NOCACHE) {
2124                 err = -ENOENT;
2125                 goto out;
2126         }
2127
2128         table = rt->rt6i_table;
2129         write_lock_bh(&table->tb6_lock);
2130         err = fib6_del(rt, info);
2131         write_unlock_bh(&table->tb6_lock);
2132
2133 out:
2134         ip6_rt_put(rt);
2135         return err;
2136 }
2137
2138 int ip6_del_rt(struct rt6_info *rt)
2139 {
2140         struct nl_info info = {
2141                 .nl_net = dev_net(rt->dst.dev),
2142         };
2143         return __ip6_del_rt(rt, &info);
2144 }
2145
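/* Delete a multipath route: when fc_delete_all_nh is set, every sibling
 * nexthop is removed under a single table write lock before the route
 * itself is deleted.
 */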
2146 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2147 {
2148         struct nl_info *info = &cfg->fc_nlinfo;
2149         struct fib6_table *table;
2150         int err;
2151
2152         table = rt->rt6i_table;
2153         write_lock_bh(&table->tb6_lock);
2154
2155         if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2156                 struct rt6_info *sibling, *next_sibling;
2157
2158                 list_for_each_entry_safe(sibling, next_sibling,
2159                                          &rt->rt6i_siblings,
2160                                          rt6i_siblings) {
2161                         err = fib6_del(sibling, info);
2162                         if (err)
2163                                 goto out;
2164                 }
2165         }
2166
2167         err = fib6_del(rt, info);
2168 out:
2169         write_unlock_bh(&table->tb6_lock);
2170         ip6_rt_put(rt);
2171         return err;
2172 }
2173
2174 static int ip6_route_del(struct fib6_config *cfg)
2175 {
2176         struct fib6_table *table;
2177         struct fib6_node *fn;
2178         struct rt6_info *rt;
2179         int err = -ESRCH;
2180
2181         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2182         if (!table)
2183                 return err;
2184
2185         read_lock_bh(&table->tb6_lock);
2186
2187         fn = fib6_locate(&table->tb6_root,
2188                          &cfg->fc_dst, cfg->fc_dst_len,
2189                          &cfg->fc_src, cfg->fc_src_len);
2190
2191         if (fn) {
2192                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2193                         if ((rt->rt6i_flags & RTF_CACHE) &&
2194                             !(cfg->fc_flags & RTF_CACHE))
2195                                 continue;
2196                         if (cfg->fc_ifindex &&
2197                             (!rt->dst.dev ||
2198                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2199                                 continue;
2200                         if (cfg->fc_flags & RTF_GATEWAY &&
2201                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2202                                 continue;
2203                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2204                                 continue;
2205                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2206                                 continue;
2207                         dst_hold(&rt->dst);
2208                         read_unlock_bh(&table->tb6_lock);
2209
2210                         /* if a gateway was specified, only delete that one nexthop */
2211                         if (cfg->fc_flags & RTF_GATEWAY)
2212                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2213
2214                         return __ip6_del_rt_siblings(rt, cfg);
2215                 }
2216         }
2217         read_unlock_bh(&table->tb6_lock);
2218
2219         return err;
2220 }
2221
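/* Process a received ND Redirect: validate it per RFC 4861 (length checks,
 * non-multicast destination, link-local unicast target unless on-link),
 * update the neighbour cache for the new first hop, install an RTF_CACHE
 * clone of the route pointing at it, and drop any old cached entry.
 */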
2222 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2223 {
2224         struct netevent_redirect netevent;
2225         struct rt6_info *rt, *nrt = NULL;
2226         struct ndisc_options ndopts;
2227         struct inet6_dev *in6_dev;
2228         struct neighbour *neigh;
2229         struct rd_msg *msg;
2230         int optlen, on_link;
2231         u8 *lladdr;
2232
2233         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2234         optlen -= sizeof(*msg);
2235
2236         if (optlen < 0) {
2237                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2238                 return;
2239         }
2240
2241         msg = (struct rd_msg *)icmp6_hdr(skb);
2242
2243         if (ipv6_addr_is_multicast(&msg->dest)) {
2244                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2245                 return;
2246         }
2247
2248         on_link = 0;
2249         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2250                 on_link = 1;
2251         } else if (ipv6_addr_type(&msg->target) !=
2252                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2253                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2254                 return;
2255         }
2256
2257         in6_dev = __in6_dev_get(skb->dev);
2258         if (!in6_dev)
2259                 return;
2260         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2261                 return;
2262
2263         /* RFC2461 8.1:
2264          *      The IP source address of the Redirect MUST be the same as the current
2265          *      first-hop router for the specified ICMP Destination Address.
2266          */
2267
2268         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2269                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2270                 return;
2271         }
2272
2273         lladdr = NULL;
2274         if (ndopts.nd_opts_tgt_lladdr) {
2275                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2276                                              skb->dev);
2277                 if (!lladdr) {
2278                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2279                         return;
2280                 }
2281         }
2282
2283         rt = (struct rt6_info *) dst;
2284         if (rt->rt6i_flags & RTF_REJECT) {
2285                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2286                 return;
2287         }
2288
2289         /* Redirect received -> path was valid.
2290          * Look, redirects are sent only in response to data packets,
2291          * so this nexthop is apparently reachable. --ANK
2292          */
2293         dst_confirm(&rt->dst);
2294
2295         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2296         if (!neigh)
2297                 return;
2298
2299         /*
2300          *      We have finally decided to accept it.
2301          */
2302
2303         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2304                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2305                      NEIGH_UPDATE_F_OVERRIDE|
2306                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2307                                      NEIGH_UPDATE_F_ISROUTER)),
2308                      NDISC_REDIRECT, &ndopts);
2309
2310         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2311         if (!nrt)
2312                 goto out;
2313
2314         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2315         if (on_link)
2316                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2317
2318         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2319
2320         if (ip6_ins_rt(nrt))
2321                 goto out;
2322
2323         netevent.old = &rt->dst;
2324         netevent.new = &nrt->dst;
2325         netevent.daddr = &msg->dest;
2326         netevent.neigh = neigh;
2327         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2328
2329         if (rt->rt6i_flags & RTF_CACHE) {
2330                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2331                 ip6_del_rt(rt);
2332         }
2333
2334 out:
2335         neigh_release(neigh);
2336 }
2337
2338 /*
2339  *      Misc support functions
2340  */
2341
2342 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2343 {
2344         BUG_ON(from->dst.from);
2345
2346         rt->rt6i_flags &= ~RTF_EXPIRES;
2347         dst_hold(&from->dst);
2348         rt->dst.from = &from->dst;
2349         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2350 }
2351
2352 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2353 {
2354         rt->dst.input = ort->dst.input;
2355         rt->dst.output = ort->dst.output;
2356         rt->rt6i_dst = ort->rt6i_dst;
2357         rt->dst.error = ort->dst.error;
2358         rt->rt6i_idev = ort->rt6i_idev;
2359         if (rt->rt6i_idev)
2360                 in6_dev_hold(rt->rt6i_idev);
2361         rt->dst.lastuse = jiffies;
2362         rt->rt6i_gateway = ort->rt6i_gateway;
2363         rt->rt6i_flags = ort->rt6i_flags;
2364         rt6_set_from(rt, ort);
2365         rt->rt6i_metric = ort->rt6i_metric;
2366 #ifdef CONFIG_IPV6_SUBTREES
2367         rt->rt6i_src = ort->rt6i_src;
2368 #endif
2369         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2370         rt->rt6i_table = ort->rt6i_table;
2371         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2372 }
2373
2374 #ifdef CONFIG_IPV6_ROUTE_INFO
2375 static struct rt6_info *rt6_get_route_info(struct net *net,
2376                                            const struct in6_addr *prefix, int prefixlen,
2377                                            const struct in6_addr *gwaddr,
2378                                            struct net_device *dev)
2379 {
2380         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2381         int ifindex = dev->ifindex;
2382         struct fib6_node *fn;
2383         struct rt6_info *rt = NULL;
2384         struct fib6_table *table;
2385
2386         table = fib6_get_table(net, tb_id);
2387         if (!table)
2388                 return NULL;
2389
2390         read_lock_bh(&table->tb6_lock);
2391         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2392         if (!fn)
2393                 goto out;
2394
2395         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2396                 if (rt->dst.dev->ifindex != ifindex)
2397                         continue;
2398                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2399                         continue;
2400                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2401                         continue;
2402                 dst_hold(&rt->dst);
2403                 break;
2404         }
2405 out:
2406         read_unlock_bh(&table->tb6_lock);
2407         return rt;
2408 }
2409
2410 static struct rt6_info *rt6_add_route_info(struct net *net,
2411                                            const struct in6_addr *prefix, int prefixlen,
2412                                            const struct in6_addr *gwaddr,
2413                                            struct net_device *dev,
2414                                            unsigned int pref)
2415 {
2416         struct fib6_config cfg = {
2417                 .fc_metric      = IP6_RT_PRIO_USER,
2418                 .fc_ifindex     = dev->ifindex,
2419                 .fc_dst_len     = prefixlen,
2420                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2421                                   RTF_UP | RTF_PREF(pref),
2422                 .fc_nlinfo.portid = 0,
2423                 .fc_nlinfo.nlh = NULL,
2424                 .fc_nlinfo.nl_net = net,
2425         };
2426
2427         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2428         cfg.fc_dst = *prefix;
2429         cfg.fc_gateway = *gwaddr;
2430
2431         /* We should treat it as a default route if the prefix length is 0. */
2432         if (!prefixlen)
2433                 cfg.fc_flags |= RTF_DEFAULT;
2434
2435         ip6_route_add(&cfg);
2436
2437         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2438 }
2439 #endif
2440
2441 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2442 {
2443         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
2444         struct rt6_info *rt;
2445         struct fib6_table *table;
2446
2447         table = fib6_get_table(dev_net(dev), tb_id);
2448         if (!table)
2449                 return NULL;
2450
2451         read_lock_bh(&table->tb6_lock);
2452         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2453                 if (dev == rt->dst.dev &&
2454                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2455                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2456                         break;
2457         }
2458         if (rt)
2459                 dst_hold(&rt->dst);
2460         read_unlock_bh(&table->tb6_lock);
2461         return rt;
2462 }
2463
2464 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2465                                      struct net_device *dev,
2466                                      unsigned int pref)
2467 {
2468         struct fib6_config cfg = {
2469                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2470                 .fc_metric      = IP6_RT_PRIO_USER,
2471                 .fc_ifindex     = dev->ifindex,
2472                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2473                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2474                 .fc_nlinfo.portid = 0,
2475                 .fc_nlinfo.nlh = NULL,
2476                 .fc_nlinfo.nl_net = dev_net(dev),
2477         };
2478
2479         cfg.fc_gateway = *gwaddr;
2480
2481         if (!ip6_route_add(&cfg)) {
2482                 struct fib6_table *table;
2483
2484                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
2485                 if (table)
2486                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2487         }
2488
2489         return rt6_get_dflt_router(gwaddr, dev);
2490 }
2491
2492 static void __rt6_purge_dflt_routers(struct fib6_table *table)
2493 {
2494         struct rt6_info *rt;
2495
2496 restart:
2497         read_lock_bh(&table->tb6_lock);
2498         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2499                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2500                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2501                         dst_hold(&rt->dst);
2502                         read_unlock_bh(&table->tb6_lock);
2503                         ip6_del_rt(rt);
2504                         goto restart;
2505                 }
2506         }
2507         read_unlock_bh(&table->tb6_lock);
2508
2509         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
2510 }
2511
2512 void rt6_purge_dflt_routers(struct net *net)
2513 {
2514         struct fib6_table *table;
2515         struct hlist_head *head;
2516         unsigned int h;
2517
2518         rcu_read_lock();
2519
2520         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2521                 head = &net->ipv6.fib_table_hash[h];
2522                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2523                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2524                                 __rt6_purge_dflt_routers(table);
2525                 }
2526         }
2527
2528         rcu_read_unlock();
2529 }
2530
2531 static void rtmsg_to_fib6_config(struct net *net,
2532                                  struct in6_rtmsg *rtmsg,
2533                                  struct fib6_config *cfg)
2534 {
2535         memset(cfg, 0, sizeof(*cfg));
2536
2537         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2538                          : RT6_TABLE_MAIN;
2539         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2540         cfg->fc_metric = rtmsg->rtmsg_metric;
2541         cfg->fc_expires = rtmsg->rtmsg_info;
2542         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2543         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2544         cfg->fc_flags = rtmsg->rtmsg_flags;
2545
2546         cfg->fc_nlinfo.nl_net = net;
2547
2548         cfg->fc_dst = rtmsg->rtmsg_dst;
2549         cfg->fc_src = rtmsg->rtmsg_src;
2550         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2551 }
2552
2553 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2554 {
2555         struct fib6_config cfg;
2556         struct in6_rtmsg rtmsg;
2557         int err;
2558
2559         switch (cmd) {
2560         case SIOCADDRT:         /* Add a route */
2561         case SIOCDELRT:         /* Delete a route */
2562                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2563                         return -EPERM;
2564                 err = copy_from_user(&rtmsg, arg,
2565                                      sizeof(struct in6_rtmsg));
2566                 if (err)
2567                         return -EFAULT;
2568
2569                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2570
2571                 rtnl_lock();
2572                 switch (cmd) {
2573                 case SIOCADDRT:
2574                         err = ip6_route_add(&cfg);
2575                         break;
2576                 case SIOCDELRT:
2577                         err = ip6_route_del(&cfg);
2578                         break;
2579                 default:
2580                         err = -EINVAL;
2581                 }
2582                 rtnl_unlock();
2583
2584                 return err;
2585         }
2586
2587         return -EINVAL;
2588 }
2589
2590 /*
2591  *      Drop the packet on the floor
2592  */
2593
2594 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2595 {
2596         int type;
2597         struct dst_entry *dst = skb_dst(skb);
2598         switch (ipstats_mib_noroutes) {
2599         case IPSTATS_MIB_INNOROUTES:
2600                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2601                 if (type == IPV6_ADDR_ANY) {
2602                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2603                                       IPSTATS_MIB_INADDRERRORS);
2604                         break;
2605                 }
2606                 /* FALLTHROUGH */
2607         case IPSTATS_MIB_OUTNOROUTES:
2608                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2609                               ipstats_mib_noroutes);
2610                 break;
2611         }
2612         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2613         kfree_skb(skb);
2614         return 0;
2615 }
2616
2617 static int ip6_pkt_discard(struct sk_buff *skb)
2618 {
2619         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2620 }
2621
2622 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2623 {
2624         skb->dev = skb_dst(skb)->dev;
2625         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2626 }
2627
2628 static int ip6_pkt_prohibit(struct sk_buff *skb)
2629 {
2630         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2631 }
2632
2633 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2634 {
2635         skb->dev = skb_dst(skb)->dev;
2636         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2637 }
2638
2639 /*
2640  *      Allocate a dst for local (unicast / anycast) address.
2641  */
2642
2643 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2644                                     const struct in6_addr *addr,
2645                                     bool anycast)
2646 {
2647         u32 tb_id;
2648         struct net *net = dev_net(idev->dev);
2649         struct net_device *dev = net->loopback_dev;
2650         struct rt6_info *rt;
2651
2652         /* use the L3 master device as loopback for host routes if the device
2653          * is enslaved and the address is not link-local or multicast
2654          */
2655         if (!rt6_need_strict(addr))
2656                 dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
2657
2658         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2659         if (!rt)
2660                 return ERR_PTR(-ENOMEM);
2661
2662         in6_dev_hold(idev);
2663
2664         rt->dst.flags |= DST_HOST;
2665         rt->dst.input = ip6_input;
2666         rt->dst.output = ip6_output;
2667         rt->rt6i_idev = idev;
2668
2669         rt->rt6i_protocol = RTPROT_KERNEL;
2670         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2671         if (anycast)
2672                 rt->rt6i_flags |= RTF_ANYCAST;
2673         else
2674                 rt->rt6i_flags |= RTF_LOCAL;
2675
2676         rt->rt6i_gateway  = *addr;
2677         rt->rt6i_dst.addr = *addr;
2678         rt->rt6i_dst.plen = 128;
2679         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2680         rt->rt6i_table = fib6_get_table(net, tb_id);
2681         rt->dst.flags |= DST_NOCACHE;
2682
2683         atomic_set(&rt->dst.__refcnt, 1);
2684
2685         return rt;
2686 }
2687
2688 /* remove a deleted IP from prefsrc entries */
2689 struct arg_dev_net_ip {
2690         struct net_device *dev;
2691         struct net *net;
2692         struct in6_addr *addr;
2693 };
2694
2695 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2696 {
2697         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2698         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2699         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2700
2701         if (((void *)rt->dst.dev == dev || !dev) &&
2702             rt != net->ipv6.ip6_null_entry &&
2703             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2704                 /* remove prefsrc entry */
2705                 rt->rt6i_prefsrc.plen = 0;
2706         }
2707         return 0;
2708 }
2709
2710 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2711 {
2712         struct net *net = dev_net(ifp->idev->dev);
2713         struct arg_dev_net_ip adni = {
2714                 .dev = ifp->idev->dev,
2715                 .net = net,
2716                 .addr = &ifp->addr,
2717         };
2718         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2719 }
2720
2721 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2722 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2723
2724 /* Remove routers and update dst entries when a gateway turns into a host. */
2725 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2726 {
2727         struct in6_addr *gateway = (struct in6_addr *)arg;
2728
2729         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2730              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2731              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2732                 return -1;
2733         }
2734         return 0;
2735 }
2736
2737 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2738 {
2739         fib6_clean_all(net, fib6_clean_tohost, gateway);
2740 }
2741
2742 struct arg_dev_net {
2743         struct net_device *dev;
2744         struct net *net;
2745 };
2746
2747 /* called with the write lock held for the table containing rt */
2748 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2749 {
2750         const struct arg_dev_net *adn = arg;
2751         const struct net_device *dev = adn->dev;
2752
2753         if ((rt->dst.dev == dev || !dev) &&
2754             rt != adn->net->ipv6.ip6_null_entry &&
2755             (rt->rt6i_nsiblings == 0 ||
2756              !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
2757                 return -1;
2758
2759         return 0;
2760 }
2761
2762 void rt6_ifdown(struct net *net, struct net_device *dev)
2763 {
2764         struct arg_dev_net adn = {
2765                 .dev = dev,
2766                 .net = net,
2767         };
2768
2769         fib6_clean_all(net, fib6_ifdown, &adn);
2770         icmp6_clean_all(fib6_ifdown, &adn);
2771         if (dev)
2772                 rt6_uncached_list_flush_dev(net, dev);
2773 }
2774
2775 struct rt6_mtu_change_arg {
2776         struct net_device *dev;
2777         unsigned int mtu;
2778 };
2779
2780 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2781 {
2782         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2783         struct inet6_dev *idev;
2784
2785         /* In IPv6, PMTU discovery is not optional,
2786            so the RTAX_MTU lock cannot disable it.
2787            We still use this lock to block changes
2788            caused by addrconf/ndisc.
2789         */
2790
2791         idev = __in6_dev_get(arg->dev);
2792         if (!idev)
2793                 return 0;
2794
2795         /* For an administrative MTU increase there is no way to discover
2796            the corresponding IPv6 PMTU increase, so the PMTU must be updated
2797            here.  RFC 1981 doesn't cover administrative MTU increases, so
2798            updating for a PMTU increase is a MUST (e.g. jumbo frames).
2799          */
2800         /*
2801            If the new MTU is less than the route PMTU, the new MTU will be the
2802            lowest MTU in the path; update the route PMTU to reflect the PMTU
2803            decrease.  If the new MTU is greater than the route PMTU, and the
2804            old MTU was the lowest MTU in the path, update the route PMTU to
2805            reflect the increase.  In this case, if the other nodes' MTU is also
2806            the lowest MTU in the path, Packet Too Big messages will trigger
2807            PMTU discovery again.
2808          */
2809         if (rt->dst.dev == arg->dev &&
2810             dst_metric_raw(&rt->dst, RTAX_MTU) &&
2811             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2812                 if (rt->rt6i_flags & RTF_CACHE) {
2813                         /* For RTF_CACHE with rt6i_pmtu == 0
2814                          * (i.e. a redirected route),
2815                          * the metrics of its rt->dst.from has already
2816                          * been updated.
2817                          */
2818                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2819                                 rt->rt6i_pmtu = arg->mtu;
2820                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2821                            (dst_mtu(&rt->dst) < arg->mtu &&
2822                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2823                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2824                 }
2825         }
2826         return 0;
2827 }
2828
2829 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2830 {
2831         struct rt6_mtu_change_arg arg = {
2832                 .dev = dev,
2833                 .mtu = mtu,
2834         };
2835
2836         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2837 }
2838
2839 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2840         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2841         [RTA_OIF]               = { .type = NLA_U32 },
2842         [RTA_IIF]               = { .type = NLA_U32 },
2843         [RTA_PRIORITY]          = { .type = NLA_U32 },
2844         [RTA_METRICS]           = { .type = NLA_NESTED },
2845         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2846         [RTA_PREF]              = { .type = NLA_U8 },
2847         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
2848         [RTA_ENCAP]             = { .type = NLA_NESTED },
2849         [RTA_EXPIRES]           = { .type = NLA_U32 },
2850         [RTA_UID]               = { .type = NLA_U32 },
2851 };
2852
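/* Translate an rtnetlink route request into a fib6_config, validating the
 * attributes against rtm_ipv6_policy and checking any lwtunnel encap types
 * along the way.
 */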
2853 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2854                               struct fib6_config *cfg)
2855 {
2856         struct rtmsg *rtm;
2857         struct nlattr *tb[RTA_MAX+1];
2858         unsigned int pref;
2859         int err;
2860
2861         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2862         if (err < 0)
2863                 goto errout;
2864
2865         err = -EINVAL;
2866         rtm = nlmsg_data(nlh);
2867         memset(cfg, 0, sizeof(*cfg));
2868
2869         cfg->fc_table = rtm->rtm_table;
2870         cfg->fc_dst_len = rtm->rtm_dst_len;
2871         cfg->fc_src_len = rtm->rtm_src_len;
2872         cfg->fc_flags = RTF_UP;
2873         cfg->fc_protocol = rtm->rtm_protocol;
2874         cfg->fc_type = rtm->rtm_type;
2875
2876         if (rtm->rtm_type == RTN_UNREACHABLE ||
2877             rtm->rtm_type == RTN_BLACKHOLE ||
2878             rtm->rtm_type == RTN_PROHIBIT ||
2879             rtm->rtm_type == RTN_THROW)
2880                 cfg->fc_flags |= RTF_REJECT;
2881
2882         if (rtm->rtm_type == RTN_LOCAL)
2883                 cfg->fc_flags |= RTF_LOCAL;
2884
2885         if (rtm->rtm_flags & RTM_F_CLONED)
2886                 cfg->fc_flags |= RTF_CACHE;
2887
2888         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2889         cfg->fc_nlinfo.nlh = nlh;
2890         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2891
2892         if (tb[RTA_GATEWAY]) {
2893                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2894                 cfg->fc_flags |= RTF_GATEWAY;
2895         }
2896
2897         if (tb[RTA_DST]) {
2898                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2899
2900                 if (nla_len(tb[RTA_DST]) < plen)
2901                         goto errout;
2902
2903                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2904         }
2905
2906         if (tb[RTA_SRC]) {
2907                 int plen = (rtm->rtm_src_len + 7) >> 3;
2908
2909                 if (nla_len(tb[RTA_SRC]) < plen)
2910                         goto errout;
2911
2912                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2913         }
2914
2915         if (tb[RTA_PREFSRC])
2916                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2917
2918         if (tb[RTA_OIF])
2919                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2920
2921         if (tb[RTA_PRIORITY])
2922                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2923
2924         if (tb[RTA_METRICS]) {
2925                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2926                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2927         }
2928
2929         if (tb[RTA_TABLE])
2930                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2931
2932         if (tb[RTA_MULTIPATH]) {
2933                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2934                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2935
2936                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
2937                                                      cfg->fc_mp_len);
2938                 if (err < 0)
2939                         goto errout;
2940         }
2941
2942         if (tb[RTA_PREF]) {
2943                 pref = nla_get_u8(tb[RTA_PREF]);
2944                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2945                     pref != ICMPV6_ROUTER_PREF_HIGH)
2946                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2947                 cfg->fc_flags |= RTF_PREF(pref);
2948         }
2949
2950         if (tb[RTA_ENCAP])
2951                 cfg->fc_encap = tb[RTA_ENCAP];
2952
2953         if (tb[RTA_ENCAP_TYPE]) {
2954                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
2955
2956                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type);
2957                 if (err < 0)
2958                         goto errout;
2959         }
2960
2961         if (tb[RTA_EXPIRES]) {
2962                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
2963
2964                 if (addrconf_finite_timeout(timeout)) {
2965                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
2966                         cfg->fc_flags |= RTF_EXPIRES;
2967                 }
2968         }
2969
2970         err = 0;
2971 errout:
2972         return err;
2973 }
2974
2975 struct rt6_nh {
2976         struct rt6_info *rt6_info;
2977         struct fib6_config r_cfg;
2978         struct mx6_config mxc;
2979         struct list_head next;
2980 };
2981
2982 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
2983 {
2984         struct rt6_nh *nh;
2985
2986         list_for_each_entry(nh, rt6_nh_list, next) {
2987                 pr_warn("multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
2988                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
2989                         nh->r_cfg.fc_ifindex);
2990         }
2991 }
2992
2993 static int ip6_route_info_append(struct list_head *rt6_nh_list,
2994                                  struct rt6_info *rt, struct fib6_config *r_cfg)
2995 {
2996         struct rt6_nh *nh;
2997         struct rt6_info *rtnh;
2998         int err = -EEXIST;
2999
3000         list_for_each_entry(nh, rt6_nh_list, next) {
3001                 /* check if rt6_info already exists */
3002                 rtnh = nh->rt6_info;
3003
3004                 if (rtnh->dst.dev == rt->dst.dev &&
3005                     rtnh->rt6i_idev == rt->rt6i_idev &&
3006                     ipv6_addr_equal(&rtnh->rt6i_gateway,
3007                                     &rt->rt6i_gateway))
3008                         return err;
3009         }
3010
3011         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3012         if (!nh)
3013                 return -ENOMEM;
3014         nh->rt6_info = rt;
3015         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3016         if (err) {
3017                 kfree(nh);
3018                 return err;
3019         }
3020         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3021         list_add_tail(&nh->next, rt6_nh_list);
3022
3023         return 0;
3024 }
3025
3026 static void ip6_route_mpath_notify(struct rt6_info *rt,
3027                                    struct rt6_info *rt_last,
3028                                    struct nl_info *info,
3029                                    __u16 nlflags)
3030 {
3031         /* if this is an APPEND route, then rt points to the first route
3032          * inserted and rt_last points to last route inserted. Userspace
3033          * inserted and rt_last points to the last route inserted. Userspace
3034          * nexthop. Since sibling routes are always added at the end of
3035          * the list, find the first sibling of the last route appended
3036          */
3037         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3038                 rt = list_first_entry(&rt_last->rt6i_siblings,
3039                                       struct rt6_info,
3040                                       rt6i_siblings);
3041         }
3042
3043         if (rt)
3044                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3045 }
3046
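/* Handle an RTM_NEWROUTE request carrying an RTA_MULTIPATH attribute, e.g.
 * (illustrative iproute2 usage; addresses and interface names are
 * placeholders):
 *
 *   ip -6 route add 2001:db8::/64 \
 *           nexthop via fe80::1 dev eth0 \
 *           nexthop via fe80::2 dev eth1
 *
 * Each nexthop is created and inserted as an individual route.  If one of
 * the insertions fails, the nexthops that were already added are deleted
 * again; a partially completed replace is reported via
 * ip6_print_replace_route_err().
 */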
3047 static int ip6_route_multipath_add(struct fib6_config *cfg)
3048 {
3049         struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3050         struct nl_info *info = &cfg->fc_nlinfo;
3051         struct fib6_config r_cfg;
3052         struct rtnexthop *rtnh;
3053         struct rt6_info *rt;
3054         struct rt6_nh *err_nh;
3055         struct rt6_nh *nh, *nh_safe;
3056         __u16 nlflags;
3057         int remaining;
3058         int attrlen;
3059         int err = 1;
3060         int nhn = 0;
3061         int replace = (cfg->fc_nlinfo.nlh &&
3062                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3063         LIST_HEAD(rt6_nh_list);
3064
3065         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3066         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3067                 nlflags |= NLM_F_APPEND;
3068
3069         remaining = cfg->fc_mp_len;
3070         rtnh = (struct rtnexthop *)cfg->fc_mp;
3071
3072         /* Parse the RTA_MULTIPATH attribute and build a list (rt6_nh_list)
3073          * with one rt6_info struct per nexthop.
3074          */
3075         while (rtnh_ok(rtnh, remaining)) {
3076                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3077                 if (rtnh->rtnh_ifindex)
3078                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3079
3080                 attrlen = rtnh_attrlen(rtnh);
3081                 if (attrlen > 0) {
3082                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3083
3084                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3085                         if (nla) {
3086                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3087                                 r_cfg.fc_flags |= RTF_GATEWAY;
3088                         }
3089                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3090                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3091                         if (nla)
3092                                 r_cfg.fc_encap_type = nla_get_u16(nla);
3093                 }
3094
3095                 rt = ip6_route_info_create(&r_cfg);
3096                 if (IS_ERR(rt)) {
3097                         err = PTR_ERR(rt);
3098                         rt = NULL;
3099                         goto cleanup;
3100                 }
3101
3102                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3103                 if (err) {
3104                         dst_free(&rt->dst);
3105                         goto cleanup;
3106                 }
3107
3108                 rtnh = rtnh_next(rtnh, &remaining);
3109         }
3110
3111         /* For add and replace, send one notification with all nexthops:
3112          * skip the notification in fib6_add_rt2node and send one with
3113          * the full route when done.
3114          */
3115         info->skip_notify = 1;
3116
3117         err_nh = NULL;
3118         list_for_each_entry(nh, &rt6_nh_list, next) {
3119                 rt_last = nh->rt6_info;
3120                 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc);
3121                 /* save reference to first route for notification */
3122                 if (!rt_notif && !err)
3123                         rt_notif = nh->rt6_info;
3124
3125                 /* nh->rt6_info is used or freed at this point, reset to NULL */
3126                 nh->rt6_info = NULL;
3127                 if (err) {
3128                         if (replace && nhn)
3129                                 ip6_print_replace_route_err(&rt6_nh_list);
3130                         err_nh = nh;
3131                         goto add_errout;
3132                 }
3133
3134                 /* Because each route is added like a single route, we remove
3135                  * these flags after the first nexthop: if there is a collision,
3136                  * we have already failed to add the first nexthop, since
3137                  * fib6_add_rt2node() rejected it; when replacing, the old
3138                  * nexthops have been replaced by the first new one, and the
3139                  * remaining nexthops should be appended to it.
3140                  */
3141                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3142                                                      NLM_F_REPLACE);
3143                 nhn++;
3144         }
3145
3146         /* success ... tell user about new route */
3147         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3148         goto cleanup;
3149
3150 add_errout:
3151         /* send notification for routes that were added so that
3152          * the delete notifications sent by ip6_route_del are
3153          * coherent
3154          */
3155         if (rt_notif)
3156                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3157
3158         /* Delete routes that were already added */
3159         list_for_each_entry(nh, &rt6_nh_list, next) {
3160                 if (err_nh == nh)
3161                         break;
3162                 ip6_route_del(&nh->r_cfg);
3163         }
3164
3165 cleanup:
3166         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3167                 if (nh->rt6_info)
3168                         dst_free(&nh->rt6_info->dst);
3169                 kfree(nh->mxc.mx);
3170                 list_del(&nh->next);
3171                 kfree(nh);
3172         }
3173
3174         return err;
3175 }
3176
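/* Delete every nexthop listed in RTA_MULTIPATH as an individual route.  All
 * nexthops are attempted; the error of the last failing deletion, if any,
 * is returned.
 */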
3177 static int ip6_route_multipath_del(struct fib6_config *cfg)
3178 {
3179         struct fib6_config r_cfg;
3180         struct rtnexthop *rtnh;
3181         int remaining;
3182         int attrlen;
3183         int err = 1, last_err = 0;
3184
3185         remaining = cfg->fc_mp_len;
3186         rtnh = (struct rtnexthop *)cfg->fc_mp;
3187
3188         /* Parse a Multipath Entry */
3189         while (rtnh_ok(rtnh, remaining)) {
3190                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3191                 if (rtnh->rtnh_ifindex)
3192                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3193
3194                 attrlen = rtnh_attrlen(rtnh);
3195                 if (attrlen > 0) {
3196                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3197
3198                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3199                         if (nla) {
3200                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3201                                 r_cfg.fc_flags |= RTF_GATEWAY;
3202                         }
3203                 }
3204                 err = ip6_route_del(&r_cfg);
3205                 if (err)
3206                         last_err = err;
3207
3208                 rtnh = rtnh_next(rtnh, &remaining);
3209         }
3210
3211         return last_err;
3212 }
3213
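/* RTM_DELROUTE handler: multipath requests are handed to
 * ip6_route_multipath_del(), otherwise all nexthops of the matching route
 * are deleted via ip6_route_del().
 */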
3214 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3215 {
3216         struct fib6_config cfg;
3217         int err;
3218
3219         err = rtm_to_fib6_config(skb, nlh, &cfg);
3220         if (err < 0)
3221                 return err;
3222
3223         if (cfg.fc_mp) {
3224                 return ip6_route_multipath_del(&cfg);
3225         } else {
3226                 cfg.fc_delete_all_nh = 1;
3227                 return ip6_route_del(&cfg);
3228         }
3229 }
3230
3231 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3232 {
3233         struct fib6_config cfg;
3234         int err;
3235
3236         err = rtm_to_fib6_config(skb, nlh, &cfg);
3237         if (err < 0)
3238                 return err;
3239
3240         if (cfg.fc_mp)
3241                 return ip6_route_multipath_add(&cfg);
3242         else
3243                 return ip6_route_add(&cfg);
3244 }
3245
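/* Worst-case netlink message size for dumping @rt, including one nexthop
 * block inside RTA_MULTIPATH per sibling when the route is part of an ECMP
 * set.
 */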
3246 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3247 {
3248         int nexthop_len = 0;
3249
3250         if (rt->rt6i_nsiblings) {
3251                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
3252                             + NLA_ALIGN(sizeof(struct rtnexthop))
3253                             + nla_total_size(16) /* RTA_GATEWAY */
3254                             + nla_total_size(4)  /* RTA_OIF */
3255                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
3256
3257                 nexthop_len *= rt->rt6i_nsiblings;
3258         }
3259
3260         return NLMSG_ALIGN(sizeof(struct rtmsg))
3261                + nla_total_size(16) /* RTA_SRC */
3262                + nla_total_size(16) /* RTA_DST */
3263                + nla_total_size(16) /* RTA_GATEWAY */
3264                + nla_total_size(16) /* RTA_PREFSRC */
3265                + nla_total_size(4) /* RTA_TABLE */
3266                + nla_total_size(4) /* RTA_IIF */
3267                + nla_total_size(4) /* RTA_OIF */
3268                + nla_total_size(4) /* RTA_PRIORITY */
3269                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3270                + nla_total_size(sizeof(struct rta_cacheinfo))
3271                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3272                + nla_total_size(1) /* RTA_PREF */
3273                + lwtunnel_get_encap_size(rt->dst.lwtstate)
3274                + nexthop_len;
3275 }
3276
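/* Fill the per-nexthop attributes (RTA_GATEWAY, RTA_OIF and any lwtunnel
 * encap).  If the outgoing device is not running or has no carrier, report
 * RTNH_F_LINKDOWN in @flags, plus RTNH_F_DEAD when the interface is
 * configured to ignore routes with a down link.
 */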
3277 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3278                             unsigned int *flags)
3279 {
3280         if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3281                 *flags |= RTNH_F_LINKDOWN;
3282                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3283                         *flags |= RTNH_F_DEAD;
3284         }
3285
3286         if (rt->rt6i_flags & RTF_GATEWAY) {
3287                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3288                         goto nla_put_failure;
3289         }
3290
3291         if (rt->dst.dev &&
3292             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3293                 goto nla_put_failure;
3294
3295         if (rt->dst.lwtstate &&
3296             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3297                 goto nla_put_failure;
3298
3299         return 0;
3300
3301 nla_put_failure:
3302         return -EMSGSIZE;
3303 }
3304
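/* Append one struct rtnexthop, followed by its nested attributes, to an
 * RTA_MULTIPATH block and fix up its length afterwards.
 */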
3305 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3306 {
3307         struct rtnexthop *rtnh;
3308         unsigned int flags = 0;
3309
3310         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3311         if (!rtnh)
3312                 goto nla_put_failure;
3313
3314         rtnh->rtnh_hops = 0;
3315         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3316
3317         if (rt6_nexthop_info(skb, rt, &flags) < 0)
3318                 goto nla_put_failure;
3319
3320         rtnh->rtnh_flags = flags;
3321
3322         /* length of rtnetlink header + attributes */
3323         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
3324
3325         return 0;
3326
3327 nla_put_failure:
3328         return -EMSGSIZE;
3329 }
3330
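/* Build one complete RTM_*ROUTE message for @rt: rtmsg header, addresses,
 * metrics, nexthop information (as RTA_MULTIPATH for ECMP routes),
 * cacheinfo and preference.  Returns -EMSGSIZE if the skb runs out of
 * room, in which case the partially built message is cancelled.
 */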
3331 static int rt6_fill_node(struct net *net,
3332                          struct sk_buff *skb, struct rt6_info *rt,
3333                          struct in6_addr *dst, struct in6_addr *src,
3334                          int iif, int type, u32 portid, u32 seq,
3335                          unsigned int flags)
3336 {
3337         u32 metrics[RTAX_MAX];
3338         struct rtmsg *rtm;
3339         struct nlmsghdr *nlh;
3340         long expires;
3341         u32 table;
3342
3343         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3344         if (!nlh)
3345                 return -EMSGSIZE;
3346
3347         rtm = nlmsg_data(nlh);
3348         rtm->rtm_family = AF_INET6;
3349         rtm->rtm_dst_len = rt->rt6i_dst.plen;
3350         rtm->rtm_src_len = rt->rt6i_src.plen;
3351         rtm->rtm_tos = 0;
3352         if (rt->rt6i_table)
3353                 table = rt->rt6i_table->tb6_id;
3354         else
3355                 table = RT6_TABLE_UNSPEC;
3356         rtm->rtm_table = table;
3357         if (nla_put_u32(skb, RTA_TABLE, table))
3358                 goto nla_put_failure;
3359         if (rt->rt6i_flags & RTF_REJECT) {
3360                 switch (rt->dst.error) {
3361                 case -EINVAL:
3362                         rtm->rtm_type = RTN_BLACKHOLE;
3363                         break;
3364                 case -EACCES:
3365                         rtm->rtm_type = RTN_PROHIBIT;
3366                         break;
3367                 case -EAGAIN:
3368                         rtm->rtm_type = RTN_THROW;
3369                         break;
3370                 default:
3371                         rtm->rtm_type = RTN_UNREACHABLE;
3372                         break;
3373                 }
3374         } else if (rt->rt6i_flags & RTF_LOCAL) {
3375                 rtm->rtm_type = RTN_LOCAL;
3376         } else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) {
3377                 rtm->rtm_type = RTN_LOCAL;
3378         } else {
3379                 rtm->rtm_type = RTN_UNICAST;
3380         }
3381         rtm->rtm_flags = 0;
3382         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3383         rtm->rtm_protocol = rt->rt6i_protocol;
3384         if (rt->rt6i_flags & RTF_DYNAMIC)
3385                 rtm->rtm_protocol = RTPROT_REDIRECT;
3386         else if (rt->rt6i_flags & RTF_ADDRCONF) {
3387                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
3388                         rtm->rtm_protocol = RTPROT_RA;
3389                 else
3390                         rtm->rtm_protocol = RTPROT_KERNEL;
3391         }
3392
3393         if (rt->rt6i_flags & RTF_CACHE)
3394                 rtm->rtm_flags |= RTM_F_CLONED;
3395
3396         if (dst) {
3397                 if (nla_put_in6_addr(skb, RTA_DST, dst))
3398                         goto nla_put_failure;
3399                 rtm->rtm_dst_len = 128;
3400         } else if (rtm->rtm_dst_len)
3401                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3402                         goto nla_put_failure;
3403 #ifdef CONFIG_IPV6_SUBTREES
3404         if (src) {
3405                 if (nla_put_in6_addr(skb, RTA_SRC, src))
3406                         goto nla_put_failure;
3407                 rtm->rtm_src_len = 128;
3408         } else if (rtm->rtm_src_len &&
3409                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3410                 goto nla_put_failure;
3411 #endif
3412         if (iif) {
3413 #ifdef CONFIG_IPV6_MROUTE
3414                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3415                         int err = ip6mr_get_route(net, skb, rtm, portid);
3416
3417                         if (err == 0)
3418                                 return 0;
3419                         if (err < 0)
3420                                 goto nla_put_failure;
3421                 } else
3422 #endif
3423                         if (nla_put_u32(skb, RTA_IIF, iif))
3424                                 goto nla_put_failure;
3425         } else if (dst) {
3426                 struct in6_addr saddr_buf;
3427                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3428                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3429                         goto nla_put_failure;
3430         }
3431
3432         if (rt->rt6i_prefsrc.plen) {
3433                 struct in6_addr saddr_buf;
3434                 saddr_buf = rt->rt6i_prefsrc.addr;
3435                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3436                         goto nla_put_failure;
3437         }
3438
3439         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3440         if (rt->rt6i_pmtu)
3441                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3442         if (rtnetlink_put_metrics(skb, metrics) < 0)
3443                 goto nla_put_failure;
3444
3445         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3446                 goto nla_put_failure;
3447
3448         /* For multipath routes, walk the siblings list and add
3449          * each as a nexthop within RTA_MULTIPATH.
3450          */
3451         if (rt->rt6i_nsiblings) {
3452                 struct rt6_info *sibling, *next_sibling;
3453                 struct nlattr *mp;
3454
3455                 mp = nla_nest_start(skb, RTA_MULTIPATH);
3456                 if (!mp)
3457                         goto nla_put_failure;
3458
3459                 if (rt6_add_nexthop(skb, rt) < 0)
3460                         goto nla_put_failure;
3461
3462                 list_for_each_entry_safe(sibling, next_sibling,
3463                                          &rt->rt6i_siblings, rt6i_siblings) {
3464                         if (rt6_add_nexthop(skb, sibling) < 0)
3465                                 goto nla_put_failure;
3466                 }
3467
3468                 nla_nest_end(skb, mp);
3469         } else {
3470                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags) < 0)
3471                         goto nla_put_failure;
3472         }
3473
3474         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3475
3476         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3477                 goto nla_put_failure;
3478
3479         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3480                 goto nla_put_failure;
3481
3482
3483         nlmsg_end(skb, nlh);
3484         return 0;
3485
3486 nla_put_failure:
3487         nlmsg_cancel(skb, nlh);
3488         return -EMSGSIZE;
3489 }
3490
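/* fib6 walker callback used for route dumps: the null entry is skipped, and
 * when the dump requested RTM_F_PREFIX only RTF_PREFIX_RT routes are
 * emitted; everything else is formatted via rt6_fill_node().
 */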
3491 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3492 {
3493         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3494         struct net *net = arg->net;
3495
3496         if (rt == net->ipv6.ip6_null_entry)
3497                 return 0;
3498
3499         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3500                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3501
3502                 /* user wants prefix routes only */
3503                 if (rtm->rtm_flags & RTM_F_PREFIX &&
3504                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
3505                         /* success since this is not a prefix route */
3506                         return 1;
3507                 }
3508         }
3509
3510         return rt6_fill_node(net,
3511                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3512                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3513                      NLM_F_MULTI);
3514 }
3515
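/* RTM_GETROUTE handler: build a flowi6 from the RTA_* attributes, resolve
 * it through the input path (when RTA_IIF is given) or the output path,
 * and unicast the formatted route back to the requester.
 */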
3516 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3517 {
3518         struct net *net = sock_net(in_skb->sk);
3519         struct nlattr *tb[RTA_MAX+1];
3520         struct rt6_info *rt;
3521         struct sk_buff *skb;
3522         struct rtmsg *rtm;
3523         struct flowi6 fl6;
3524         int err, iif = 0, oif = 0;
3525
3526         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3527         if (err < 0)
3528                 goto errout;
3529
3530         err = -EINVAL;
3531         memset(&fl6, 0, sizeof(fl6));
3532         rtm = nlmsg_data(nlh);
3533         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3534
3535         if (tb[RTA_SRC]) {
3536                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3537                         goto errout;
3538
3539                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3540         }
3541
3542         if (tb[RTA_DST]) {
3543                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3544                         goto errout;
3545
3546                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3547         }
3548
3549         if (tb[RTA_IIF])
3550                 iif = nla_get_u32(tb[RTA_IIF]);
3551
3552         if (tb[RTA_OIF])
3553                 oif = nla_get_u32(tb[RTA_OIF]);
3554
3555         if (tb[RTA_MARK])
3556                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3557
3558         if (tb[RTA_UID])
3559                 fl6.flowi6_uid = make_kuid(current_user_ns(),
3560                                            nla_get_u32(tb[RTA_UID]));
3561         else
3562                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
3563
3564         if (iif) {
3565                 struct net_device *dev;
3566                 int flags = 0;
3567
3568                 dev = __dev_get_by_index(net, iif);
3569                 if (!dev) {
3570                         err = -ENODEV;
3571                         goto errout;
3572                 }
3573
3574                 fl6.flowi6_iif = iif;
3575
3576                 if (!ipv6_addr_any(&fl6.saddr))
3577                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3578
3579                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3580                                                                flags);
3581         } else {
3582                 fl6.flowi6_oif = oif;
3583
3584                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3585         }
3586
3587         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3588         if (!skb) {
3589                 ip6_rt_put(rt);
3590                 err = -ENOBUFS;
3591                 goto errout;
3592         }
3593
3594         skb_dst_set(skb, &rt->dst);
3595
3596         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3597                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3598                             nlh->nlmsg_seq, 0);
3599         if (err < 0) {
3600                 kfree_skb(skb);
3601                 goto errout;
3602         }
3603
3604         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3605 errout:
3606         return err;
3607 }
3608
3609 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3610                      unsigned int nlm_flags)
3611 {
3612         struct sk_buff *skb;
3613         struct net *net = info->nl_net;
3614         u32 seq;
3615         int err;
3616
3617         err = -ENOBUFS;
3618         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3619
3620         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3621         if (!skb)
3622                 goto errout;
3623
3624         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3625                                 event, info->portid, seq, nlm_flags);
3626         if (err < 0) {
3627                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3628                 WARN_ON(err == -EMSGSIZE);
3629                 kfree_skb(skb);
3630                 goto errout;
3631         }
3632         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3633                     info->nlh, gfp_any());
3634         return;
3635 errout:
3636         if (err < 0)
3637                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3638 }
3639
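/* When the loopback device registers, point the per-netns null (and, with
 * CONFIG_IPV6_MULTIPLE_TABLES, prohibit/blackhole) template routes at it.
 */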
3640 static int ip6_route_dev_notify(struct notifier_block *this,
3641                                 unsigned long event, void *ptr)
3642 {
3643         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3644         struct net *net = dev_net(dev);
3645
3646         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3647                 net->ipv6.ip6_null_entry->dst.dev = dev;
3648                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3649 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3650                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3651                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3652                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3653                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3654 #endif
3655         }
3656
3657         return NOTIFY_OK;
3658 }
3659
3660 /*
3661  *      /proc
3662  */
3663
3664 #ifdef CONFIG_PROC_FS
3665
3666 static const struct file_operations ipv6_route_proc_fops = {
3667         .owner          = THIS_MODULE,
3668         .open           = ipv6_route_open,
3669         .read           = seq_read,
3670         .llseek         = seq_lseek,
3671         .release        = seq_release_net,
3672 };
3673
3674 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3675 {
3676         struct net *net = (struct net *)seq->private;
3677         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3678                    net->ipv6.rt6_stats->fib_nodes,
3679                    net->ipv6.rt6_stats->fib_route_nodes,
3680                    net->ipv6.rt6_stats->fib_rt_alloc,
3681                    net->ipv6.rt6_stats->fib_rt_entries,
3682                    net->ipv6.rt6_stats->fib_rt_cache,
3683                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3684                    net->ipv6.rt6_stats->fib_discarded_routes);
3685
3686         return 0;
3687 }
3688
3689 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3690 {
3691         return single_open_net(inode, file, rt6_stats_seq_show);
3692 }
3693
3694 static const struct file_operations rt6_stats_seq_fops = {
3695         .owner   = THIS_MODULE,
3696         .open    = rt6_stats_seq_open,
3697         .read    = seq_read,
3698         .llseek  = seq_lseek,
3699         .release = single_release_net,
3700 };
3701 #endif  /* CONFIG_PROC_FS */
3702
3703 #ifdef CONFIG_SYSCTL
3704
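/* Handler for the write-only "flush" entry in the IPv6 route sysctl table:
 * any write kicks fib6_run_gc() for the namespace; reads are rejected with
 * -EINVAL.
 */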
3705 static
3706 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3707                               void __user *buffer, size_t *lenp, loff_t *ppos)
3708 {
3709         struct net *net;
3710         int delay;
3711         if (!write)
3712                 return -EINVAL;
3713
3714         net = (struct net *)ctl->extra1;
3715         delay = net->ipv6.sysctl.flush_delay;
3716         proc_dointvec(ctl, write, buffer, lenp, ppos);
3717         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3718         return 0;
3719 }
3720
3721 struct ctl_table ipv6_route_table_template[] = {
3722         {
3723                 .procname       =       "flush",
3724                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3725                 .maxlen         =       sizeof(int),
3726                 .mode           =       0200,
3727                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3728         },
3729         {
3730                 .procname       =       "gc_thresh",
3731                 .data           =       &ip6_dst_ops_template.gc_thresh,
3732                 .maxlen         =       sizeof(int),
3733                 .mode           =       0644,
3734                 .proc_handler   =       proc_dointvec,
3735         },
3736         {
3737                 .procname       =       "max_size",
3738                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3739                 .maxlen         =       sizeof(int),
3740                 .mode           =       0644,
3741                 .proc_handler   =       proc_dointvec,
3742         },
3743         {
3744                 .procname       =       "gc_min_interval",
3745                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3746                 .maxlen         =       sizeof(int),
3747                 .mode           =       0644,
3748                 .proc_handler   =       proc_dointvec_jiffies,
3749         },
3750         {
3751                 .procname       =       "gc_timeout",
3752                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3753                 .maxlen         =       sizeof(int),
3754                 .mode           =       0644,
3755                 .proc_handler   =       proc_dointvec_jiffies,
3756         },
3757         {
3758                 .procname       =       "gc_interval",
3759                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3760                 .maxlen         =       sizeof(int),
3761                 .mode           =       0644,
3762                 .proc_handler   =       proc_dointvec_jiffies,
3763         },
3764         {
3765                 .procname       =       "gc_elasticity",
3766                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3767                 .maxlen         =       sizeof(int),
3768                 .mode           =       0644,
3769                 .proc_handler   =       proc_dointvec,
3770         },
3771         {
3772                 .procname       =       "mtu_expires",
3773                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3774                 .maxlen         =       sizeof(int),
3775                 .mode           =       0644,
3776                 .proc_handler   =       proc_dointvec_jiffies,
3777         },
3778         {
3779                 .procname       =       "min_adv_mss",
3780                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3781                 .maxlen         =       sizeof(int),
3782                 .mode           =       0644,
3783                 .proc_handler   =       proc_dointvec,
3784         },
3785         {
3786                 .procname       =       "gc_min_interval_ms",
3787                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3788                 .maxlen         =       sizeof(int),
3789                 .mode           =       0644,
3790                 .proc_handler   =       proc_dointvec_ms_jiffies,
3791         },
3792         { }
3793 };
3794
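/* Clone the sysctl template for a namespace and point each entry at the
 * per-netns storage.  The flush trigger is not exported to namespaces that
 * are not owned by the initial user namespace.
 */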
3795 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3796 {
3797         struct ctl_table *table;
3798
3799         table = kmemdup(ipv6_route_table_template,
3800                         sizeof(ipv6_route_table_template),
3801                         GFP_KERNEL);
3802
3803         if (table) {
3804                 table[0].data = &net->ipv6.sysctl.flush_delay;
3805                 table[0].extra1 = net;
3806                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3807                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3808                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3809                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3810                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3811                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3812                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3813                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3814                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3815
3816                 /* Don't export sysctls to unprivileged users */
3817                 if (net->user_ns != &init_user_ns)
3818                         table[0].procname = NULL;
3819         }
3820
3821         return table;
3822 }
3823 #endif
3824
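/* Per-netns setup: copy the dst_ops template, allocate the null (and, with
 * multiple tables, prohibit/blackhole) template routes and initialise the
 * routing sysctls to their defaults.
 */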
3825 static int __net_init ip6_route_net_init(struct net *net)
3826 {
3827         int ret = -ENOMEM;
3828
3829         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3830                sizeof(net->ipv6.ip6_dst_ops));
3831
3832         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3833                 goto out_ip6_dst_ops;
3834
3835         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3836                                            sizeof(*net->ipv6.ip6_null_entry),
3837                                            GFP_KERNEL);
3838         if (!net->ipv6.ip6_null_entry)
3839                 goto out_ip6_dst_entries;
3840         net->ipv6.ip6_null_entry->dst.path =
3841                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3842         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3843         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3844                          ip6_template_metrics, true);
3845
3846 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3847         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3848                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3849                                                GFP_KERNEL);
3850         if (!net->ipv6.ip6_prohibit_entry)
3851                 goto out_ip6_null_entry;
3852         net->ipv6.ip6_prohibit_entry->dst.path =
3853                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3854         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3855         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3856                          ip6_template_metrics, true);
3857
3858         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3859                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3860                                                GFP_KERNEL);
3861         if (!net->ipv6.ip6_blk_hole_entry)
3862                 goto out_ip6_prohibit_entry;
3863         net->ipv6.ip6_blk_hole_entry->dst.path =
3864                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3865         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3866         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3867                          ip6_template_metrics, true);
3868 #endif
3869
3870         net->ipv6.sysctl.flush_delay = 0;
3871         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3872         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3873         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3874         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3875         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3876         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3877         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3878
3879         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3880
3881         ret = 0;
3882 out:
3883         return ret;
3884
3885 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3886 out_ip6_prohibit_entry:
3887         kfree(net->ipv6.ip6_prohibit_entry);
3888 out_ip6_null_entry:
3889         kfree(net->ipv6.ip6_null_entry);
3890 #endif
3891 out_ip6_dst_entries:
3892         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3893 out_ip6_dst_ops:
3894         goto out;
3895 }
3896
3897 static void __net_exit ip6_route_net_exit(struct net *net)
3898 {
3899         kfree(net->ipv6.ip6_null_entry);
3900 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3901         kfree(net->ipv6.ip6_prohibit_entry);
3902         kfree(net->ipv6.ip6_blk_hole_entry);
3903 #endif
3904         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3905 }
3906
3907 static int __net_init ip6_route_net_init_late(struct net *net)
3908 {
3909 #ifdef CONFIG_PROC_FS
3910         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3911         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3912 #endif
3913         return 0;
3914 }
3915
3916 static void __net_exit ip6_route_net_exit_late(struct net *net)
3917 {
3918 #ifdef CONFIG_PROC_FS
3919         remove_proc_entry("ipv6_route", net->proc_net);
3920         remove_proc_entry("rt6_stats", net->proc_net);
3921 #endif
3922 }
3923
3924 static struct pernet_operations ip6_route_net_ops = {
3925         .init = ip6_route_net_init,
3926         .exit = ip6_route_net_exit,
3927 };
3928
3929 static int __net_init ipv6_inetpeer_init(struct net *net)
3930 {
3931         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3932
3933         if (!bp)
3934                 return -ENOMEM;
3935         inet_peer_base_init(bp);
3936         net->ipv6.peers = bp;
3937         return 0;
3938 }
3939
3940 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3941 {
3942         struct inet_peer_base *bp = net->ipv6.peers;
3943
3944         net->ipv6.peers = NULL;
3945         inetpeer_invalidate_tree(bp);
3946         kfree(bp);
3947 }
3948
3949 static struct pernet_operations ipv6_inetpeer_ops = {
3950         .init   =       ipv6_inetpeer_init,
3951         .exit   =       ipv6_inetpeer_exit,
3952 };
3953
3954 static struct pernet_operations ip6_route_net_late_ops = {
3955         .init = ip6_route_net_init_late,
3956         .exit = ip6_route_net_exit_late,
3957 };
3958
3959 static struct notifier_block ip6_route_dev_notifier = {
3960         .notifier_call = ip6_route_dev_notify,
3961         .priority = 0,
3962 };
3963
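/* Boot-time initialisation: create the rt6_info slab, register the pernet
 * subsystems, bring up fib6, xfrm6 and the policy rules, hook up the
 * rtnetlink route handlers and the netdevice notifier, and initialise the
 * per-cpu uncached route lists.  Errors unwind in reverse order.
 */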
3964 int __init ip6_route_init(void)
3965 {
3966         int ret;
3967         int cpu;
3968
3969         ret = -ENOMEM;
3970         ip6_dst_ops_template.kmem_cachep =
3971                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3972                                   SLAB_HWCACHE_ALIGN, NULL);
3973         if (!ip6_dst_ops_template.kmem_cachep)
3974                 goto out;
3975
3976         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3977         if (ret)
3978                 goto out_kmem_cache;
3979
3980         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3981         if (ret)
3982                 goto out_dst_entries;
3983
3984         ret = register_pernet_subsys(&ip6_route_net_ops);
3985         if (ret)
3986                 goto out_register_inetpeer;
3987
3988         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3989
3990         /* The loopback device is registered before this code runs, so the
3991          * loopback reference in rt6_info is not taken there; take it
3992          * manually for init_net. */
3993         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3994         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3995 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3996         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3997         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3998         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3999         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4000 #endif
4001         ret = fib6_init();
4002         if (ret)
4003                 goto out_register_subsys;
4004
4005         ret = xfrm6_init();
4006         if (ret)
4007                 goto out_fib6_init;
4008
4009         ret = fib6_rules_init();
4010         if (ret)
4011                 goto xfrm6_init;
4012
4013         ret = register_pernet_subsys(&ip6_route_net_late_ops);
4014         if (ret)
4015                 goto fib6_rules_init;
4016
4017         ret = -ENOBUFS;
4018         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
4019             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
4020             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
4021                 goto out_register_late_subsys;
4022
4023         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4024         if (ret)
4025                 goto out_register_late_subsys;
4026
4027         for_each_possible_cpu(cpu) {
4028                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4029
4030                 INIT_LIST_HEAD(&ul->head);
4031                 spin_lock_init(&ul->lock);
4032         }
4033
4034 out:
4035         return ret;
4036
4037 out_register_late_subsys:
4038         unregister_pernet_subsys(&ip6_route_net_late_ops);
4039 fib6_rules_init:
4040         fib6_rules_cleanup();
4041 xfrm6_init:
4042         xfrm6_fini();
4043 out_fib6_init:
4044         fib6_gc_cleanup();
4045 out_register_subsys:
4046         unregister_pernet_subsys(&ip6_route_net_ops);
4047 out_register_inetpeer:
4048         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4049 out_dst_entries:
4050         dst_entries_destroy(&ip6_dst_blackhole_ops);
4051 out_kmem_cache:
4052         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4053         goto out;
4054 }
4055
4056 void ip6_route_cleanup(void)
4057 {
4058         unregister_netdevice_notifier(&ip6_route_dev_notifier);
4059         unregister_pernet_subsys(&ip6_route_net_late_ops);
4060         fib6_rules_cleanup();
4061         xfrm6_fini();
4062         fib6_gc_cleanup();
4063         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4064         unregister_pernet_subsys(&ip6_route_net_ops);
4065         dst_entries_destroy(&ip6_dst_blackhole_ops);
4066         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4067 }