net-ipv6: on device mtu change do not add mtu to mtu-less routes
[linux-2.6-microblaze.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
66
67 #include <asm/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
/*
 * Result codes produced by rt6_check_neigh() and rt6_score_route().
 * Every failure code is negative, so callers can test "n < 0"; the
 * success code is positive.
 */
enum rt6_nud_state {
        RT6_NUD_FAIL_HARD = -3,         /* find_match() skips the route */
        RT6_NUD_FAIL_PROBE = -2,        /* neighbour is NUD_FAILED (CONFIG_IPV6_ROUTER_PREF only) */
        RT6_NUD_FAIL_DO_RR = -1,        /* find_match() scores it 0 and flags round-robin */
        RT6_NUD_SUCCEED = 1
};
79
/*
 * Forward declarations of file-local helpers.  Several of them are
 * referenced by the dst_ops tables and the route templates below before
 * their definitions appear.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
static unsigned int      ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void             ip6_dst_destroy(struct dst_entry *);
static void             ip6_dst_ifdown(struct dst_entry *,
                                       struct net_device *dev, int how);
static int               ip6_dst_gc(struct dst_ops *ops);

static int              ip6_pkt_discard(struct sk_buff *skb);
static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int              ip6_pkt_prohibit(struct sk_buff *skb);
static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void             ip6_link_failure(struct sk_buff *skb);
static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);

/* Route Information option helpers; only used by rt6_route_rcv(). */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev,
                                           unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev);
#endif
113
/*
 * A spinlock-protected list of rt6_info entries, linked through their
 * rt6i_uncached member.  One instance exists per CPU (rt6_uncached_list).
 */
struct uncached_list {
        spinlock_t              lock;   /* taken with spin_lock_bh() by the helpers below */
        struct list_head        head;   /* chain of rt6_info::rt6i_uncached nodes */
};

/* Per-CPU instance of the list above. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
120
121 static void rt6_uncached_list_add(struct rt6_info *rt)
122 {
123         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
124
125         rt->dst.flags |= DST_NOCACHE;
126         rt->rt6i_uncached_list = ul;
127
128         spin_lock_bh(&ul->lock);
129         list_add_tail(&rt->rt6i_uncached, &ul->head);
130         spin_unlock_bh(&ul->lock);
131 }
132
133 static void rt6_uncached_list_del(struct rt6_info *rt)
134 {
135         if (!list_empty(&rt->rt6i_uncached)) {
136                 struct uncached_list *ul = rt->rt6i_uncached_list;
137
138                 spin_lock_bh(&ul->lock);
139                 list_del(&rt->rt6i_uncached);
140                 spin_unlock_bh(&ul->lock);
141         }
142 }
143
/*
 * Walk the uncached list of every possible CPU and retarget entries that
 * still point at @dev onto @net's loopback device, moving the inet6_dev
 * and net_device references along with the pointers.  Nothing is done
 * when @dev is the loopback device itself.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
        struct net_device *loopback_dev = net->loopback_dev;
        int cpu;

        if (dev == loopback_dev)
                return;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
                struct rt6_info *rt;

                spin_lock_bh(&ul->lock);
                list_for_each_entry(rt, &ul->head, rt6i_uncached) {
                        struct inet6_dev *rt_idev = rt->rt6i_idev;
                        struct net_device *rt_dev = rt->dst.dev;

                        /* swap the idev: take the loopback one, then put the old one */
                        if (rt_idev->dev == dev) {
                                rt->rt6i_idev = in6_dev_get(loopback_dev);
                                in6_dev_put(rt_idev);
                        }

                        /* swap the device: hold loopback before dropping @dev */
                        if (rt_dev == dev) {
                                rt->dst.dev = loopback_dev;
                                dev_hold(rt->dst.dev);
                                dev_put(rt_dev);
                        }
                }
                spin_unlock_bh(&ul->lock);
        }
}
175
176 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
177 {
178         return dst_metrics_write_ptr(rt->dst.from);
179 }
180
181 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
182 {
183         struct rt6_info *rt = (struct rt6_info *)dst;
184
185         if (rt->rt6i_flags & RTF_PCPU)
186                 return rt6_pcpu_cow_metrics(rt);
187         else if (rt->rt6i_flags & RTF_CACHE)
188                 return NULL;
189         else
190                 return dst_cow_metrics_generic(dst, old);
191 }
192
193 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
194                                              struct sk_buff *skb,
195                                              const void *daddr)
196 {
197         struct in6_addr *p = &rt->rt6i_gateway;
198
199         if (!ipv6_addr_any(p))
200                 return (const void *) p;
201         else if (skb)
202                 return &ipv6_hdr(skb)->daddr;
203         return daddr;
204 }
205
206 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
207                                           struct sk_buff *skb,
208                                           const void *daddr)
209 {
210         struct rt6_info *rt = (struct rt6_info *) dst;
211         struct neighbour *n;
212
213         daddr = choose_neigh_daddr(rt, skb, daddr);
214         n = __ipv6_neigh_lookup(dst->dev, daddr);
215         if (n)
216                 return n;
217         return neigh_create(&nd_tbl, daddr, dst->dev);
218 }
219
/*
 * dst_ops template for regular IPv6 routes.
 * NOTE(review): presumably copied into each namespace's
 * net->ipv6.ip6_dst_ops (the instance used by __ip6_dst_alloc()); the
 * copy itself is not in this part of the file -- confirm.
 */
static struct dst_ops ip6_dst_ops_template = {
        .family                 =       AF_INET6,
        .gc                     =       ip6_dst_gc,
        .gc_thresh              =       1024,
        .check                  =       ip6_dst_check,
        .default_advmss         =       ip6_default_advmss,
        .mtu                    =       ip6_mtu,
        .cow_metrics            =       ipv6_cow_metrics,
        .destroy                =       ip6_dst_destroy,
        .ifdown                 =       ip6_dst_ifdown,
        .negative_advice        =       ip6_negative_advice,
        .link_failure           =       ip6_link_failure,
        .update_pmtu            =       ip6_rt_update_pmtu,
        .redirect               =       rt6_do_redirect,
        .local_out              =       __ip6_local_out,
        .neigh_lookup           =       ip6_neigh_lookup,
};
237
238 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
239 {
240         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
241
242         return mtu ? : dst->dev->mtu;
243 }
244
/*
 * Intentionally empty: installed as .update_pmtu in
 * ip6_dst_blackhole_ops, so PMTU updates on a blackhole dst are ignored.
 */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                         struct sk_buff *skb, u32 mtu)
{
}
249
/*
 * Intentionally empty: installed as .redirect in ip6_dst_blackhole_ops,
 * so redirects on a blackhole dst are ignored.
 */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                                      struct sk_buff *skb)
{
}
254
/*
 * dst_ops for blackhole dsts.  It shares destroy/check/default_advmss/
 * neigh_lookup with the regular IPv6 ops, but uses no-op update_pmtu and
 * redirect handlers, its own mtu handler and the generic cow_metrics.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
        .family                 =       AF_INET6,
        .destroy                =       ip6_dst_destroy,
        .check                  =       ip6_dst_check,
        .mtu                    =       ip6_blackhole_mtu,
        .default_advmss         =       ip6_default_advmss,
        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
        .redirect               =       ip6_rt_blackhole_redirect,
        .cow_metrics            =       dst_cow_metrics_generic,
        .neigh_lookup           =       ip6_neigh_lookup,
};
266
/*
 * All-zero metrics array; the hop-limit slot is spelled out explicitly.
 * NOTE(review): its users are not in this part of the file -- presumably
 * the default metrics for the template routes below; confirm.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
        [RTAX_HOPLIMIT - 1] = 0,
};
270
/*
 * Template for the "no route" reject entry: dst error is -ENETUNREACH
 * and the input/output handlers are the ip6_pkt_discard pair.
 * NOTE(review): presumably the source of net->ipv6.ip6_null_entry, whose
 * setup is not in this part of the file -- confirm.
 */
static const struct rt6_info ip6_null_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = DST_OBSOLETE_FORCE_CHK,
                .error          = -ENETUNREACH,
                .input          = ip6_pkt_discard,
                .output         = ip6_pkt_discard_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,     /* largest possible metric value */
        .rt6i_ref       = ATOMIC_INIT(1),
};
285
286 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
287
/*
 * Template for the "prohibit" reject entry (CONFIG_IPV6_MULTIPLE_TABLES
 * only): dst error is -EACCES and the input/output handlers are the
 * ip6_pkt_prohibit pair.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = DST_OBSOLETE_FORCE_CHK,
                .error          = -EACCES,
                .input          = ip6_pkt_prohibit,
                .output         = ip6_pkt_prohibit_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,     /* largest possible metric value */
        .rt6i_ref       = ATOMIC_INIT(1),
};
302
/*
 * Template for the "blackhole" reject entry (CONFIG_IPV6_MULTIPLE_TABLES
 * only): dst error is -EINVAL and packets go to the generic
 * dst_discard/dst_discard_out handlers.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = DST_OBSOLETE_FORCE_CHK,
                .error          = -EINVAL,
                .input          = dst_discard,
                .output         = dst_discard_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,     /* largest possible metric value */
        .rt6i_ref       = ATOMIC_INIT(1),
};
317
318 #endif
319
320 static void rt6_info_init(struct rt6_info *rt)
321 {
322         struct dst_entry *dst = &rt->dst;
323
324         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
325         INIT_LIST_HEAD(&rt->rt6i_siblings);
326         INIT_LIST_HEAD(&rt->rt6i_uncached);
327 }
328
329 /* allocate dst with ip6_dst_ops */
330 static struct rt6_info *__ip6_dst_alloc(struct net *net,
331                                         struct net_device *dev,
332                                         int flags)
333 {
334         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
335                                         0, DST_OBSOLETE_FORCE_CHK, flags);
336
337         if (rt)
338                 rt6_info_init(rt);
339
340         return rt;
341 }
342
343 struct rt6_info *ip6_dst_alloc(struct net *net,
344                                struct net_device *dev,
345                                int flags)
346 {
347         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
348
349         if (rt) {
350                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
351                 if (rt->rt6i_pcpu) {
352                         int cpu;
353
354                         for_each_possible_cpu(cpu) {
355                                 struct rt6_info **p;
356
357                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
358                                 /* no one shares rt */
359                                 *p =  NULL;
360                         }
361                 } else {
362                         dst_destroy((struct dst_entry *)rt);
363                         return NULL;
364                 }
365         }
366
367         return rt;
368 }
369 EXPORT_SYMBOL(ip6_dst_alloc);
370
/*
 * dst_ops->destroy handler: release everything an rt6_info owns -- its
 * metrics, the per-cpu pointer array, its slot on the uncached list, the
 * inet6_dev reference and the reference on the dst it was derived from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
        struct rt6_info *rt = (struct rt6_info *)dst;
        struct dst_entry *from = dst->from;
        struct inet6_dev *idev;

        dst_destroy_metrics_generic(dst);
        free_percpu(rt->rt6i_pcpu);
        rt6_uncached_list_del(rt);

        /* clear the pointer before dropping the reference */
        idev = rt->rt6i_idev;
        if (idev) {
                rt->rt6i_idev = NULL;
                in6_dev_put(idev);
        }

        /* likewise for the parent dst */
        dst->from = NULL;
        dst_release(from);
}
390
391 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
392                            int how)
393 {
394         struct rt6_info *rt = (struct rt6_info *)dst;
395         struct inet6_dev *idev = rt->rt6i_idev;
396         struct net_device *loopback_dev =
397                 dev_net(dev)->loopback_dev;
398
399         if (dev != loopback_dev) {
400                 if (idev && idev->dev == dev) {
401                         struct inet6_dev *loopback_idev =
402                                 in6_dev_get(loopback_dev);
403                         if (loopback_idev) {
404                                 rt->rt6i_idev = loopback_idev;
405                                 in6_dev_put(idev);
406                         }
407                 }
408         }
409 }
410
411 static bool __rt6_check_expired(const struct rt6_info *rt)
412 {
413         if (rt->rt6i_flags & RTF_EXPIRES)
414                 return time_after(jiffies, rt->dst.expires);
415         else
416                 return false;
417 }
418
419 static bool rt6_check_expired(const struct rt6_info *rt)
420 {
421         if (rt->rt6i_flags & RTF_EXPIRES) {
422                 if (time_after(jiffies, rt->dst.expires))
423                         return true;
424         } else if (rt->dst.from) {
425                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
426         }
427         return false;
428 }
429
430 /* Multipath route selection:
431  *   Hash based function using packet header and flowlabel.
432  * Adapted from fib_info_hashfn()
433  */
434 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
435                                const struct flowi6 *fl6)
436 {
437         return get_hash_from_flowi6(fl6) % candidate_count;
438 }
439
440 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
441                                              struct flowi6 *fl6, int oif,
442                                              int strict)
443 {
444         struct rt6_info *sibling, *next_sibling;
445         int route_choosen;
446
447         route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
448         /* Don't change the route, if route_choosen == 0
449          * (siblings does not include ourself)
450          */
451         if (route_choosen)
452                 list_for_each_entry_safe(sibling, next_sibling,
453                                 &match->rt6i_siblings, rt6i_siblings) {
454                         route_choosen--;
455                         if (route_choosen == 0) {
456                                 if (rt6_score_route(sibling, oif, strict) < 0)
457                                         break;
458                                 match = sibling;
459                                 break;
460                         }
461                 }
462         return match;
463 }
464
465 /*
466  *      Route lookup. Any table->tb6_lock is implied.
467  */
468
469 static inline struct rt6_info *rt6_device_match(struct net *net,
470                                                     struct rt6_info *rt,
471                                                     const struct in6_addr *saddr,
472                                                     int oif,
473                                                     int flags)
474 {
475         struct rt6_info *local = NULL;
476         struct rt6_info *sprt;
477
478         if (!oif && ipv6_addr_any(saddr))
479                 goto out;
480
481         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
482                 struct net_device *dev = sprt->dst.dev;
483
484                 if (oif) {
485                         if (dev->ifindex == oif)
486                                 return sprt;
487                         if (dev->flags & IFF_LOOPBACK) {
488                                 if (!sprt->rt6i_idev ||
489                                     sprt->rt6i_idev->dev->ifindex != oif) {
490                                         if (flags & RT6_LOOKUP_F_IFACE)
491                                                 continue;
492                                         if (local &&
493                                             local->rt6i_idev->dev->ifindex == oif)
494                                                 continue;
495                                 }
496                                 local = sprt;
497                         }
498                 } else {
499                         if (ipv6_chk_addr(net, saddr, dev,
500                                           flags & RT6_LOOKUP_F_IFACE))
501                                 return sprt;
502                 }
503         }
504
505         if (oif) {
506                 if (local)
507                         return local;
508
509                 if (flags & RT6_LOOKUP_F_IFACE)
510                         return net->ipv6.ip6_null_entry;
511         }
512 out:
513         return rt;
514 }
515
516 #ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Deferred router probe request, allocated by rt6_probe() and consumed
 * (and freed) by rt6_probe_deferred().
 */
struct __rt6_probe_work {
        struct work_struct work;
        struct in6_addr target;         /* address to solicit (the route's gateway) */
        struct net_device *dev;         /* held by rt6_probe(), put by the worker */
};
522
523 static void rt6_probe_deferred(struct work_struct *w)
524 {
525         struct in6_addr mcaddr;
526         struct __rt6_probe_work *work =
527                 container_of(w, struct __rt6_probe_work, work);
528
529         addrconf_addr_solict_mult(&work->target, &mcaddr);
530         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL);
531         dev_put(work->dev);
532         kfree(work);
533 }
534
/*
 * Schedule a deferred neighbour solicitation towards the gateway of @rt.
 *
 * Nothing happens for a NULL route or one without RTF_GATEWAY.  If a
 * neighbour entry exists, a probe is only queued when the entry is not
 * NUD_VALID and more than the device's rtr_probe_interval has passed
 * since it was last updated.  If there is no neighbour entry at all, a
 * probe is always queued.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct __rt6_probe_work *work;
        struct neighbour *neigh;
        /*
         * Okay, this does not seem to be appropriate
         * for now, however, we need to check if it
         * is really so; aka Router Reachability Probing.
         *
         * Router Reachability Probe MUST be rate-limited
         * to no more than one per minute.
         */
        if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
                return;
        rcu_read_lock_bh();
        neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
        if (neigh) {
                /* unlocked fast path; "work" is not yet set but is not used after out: */
                if (neigh->nud_state & NUD_VALID)
                        goto out;

                work = NULL;
                write_lock(&neigh->lock);
                /* re-check the state now that the neighbour lock is held */
                if (!(neigh->nud_state & NUD_VALID) &&
                    time_after(jiffies,
                               neigh->updated +
                               rt->rt6i_idev->cnf.rtr_probe_interval)) {
                        /* GFP_ATOMIC: allocated while holding neigh->lock */
                        work = kmalloc(sizeof(*work), GFP_ATOMIC);
                        if (work)
                                __neigh_set_probe_once(neigh);
                }
                write_unlock(&neigh->lock);
        } else {
                work = kmalloc(sizeof(*work), GFP_ATOMIC);
        }

        if (work) {
                INIT_WORK(&work->work, rt6_probe_deferred);
                work->target = rt->rt6i_gateway;
                /* reference is dropped by rt6_probe_deferred() */
                dev_hold(rt->dst.dev);
                work->dev = rt->dst.dev;
                schedule_work(&work->work);
        }

out:
        rcu_read_unlock_bh();
}
581 #else
/* Without CONFIG_IPV6_ROUTER_PREF router probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
585 #endif
586
587 /*
588  * Default Router Selection (RFC 2461 6.3.6)
589  */
590 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
591 {
592         struct net_device *dev = rt->dst.dev;
593         if (!oif || dev->ifindex == oif)
594                 return 2;
595         if ((dev->flags & IFF_LOOPBACK) &&
596             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
597                 return 1;
598         return 0;
599 }
600
/*
 * Classify the reachability of the next hop of @rt.
 *
 * Routes flagged RTF_NONEXTHOP, or without RTF_GATEWAY, always succeed.
 * Otherwise the gateway's neighbour entry decides:
 *  - NUD_VALID                     -> RT6_NUD_SUCCEED
 *  - with CONFIG_IPV6_ROUTER_PREF:
 *      not NUD_FAILED              -> RT6_NUD_SUCCEED
 *      NUD_FAILED                  -> RT6_NUD_FAIL_PROBE
 *  - without it, any other state   -> RT6_NUD_FAIL_HARD (initial value)
 *  - no neighbour entry            -> RT6_NUD_SUCCEED with ROUTER_PREF,
 *                                     RT6_NUD_FAIL_DO_RR without.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
        struct neighbour *neigh;
        enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

        if (rt->rt6i_flags & RTF_NONEXTHOP ||
            !(rt->rt6i_flags & RTF_GATEWAY))
                return RT6_NUD_SUCCEED;

        rcu_read_lock_bh();
        neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
        if (neigh) {
                read_lock(&neigh->lock);
                if (neigh->nud_state & NUD_VALID)
                        ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
                else if (!(neigh->nud_state & NUD_FAILED))
                        ret = RT6_NUD_SUCCEED;
                else
                        ret = RT6_NUD_FAIL_PROBE;
#endif
                read_unlock(&neigh->lock);
        } else {
                ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
                      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
        }
        rcu_read_unlock_bh();

        return ret;
}
631
632 static int rt6_score_route(struct rt6_info *rt, int oif,
633                            int strict)
634 {
635         int m;
636
637         m = rt6_check_dev(rt, oif);
638         if (!m && (strict & RT6_LOOKUP_F_IFACE))
639                 return RT6_NUD_FAIL_HARD;
640 #ifdef CONFIG_IPV6_ROUTER_PREF
641         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
642 #endif
643         if (strict & RT6_LOOKUP_F_REACHABLE) {
644                 int n = rt6_check_neigh(rt);
645                 if (n < 0)
646                         return n;
647         }
648         return m;
649 }
650
651 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
652                                    int *mpri, struct rt6_info *match,
653                                    bool *do_rr)
654 {
655         int m;
656         bool match_do_rr = false;
657         struct inet6_dev *idev = rt->rt6i_idev;
658         struct net_device *dev = rt->dst.dev;
659
660         if (dev && !netif_carrier_ok(dev) &&
661             idev->cnf.ignore_routes_with_linkdown &&
662             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
663                 goto out;
664
665         if (rt6_check_expired(rt))
666                 goto out;
667
668         m = rt6_score_route(rt, oif, strict);
669         if (m == RT6_NUD_FAIL_DO_RR) {
670                 match_do_rr = true;
671                 m = 0; /* lowest valid score */
672         } else if (m == RT6_NUD_FAIL_HARD) {
673                 goto out;
674         }
675
676         if (strict & RT6_LOOKUP_F_REACHABLE)
677                 rt6_probe(rt);
678
679         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
680         if (m > *mpri) {
681                 *do_rr = match_do_rr;
682                 *mpri = m;
683                 match = rt;
684         }
685 out:
686         return match;
687 }
688
/*
 * Find the best route in the leaf list of @fn, starting the scan at the
 * round-robin position @rr_head.
 *
 * Pass 1 walks from @rr_head to the end of the list, pass 2 from
 * fn->leaf up to @rr_head; both stop at the first entry whose metric
 * differs from @metric and remember it in "cont".  Only if neither pass
 * produced a match does pass 3 scan from "cont" onwards, regardless of
 * metric.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
                                     struct rt6_info *rr_head,
                                     u32 metric, int oif, int strict,
                                     bool *do_rr)
{
        struct rt6_info *rt, *match, *cont;
        int mpri = -1;

        match = NULL;
        cont = NULL;
        /* pass 1: rr_head .. end, same metric only */
        for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
                if (rt->rt6i_metric != metric) {
                        cont = rt;
                        break;
                }

                match = find_match(rt, oif, strict, &mpri, match, do_rr);
        }

        /* pass 2: wrap around, leaf .. rr_head, same metric only */
        for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
                if (rt->rt6i_metric != metric) {
                        cont = rt;
                        break;
                }

                match = find_match(rt, oif, strict, &mpri, match, do_rr);
        }

        if (match || !cont)
                return match;

        /* pass 3: nothing usable at this metric, try the remaining entries */
        for (rt = cont; rt; rt = rt->dst.rt6_next)
                match = find_match(rt, oif, strict, &mpri, match, do_rr);

        return match;
}
725
/*
 * Select the route to use from node @fn.
 *
 * The scan starts at fn->rr_ptr (initialised to fn->leaf on first use).
 * When the chosen route requested round-robin, rr_ptr is advanced to the
 * next entry with the same metric, wrapping to fn->leaf.  Returns the
 * namespace's null entry when nothing matched.
 */
static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
        struct rt6_info *match, *rt0;
        struct net *net;
        bool do_rr = false;

        rt0 = fn->rr_ptr;
        if (!rt0)
                fn->rr_ptr = rt0 = fn->leaf;

        match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
                             &do_rr);

        if (do_rr) {
                struct rt6_info *next = rt0->dst.rt6_next;

                /* no entries matched; do round-robin */
                if (!next || next->rt6i_metric != rt0->rt6i_metric)
                        next = fn->leaf;

                /* only touch rr_ptr when it actually changes */
                if (next != rt0)
                        fn->rr_ptr = next;
        }

        net = dev_net(rt0->dst.dev);
        return match ? match : net->ipv6.ip6_null_entry;
}
753
754 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
755 {
756         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
757 }
758
759 #ifdef CONFIG_IPV6_ROUTE_INFO
760 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
761                   const struct in6_addr *gwaddr)
762 {
763         struct net *net = dev_net(dev);
764         struct route_info *rinfo = (struct route_info *) opt;
765         struct in6_addr prefix_buf, *prefix;
766         unsigned int pref;
767         unsigned long lifetime;
768         struct rt6_info *rt;
769
770         if (len < sizeof(struct route_info)) {
771                 return -EINVAL;
772         }
773
774         /* Sanity check for prefix_len and length */
775         if (rinfo->length > 3) {
776                 return -EINVAL;
777         } else if (rinfo->prefix_len > 128) {
778                 return -EINVAL;
779         } else if (rinfo->prefix_len > 64) {
780                 if (rinfo->length < 2) {
781                         return -EINVAL;
782                 }
783         } else if (rinfo->prefix_len > 0) {
784                 if (rinfo->length < 1) {
785                         return -EINVAL;
786                 }
787         }
788
789         pref = rinfo->route_pref;
790         if (pref == ICMPV6_ROUTER_PREF_INVALID)
791                 return -EINVAL;
792
793         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
794
795         if (rinfo->length == 3)
796                 prefix = (struct in6_addr *)rinfo->prefix;
797         else {
798                 /* this function is safe */
799                 ipv6_addr_prefix(&prefix_buf,
800                                  (struct in6_addr *)rinfo->prefix,
801                                  rinfo->prefix_len);
802                 prefix = &prefix_buf;
803         }
804
805         if (rinfo->prefix_len == 0)
806                 rt = rt6_get_dflt_router(gwaddr, dev);
807         else
808                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
809                                         gwaddr, dev);
810
811         if (rt && !lifetime) {
812                 ip6_del_rt(rt);
813                 rt = NULL;
814         }
815
816         if (!rt && lifetime)
817                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
818                                         dev, pref);
819         else if (rt)
820                 rt->rt6i_flags = RTF_ROUTEINFO |
821                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
822
823         if (rt) {
824                 if (!addrconf_finite_timeout(lifetime))
825                         rt6_clean_expires(rt);
826                 else
827                         rt6_set_expires(rt, jiffies + HZ * lifetime);
828
829                 ip6_rt_put(rt);
830         }
831         return 0;
832 }
833 #endif
834
835 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
836                                         struct in6_addr *saddr)
837 {
838         struct fib6_node *pn;
839         while (1) {
840                 if (fn->fn_flags & RTN_TL_ROOT)
841                         return NULL;
842                 pn = fn->parent;
843                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
844                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
845                 else
846                         fn = pn;
847                 if (fn->fn_flags & RTN_RTINFO)
848                         return fn;
849         }
850 }
851
/*
 * Look up the route for @fl6 in @table.
 *
 * Under table->tb6_lock (read side) the best fib node is located, a
 * route is picked by device/source match and, for multipath routes with
 * no output interface requested, by flow hash.  While the result is the
 * null entry the search backtracks towards the root.  The returned dst
 * has been passed through dst_use().
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt;

        read_lock_bh(&table->tb6_lock);
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
        rt = fn->leaf;
        rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
        if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
                rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
        if (rt == net->ipv6.ip6_null_entry) {
                /* nothing usable here: retry from a less specific node */
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto restart;
        }
        dst_use(&rt->dst, jiffies);
        read_unlock_bh(&table->tb6_lock);

        trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

        return rt;

}
879
880 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
881                                     int flags)
882 {
883         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
884 }
885 EXPORT_SYMBOL_GPL(ip6_route_lookup);
886
887 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
888                             const struct in6_addr *saddr, int oif, int strict)
889 {
890         struct flowi6 fl6 = {
891                 .flowi6_oif = oif,
892                 .daddr = *daddr,
893         };
894         struct dst_entry *dst;
895         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
896
897         if (saddr) {
898                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
899                 flags |= RT6_LOOKUP_F_HAS_SADDR;
900         }
901
902         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
903         if (dst->error == 0)
904                 return (struct rt6_info *) dst;
905
906         dst_release(dst);
907
908         return NULL;
909 }
910 EXPORT_SYMBOL(rt6_lookup);
911
/* ip6_ins_rt is called with table->tb6_lock NOT held; the lock is taken
 * while the route is inserted.  It takes a new route entry; if the
 * addition fails for any reason, the route is freed.  In any case, if
 * the caller does not hold a reference to the route, it may be destroyed.
 */
917
918 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
919                         struct mx6_config *mxc)
920 {
921         int err;
922         struct fib6_table *table;
923
924         table = rt->rt6i_table;
925         write_lock_bh(&table->tb6_lock);
926         err = fib6_add(&table->tb6_root, rt, info, mxc);
927         write_unlock_bh(&table->tb6_lock);
928
929         return err;
930 }
931
932 int ip6_ins_rt(struct rt6_info *rt)
933 {
934         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
935         struct mx6_config mxc = { .mx = NULL, };
936
937         return __ip6_ins_rt(rt, &info, &mxc);
938 }
939
940 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
941                                            const struct in6_addr *daddr,
942                                            const struct in6_addr *saddr)
943 {
944         struct rt6_info *rt;
945
946         /*
947          *      Clone the route.
948          */
949
950         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
951                 ort = (struct rt6_info *)ort->dst.from;
952
953         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
954
955         if (!rt)
956                 return NULL;
957
958         ip6_rt_copy_init(rt, ort);
959         rt->rt6i_flags |= RTF_CACHE;
960         rt->rt6i_metric = 0;
961         rt->dst.flags |= DST_HOST;
962         rt->rt6i_dst.addr = *daddr;
963         rt->rt6i_dst.plen = 128;
964
965         if (!rt6_is_gw_or_nonexthop(ort)) {
966                 if (ort->rt6i_dst.plen != 128 &&
967                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
968                         rt->rt6i_flags |= RTF_ANYCAST;
969 #ifdef CONFIG_IPV6_SUBTREES
970                 if (rt->rt6i_src.plen && saddr) {
971                         rt->rt6i_src.addr = *saddr;
972                         rt->rt6i_src.plen = 128;
973                 }
974 #endif
975         }
976
977         return rt;
978 }
979
980 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
981 {
982         struct rt6_info *pcpu_rt;
983
984         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
985                                   rt->dst.dev, rt->dst.flags);
986
987         if (!pcpu_rt)
988                 return NULL;
989         ip6_rt_copy_init(pcpu_rt, rt);
990         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
991         pcpu_rt->rt6i_flags |= RTF_PCPU;
992         return pcpu_rt;
993 }
994
995 /* It should be called with read_lock_bh(&tb6_lock) acquired */
996 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
997 {
998         struct rt6_info *pcpu_rt, **p;
999
1000         p = this_cpu_ptr(rt->rt6i_pcpu);
1001         pcpu_rt = *p;
1002
1003         if (pcpu_rt) {
1004                 dst_hold(&pcpu_rt->dst);
1005                 rt6_dst_from_metrics_check(pcpu_rt);
1006         }
1007         return pcpu_rt;
1008 }
1009
1010 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1011 {
1012         struct fib6_table *table = rt->rt6i_table;
1013         struct rt6_info *pcpu_rt, *prev, **p;
1014
1015         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1016         if (!pcpu_rt) {
1017                 struct net *net = dev_net(rt->dst.dev);
1018
1019                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1020                 return net->ipv6.ip6_null_entry;
1021         }
1022
1023         read_lock_bh(&table->tb6_lock);
1024         if (rt->rt6i_pcpu) {
1025                 p = this_cpu_ptr(rt->rt6i_pcpu);
1026                 prev = cmpxchg(p, NULL, pcpu_rt);
1027                 if (prev) {
1028                         /* If someone did it before us, return prev instead */
1029                         dst_destroy(&pcpu_rt->dst);
1030                         pcpu_rt = prev;
1031                 }
1032         } else {
1033                 /* rt has been removed from the fib6 tree
1034                  * before we have a chance to acquire the read_lock.
1035                  * In this case, don't brother to create a pcpu rt
1036                  * since rt is going away anyway.  The next
1037                  * dst_check() will trigger a re-lookup.
1038                  */
1039                 dst_destroy(&pcpu_rt->dst);
1040                 pcpu_rt = rt;
1041         }
1042         dst_hold(&pcpu_rt->dst);
1043         rt6_dst_from_metrics_check(pcpu_rt);
1044         read_unlock_bh(&table->tb6_lock);
1045         return pcpu_rt;
1046 }
1047
1048 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1049                                int oif, struct flowi6 *fl6, int flags)
1050 {
1051         struct fib6_node *fn, *saved_fn;
1052         struct rt6_info *rt;
1053         int strict = 0;
1054
1055         strict |= flags & RT6_LOOKUP_F_IFACE;
1056         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1057         if (net->ipv6.devconf_all->forwarding == 0)
1058                 strict |= RT6_LOOKUP_F_REACHABLE;
1059
1060         read_lock_bh(&table->tb6_lock);
1061
1062         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1063         saved_fn = fn;
1064
1065         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1066                 oif = 0;
1067
1068 redo_rt6_select:
1069         rt = rt6_select(fn, oif, strict);
1070         if (rt->rt6i_nsiblings)
1071                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1072         if (rt == net->ipv6.ip6_null_entry) {
1073                 fn = fib6_backtrack(fn, &fl6->saddr);
1074                 if (fn)
1075                         goto redo_rt6_select;
1076                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1077                         /* also consider unreachable route */
1078                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1079                         fn = saved_fn;
1080                         goto redo_rt6_select;
1081                 }
1082         }
1083
1084
1085         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1086                 dst_use(&rt->dst, jiffies);
1087                 read_unlock_bh(&table->tb6_lock);
1088
1089                 rt6_dst_from_metrics_check(rt);
1090
1091                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1092                 return rt;
1093         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1094                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1095                 /* Create a RTF_CACHE clone which will not be
1096                  * owned by the fib6 tree.  It is for the special case where
1097                  * the daddr in the skb during the neighbor look-up is different
1098                  * from the fl6->daddr used to look-up route here.
1099                  */
1100
1101                 struct rt6_info *uncached_rt;
1102
1103                 dst_use(&rt->dst, jiffies);
1104                 read_unlock_bh(&table->tb6_lock);
1105
1106                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1107                 dst_release(&rt->dst);
1108
1109                 if (uncached_rt)
1110                         rt6_uncached_list_add(uncached_rt);
1111                 else
1112                         uncached_rt = net->ipv6.ip6_null_entry;
1113
1114                 dst_hold(&uncached_rt->dst);
1115
1116                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1117                 return uncached_rt;
1118
1119         } else {
1120                 /* Get a percpu copy */
1121
1122                 struct rt6_info *pcpu_rt;
1123
1124                 rt->dst.lastuse = jiffies;
1125                 rt->dst.__use++;
1126                 pcpu_rt = rt6_get_pcpu_route(rt);
1127
1128                 if (pcpu_rt) {
1129                         read_unlock_bh(&table->tb6_lock);
1130                 } else {
1131                         /* We have to do the read_unlock first
1132                          * because rt6_make_pcpu_route() may trigger
1133                          * ip6_dst_gc() which will take the write_lock.
1134                          */
1135                         dst_hold(&rt->dst);
1136                         read_unlock_bh(&table->tb6_lock);
1137                         pcpu_rt = rt6_make_pcpu_route(rt);
1138                         dst_release(&rt->dst);
1139                 }
1140
1141                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1142                 return pcpu_rt;
1143
1144         }
1145 }
1146 EXPORT_SYMBOL_GPL(ip6_pol_route);
1147
1148 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1149                                             struct flowi6 *fl6, int flags)
1150 {
1151         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1152 }
1153
1154 struct dst_entry *ip6_route_input_lookup(struct net *net,
1155                                          struct net_device *dev,
1156                                          struct flowi6 *fl6, int flags)
1157 {
1158         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1159                 flags |= RT6_LOOKUP_F_IFACE;
1160
1161         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1162 }
1163 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1164
1165 void ip6_route_input(struct sk_buff *skb)
1166 {
1167         const struct ipv6hdr *iph = ipv6_hdr(skb);
1168         struct net *net = dev_net(skb->dev);
1169         int flags = RT6_LOOKUP_F_HAS_SADDR;
1170         struct ip_tunnel_info *tun_info;
1171         struct flowi6 fl6 = {
1172                 .flowi6_iif = skb->dev->ifindex,
1173                 .daddr = iph->daddr,
1174                 .saddr = iph->saddr,
1175                 .flowlabel = ip6_flowinfo(iph),
1176                 .flowi6_mark = skb->mark,
1177                 .flowi6_proto = iph->nexthdr,
1178         };
1179
1180         tun_info = skb_tunnel_info(skb);
1181         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1182                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1183         skb_dst_drop(skb);
1184         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1185 }
1186
1187 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1188                                              struct flowi6 *fl6, int flags)
1189 {
1190         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1191 }
1192
1193 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1194                                          struct flowi6 *fl6, int flags)
1195 {
1196         bool any_src;
1197
1198         if (rt6_need_strict(&fl6->daddr)) {
1199                 struct dst_entry *dst;
1200
1201                 dst = l3mdev_link_scope_lookup(net, fl6);
1202                 if (dst)
1203                         return dst;
1204         }
1205
1206         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1207
1208         any_src = ipv6_addr_any(&fl6->saddr);
1209         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1210             (fl6->flowi6_oif && any_src))
1211                 flags |= RT6_LOOKUP_F_IFACE;
1212
1213         if (!any_src)
1214                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1215         else if (sk)
1216                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1217
1218         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1219 }
1220 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1221
1222 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1223 {
1224         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1225         struct dst_entry *new = NULL;
1226
1227         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1228         if (rt) {
1229                 rt6_info_init(rt);
1230
1231                 new = &rt->dst;
1232                 new->__use = 1;
1233                 new->input = dst_discard;
1234                 new->output = dst_discard_out;
1235
1236                 dst_copy_metrics(new, &ort->dst);
1237                 rt->rt6i_idev = ort->rt6i_idev;
1238                 if (rt->rt6i_idev)
1239                         in6_dev_hold(rt->rt6i_idev);
1240
1241                 rt->rt6i_gateway = ort->rt6i_gateway;
1242                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1243                 rt->rt6i_metric = 0;
1244
1245                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1246 #ifdef CONFIG_IPV6_SUBTREES
1247                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1248 #endif
1249
1250                 dst_free(new);
1251         }
1252
1253         dst_release(dst_orig);
1254         return new ? new : ERR_PTR(-ENOMEM);
1255 }
1256
1257 /*
1258  *      Destination cache support functions
1259  */
1260
1261 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1262 {
1263         if (rt->dst.from &&
1264             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1265                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1266 }
1267
1268 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1269 {
1270         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1271                 return NULL;
1272
1273         if (rt6_check_expired(rt))
1274                 return NULL;
1275
1276         return &rt->dst;
1277 }
1278
1279 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1280 {
1281         if (!__rt6_check_expired(rt) &&
1282             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1283             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1284                 return &rt->dst;
1285         else
1286                 return NULL;
1287 }
1288
1289 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1290 {
1291         struct rt6_info *rt;
1292
1293         rt = (struct rt6_info *) dst;
1294
1295         /* All IPV6 dsts are created with ->obsolete set to the value
1296          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1297          * into this function always.
1298          */
1299
1300         rt6_dst_from_metrics_check(rt);
1301
1302         if (rt->rt6i_flags & RTF_PCPU ||
1303             (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
1304                 return rt6_dst_from_check(rt, cookie);
1305         else
1306                 return rt6_check(rt, cookie);
1307 }
1308
1309 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1310 {
1311         struct rt6_info *rt = (struct rt6_info *) dst;
1312
1313         if (rt) {
1314                 if (rt->rt6i_flags & RTF_CACHE) {
1315                         if (rt6_check_expired(rt)) {
1316                                 ip6_del_rt(rt);
1317                                 dst = NULL;
1318                         }
1319                 } else {
1320                         dst_release(dst);
1321                         dst = NULL;
1322                 }
1323         }
1324         return dst;
1325 }
1326
1327 static void ip6_link_failure(struct sk_buff *skb)
1328 {
1329         struct rt6_info *rt;
1330
1331         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1332
1333         rt = (struct rt6_info *) skb_dst(skb);
1334         if (rt) {
1335                 if (rt->rt6i_flags & RTF_CACHE) {
1336                         dst_hold(&rt->dst);
1337                         ip6_del_rt(rt);
1338                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1339                         rt->rt6i_node->fn_sernum = -1;
1340                 }
1341         }
1342 }
1343
1344 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1345 {
1346         struct net *net = dev_net(rt->dst.dev);
1347
1348         rt->rt6i_flags |= RTF_MODIFIED;
1349         rt->rt6i_pmtu = mtu;
1350         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1351 }
1352
1353 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1354 {
1355         return !(rt->rt6i_flags & RTF_CACHE) &&
1356                 (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
1357 }
1358
1359 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1360                                  const struct ipv6hdr *iph, u32 mtu)
1361 {
1362         struct rt6_info *rt6 = (struct rt6_info *)dst;
1363
1364         if (rt6->rt6i_flags & RTF_LOCAL)
1365                 return;
1366
1367         if (dst_metric_locked(dst, RTAX_MTU))
1368                 return;
1369
1370         dst_confirm(dst);
1371         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1372         if (mtu >= dst_mtu(dst))
1373                 return;
1374
1375         if (!rt6_cache_allowed_for_pmtu(rt6)) {
1376                 rt6_do_update_pmtu(rt6, mtu);
1377         } else {
1378                 const struct in6_addr *daddr, *saddr;
1379                 struct rt6_info *nrt6;
1380
1381                 if (iph) {
1382                         daddr = &iph->daddr;
1383                         saddr = &iph->saddr;
1384                 } else if (sk) {
1385                         daddr = &sk->sk_v6_daddr;
1386                         saddr = &inet6_sk(sk)->saddr;
1387                 } else {
1388                         return;
1389                 }
1390                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1391                 if (nrt6) {
1392                         rt6_do_update_pmtu(nrt6, mtu);
1393
1394                         /* ip6_ins_rt(nrt6) will bump the
1395                          * rt6->rt6i_node->fn_sernum
1396                          * which will fail the next rt6_check() and
1397                          * invalidate the sk->sk_dst_cache.
1398                          */
1399                         ip6_ins_rt(nrt6);
1400                 }
1401         }
1402 }
1403
1404 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1405                                struct sk_buff *skb, u32 mtu)
1406 {
1407         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1408 }
1409
1410 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1411                      int oif, u32 mark)
1412 {
1413         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1414         struct dst_entry *dst;
1415         struct flowi6 fl6;
1416
1417         memset(&fl6, 0, sizeof(fl6));
1418         fl6.flowi6_oif = oif;
1419         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1420         fl6.daddr = iph->daddr;
1421         fl6.saddr = iph->saddr;
1422         fl6.flowlabel = ip6_flowinfo(iph);
1423
1424         dst = ip6_route_output(net, NULL, &fl6);
1425         if (!dst->error)
1426                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1427         dst_release(dst);
1428 }
1429 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1430
1431 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1432 {
1433         struct dst_entry *dst;
1434
1435         ip6_update_pmtu(skb, sock_net(sk), mtu,
1436                         sk->sk_bound_dev_if, sk->sk_mark);
1437
1438         dst = __sk_dst_get(sk);
1439         if (!dst || !dst->obsolete ||
1440             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1441                 return;
1442
1443         bh_lock_sock(sk);
1444         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1445                 ip6_datagram_dst_update(sk, false);
1446         bh_unlock_sock(sk);
1447 }
1448 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1449
1450 /* Handle redirects */
1451 struct ip6rd_flowi {
1452         struct flowi6 fl6;
1453         struct in6_addr gateway;
1454 };
1455
1456 static struct rt6_info *__ip6_route_redirect(struct net *net,
1457                                              struct fib6_table *table,
1458                                              struct flowi6 *fl6,
1459                                              int flags)
1460 {
1461         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1462         struct rt6_info *rt;
1463         struct fib6_node *fn;
1464
1465         /* Get the "current" route for this destination and
1466          * check if the redirect has come from approriate router.
1467          *
1468          * RFC 4861 specifies that redirects should only be
1469          * accepted if they come from the nexthop to the target.
1470          * Due to the way the routes are chosen, this notion
1471          * is a bit fuzzy and one might need to check all possible
1472          * routes.
1473          */
1474
1475         read_lock_bh(&table->tb6_lock);
1476         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1477 restart:
1478         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1479                 if (rt6_check_expired(rt))
1480                         continue;
1481                 if (rt->dst.error)
1482                         break;
1483                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1484                         continue;
1485                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1486                         continue;
1487                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1488                         continue;
1489                 break;
1490         }
1491
1492         if (!rt)
1493                 rt = net->ipv6.ip6_null_entry;
1494         else if (rt->dst.error) {
1495                 rt = net->ipv6.ip6_null_entry;
1496                 goto out;
1497         }
1498
1499         if (rt == net->ipv6.ip6_null_entry) {
1500                 fn = fib6_backtrack(fn, &fl6->saddr);
1501                 if (fn)
1502                         goto restart;
1503         }
1504
1505 out:
1506         dst_hold(&rt->dst);
1507
1508         read_unlock_bh(&table->tb6_lock);
1509
1510         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1511         return rt;
1512 };
1513
1514 static struct dst_entry *ip6_route_redirect(struct net *net,
1515                                         const struct flowi6 *fl6,
1516                                         const struct in6_addr *gateway)
1517 {
1518         int flags = RT6_LOOKUP_F_HAS_SADDR;
1519         struct ip6rd_flowi rdfl;
1520
1521         rdfl.fl6 = *fl6;
1522         rdfl.gateway = *gateway;
1523
1524         return fib6_rule_lookup(net, &rdfl.fl6,
1525                                 flags, __ip6_route_redirect);
1526 }
1527
1528 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1529 {
1530         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1531         struct dst_entry *dst;
1532         struct flowi6 fl6;
1533
1534         memset(&fl6, 0, sizeof(fl6));
1535         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1536         fl6.flowi6_oif = oif;
1537         fl6.flowi6_mark = mark;
1538         fl6.daddr = iph->daddr;
1539         fl6.saddr = iph->saddr;
1540         fl6.flowlabel = ip6_flowinfo(iph);
1541
1542         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1543         rt6_do_redirect(dst, NULL, skb);
1544         dst_release(dst);
1545 }
1546 EXPORT_SYMBOL_GPL(ip6_redirect);
1547
1548 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1549                             u32 mark)
1550 {
1551         const struct ipv6hdr *iph = ipv6_hdr(skb);
1552         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1553         struct dst_entry *dst;
1554         struct flowi6 fl6;
1555
1556         memset(&fl6, 0, sizeof(fl6));
1557         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1558         fl6.flowi6_oif = oif;
1559         fl6.flowi6_mark = mark;
1560         fl6.daddr = msg->dest;
1561         fl6.saddr = iph->daddr;
1562
1563         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1564         rt6_do_redirect(dst, NULL, skb);
1565         dst_release(dst);
1566 }
1567
1568 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1569 {
1570         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1571 }
1572 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1573
1574 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1575 {
1576         struct net_device *dev = dst->dev;
1577         unsigned int mtu = dst_mtu(dst);
1578         struct net *net = dev_net(dev);
1579
1580         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1581
1582         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1583                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1584
1585         /*
1586          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1587          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1588          * IPV6_MAXPLEN is also valid and means: "any MSS,
1589          * rely only on pmtu discovery"
1590          */
1591         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1592                 mtu = IPV6_MAXPLEN;
1593         return mtu;
1594 }
1595
1596 static unsigned int ip6_mtu(const struct dst_entry *dst)
1597 {
1598         const struct rt6_info *rt = (const struct rt6_info *)dst;
1599         unsigned int mtu = rt->rt6i_pmtu;
1600         struct inet6_dev *idev;
1601
1602         if (mtu)
1603                 goto out;
1604
1605         mtu = dst_metric_raw(dst, RTAX_MTU);
1606         if (mtu)
1607                 goto out;
1608
1609         mtu = IPV6_MIN_MTU;
1610
1611         rcu_read_lock();
1612         idev = __in6_dev_get(dst->dev);
1613         if (idev)
1614                 mtu = idev->cnf.mtu6;
1615         rcu_read_unlock();
1616
1617 out:
1618         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1619
1620         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1621 }
1622
1623 static struct dst_entry *icmp6_dst_gc_list;
1624 static DEFINE_SPINLOCK(icmp6_dst_lock);
1625
1626 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1627                                   struct flowi6 *fl6)
1628 {
1629         struct dst_entry *dst;
1630         struct rt6_info *rt;
1631         struct inet6_dev *idev = in6_dev_get(dev);
1632         struct net *net = dev_net(dev);
1633
1634         if (unlikely(!idev))
1635                 return ERR_PTR(-ENODEV);
1636
1637         rt = ip6_dst_alloc(net, dev, 0);
1638         if (unlikely(!rt)) {
1639                 in6_dev_put(idev);
1640                 dst = ERR_PTR(-ENOMEM);
1641                 goto out;
1642         }
1643
1644         rt->dst.flags |= DST_HOST;
1645         rt->dst.output  = ip6_output;
1646         atomic_set(&rt->dst.__refcnt, 1);
1647         rt->rt6i_gateway  = fl6->daddr;
1648         rt->rt6i_dst.addr = fl6->daddr;
1649         rt->rt6i_dst.plen = 128;
1650         rt->rt6i_idev     = idev;
1651         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1652
1653         spin_lock_bh(&icmp6_dst_lock);
1654         rt->dst.next = icmp6_dst_gc_list;
1655         icmp6_dst_gc_list = &rt->dst;
1656         spin_unlock_bh(&icmp6_dst_lock);
1657
1658         fib6_force_start_gc(net);
1659
1660         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1661
1662 out:
1663         return dst;
1664 }
1665
1666 int icmp6_dst_gc(void)
1667 {
1668         struct dst_entry *dst, **pprev;
1669         int more = 0;
1670
1671         spin_lock_bh(&icmp6_dst_lock);
1672         pprev = &icmp6_dst_gc_list;
1673
1674         while ((dst = *pprev) != NULL) {
1675                 if (!atomic_read(&dst->__refcnt)) {
1676                         *pprev = dst->next;
1677                         dst_free(dst);
1678                 } else {
1679                         pprev = &dst->next;
1680                         ++more;
1681                 }
1682         }
1683
1684         spin_unlock_bh(&icmp6_dst_lock);
1685
1686         return more;
1687 }
1688
1689 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1690                             void *arg)
1691 {
1692         struct dst_entry *dst, **pprev;
1693
1694         spin_lock_bh(&icmp6_dst_lock);
1695         pprev = &icmp6_dst_gc_list;
1696         while ((dst = *pprev) != NULL) {
1697                 struct rt6_info *rt = (struct rt6_info *) dst;
1698                 if (func(rt, arg)) {
1699                         *pprev = dst->next;
1700                         dst_free(dst);
1701                 } else {
1702                         pprev = &dst->next;
1703                 }
1704         }
1705         spin_unlock_bh(&icmp6_dst_lock);
1706 }
1707
1708 static int ip6_dst_gc(struct dst_ops *ops)
1709 {
1710         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1711         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1712         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1713         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1714         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1715         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1716         int entries;
1717
1718         entries = dst_entries_get_fast(ops);
1719         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1720             entries <= rt_max_size)
1721                 goto out;
1722
1723         net->ipv6.ip6_rt_gc_expire++;
1724         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1725         entries = dst_entries_get_slow(ops);
1726         if (entries < ops->gc_thresh)
1727                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1728 out:
1729         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1730         return entries > rt_max_size;
1731 }
1732
1733 static int ip6_convert_metrics(struct mx6_config *mxc,
1734                                const struct fib6_config *cfg)
1735 {
1736         bool ecn_ca = false;
1737         struct nlattr *nla;
1738         int remaining;
1739         u32 *mp;
1740
1741         if (!cfg->fc_mx)
1742                 return 0;
1743
1744         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1745         if (unlikely(!mp))
1746                 return -ENOMEM;
1747
1748         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1749                 int type = nla_type(nla);
1750                 u32 val;
1751
1752                 if (!type)
1753                         continue;
1754                 if (unlikely(type > RTAX_MAX))
1755                         goto err;
1756
1757                 if (type == RTAX_CC_ALGO) {
1758                         char tmp[TCP_CA_NAME_MAX];
1759
1760                         nla_strlcpy(tmp, nla, sizeof(tmp));
1761                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1762                         if (val == TCP_CA_UNSPEC)
1763                                 goto err;
1764                 } else {
1765                         val = nla_get_u32(nla);
1766                 }
1767                 if (type == RTAX_HOPLIMIT && val > 255)
1768                         val = 255;
1769                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1770                         goto err;
1771
1772                 mp[type - 1] = val;
1773                 __set_bit(type - 1, mxc->mx_valid);
1774         }
1775
1776         if (ecn_ca) {
1777                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1778                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1779         }
1780
1781         mxc->mx = mp;
1782         return 0;
1783  err:
1784         kfree(mp);
1785         return -EINVAL;
1786 }
1787
1788 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1789                                             struct fib6_config *cfg,
1790                                             const struct in6_addr *gw_addr)
1791 {
1792         struct flowi6 fl6 = {
1793                 .flowi6_oif = cfg->fc_ifindex,
1794                 .daddr = *gw_addr,
1795                 .saddr = cfg->fc_prefsrc,
1796         };
1797         struct fib6_table *table;
1798         struct rt6_info *rt;
1799         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1800
1801         table = fib6_get_table(net, cfg->fc_table);
1802         if (!table)
1803                 return NULL;
1804
1805         if (!ipv6_addr_any(&cfg->fc_prefsrc))
1806                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1807
1808         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1809
1810         /* if table lookup failed, fall back to full lookup */
1811         if (rt == net->ipv6.ip6_null_entry) {
1812                 ip6_rt_put(rt);
1813                 rt = NULL;
1814         }
1815
1816         return rt;
1817 }
1818
1819 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
1820 {
1821         struct net *net = cfg->fc_nlinfo.nl_net;
1822         struct rt6_info *rt = NULL;
1823         struct net_device *dev = NULL;
1824         struct inet6_dev *idev = NULL;
1825         struct fib6_table *table;
1826         int addr_type;
1827         int err = -EINVAL;
1828
1829         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1830                 goto out;
1831 #ifndef CONFIG_IPV6_SUBTREES
1832         if (cfg->fc_src_len)
1833                 goto out;
1834 #endif
1835         if (cfg->fc_ifindex) {
1836                 err = -ENODEV;
1837                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1838                 if (!dev)
1839                         goto out;
1840                 idev = in6_dev_get(dev);
1841                 if (!idev)
1842                         goto out;
1843         }
1844
1845         if (cfg->fc_metric == 0)
1846                 cfg->fc_metric = IP6_RT_PRIO_USER;
1847
1848         err = -ENOBUFS;
1849         if (cfg->fc_nlinfo.nlh &&
1850             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1851                 table = fib6_get_table(net, cfg->fc_table);
1852                 if (!table) {
1853                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1854                         table = fib6_new_table(net, cfg->fc_table);
1855                 }
1856         } else {
1857                 table = fib6_new_table(net, cfg->fc_table);
1858         }
1859
1860         if (!table)
1861                 goto out;
1862
1863         rt = ip6_dst_alloc(net, NULL,
1864                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1865
1866         if (!rt) {
1867                 err = -ENOMEM;
1868                 goto out;
1869         }
1870
1871         if (cfg->fc_flags & RTF_EXPIRES)
1872                 rt6_set_expires(rt, jiffies +
1873                                 clock_t_to_jiffies(cfg->fc_expires));
1874         else
1875                 rt6_clean_expires(rt);
1876
1877         if (cfg->fc_protocol == RTPROT_UNSPEC)
1878                 cfg->fc_protocol = RTPROT_BOOT;
1879         rt->rt6i_protocol = cfg->fc_protocol;
1880
1881         addr_type = ipv6_addr_type(&cfg->fc_dst);
1882
1883         if (addr_type & IPV6_ADDR_MULTICAST)
1884                 rt->dst.input = ip6_mc_input;
1885         else if (cfg->fc_flags & RTF_LOCAL)
1886                 rt->dst.input = ip6_input;
1887         else
1888                 rt->dst.input = ip6_forward;
1889
1890         rt->dst.output = ip6_output;
1891
1892         if (cfg->fc_encap) {
1893                 struct lwtunnel_state *lwtstate;
1894
1895                 err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1896                                            cfg->fc_encap, AF_INET6, cfg,
1897                                            &lwtstate);
1898                 if (err)
1899                         goto out;
1900                 rt->dst.lwtstate = lwtstate_get(lwtstate);
1901                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1902                         rt->dst.lwtstate->orig_output = rt->dst.output;
1903                         rt->dst.output = lwtunnel_output;
1904                 }
1905                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1906                         rt->dst.lwtstate->orig_input = rt->dst.input;
1907                         rt->dst.input = lwtunnel_input;
1908                 }
1909         }
1910
1911         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1912         rt->rt6i_dst.plen = cfg->fc_dst_len;
1913         if (rt->rt6i_dst.plen == 128)
1914                 rt->dst.flags |= DST_HOST;
1915
1916 #ifdef CONFIG_IPV6_SUBTREES
1917         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1918         rt->rt6i_src.plen = cfg->fc_src_len;
1919 #endif
1920
1921         rt->rt6i_metric = cfg->fc_metric;
1922
1923         /* We cannot add true routes via loopback here,
1924            they would result in kernel looping; promote them to reject routes
1925          */
1926         if ((cfg->fc_flags & RTF_REJECT) ||
1927             (dev && (dev->flags & IFF_LOOPBACK) &&
1928              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1929              !(cfg->fc_flags & RTF_LOCAL))) {
1930                 /* hold loopback dev/idev if we haven't done so. */
1931                 if (dev != net->loopback_dev) {
1932                         if (dev) {
1933                                 dev_put(dev);
1934                                 in6_dev_put(idev);
1935                         }
1936                         dev = net->loopback_dev;
1937                         dev_hold(dev);
1938                         idev = in6_dev_get(dev);
1939                         if (!idev) {
1940                                 err = -ENODEV;
1941                                 goto out;
1942                         }
1943                 }
1944                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1945                 switch (cfg->fc_type) {
1946                 case RTN_BLACKHOLE:
1947                         rt->dst.error = -EINVAL;
1948                         rt->dst.output = dst_discard_out;
1949                         rt->dst.input = dst_discard;
1950                         break;
1951                 case RTN_PROHIBIT:
1952                         rt->dst.error = -EACCES;
1953                         rt->dst.output = ip6_pkt_prohibit_out;
1954                         rt->dst.input = ip6_pkt_prohibit;
1955                         break;
1956                 case RTN_THROW:
1957                 case RTN_UNREACHABLE:
1958                 default:
1959                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1960                                         : (cfg->fc_type == RTN_UNREACHABLE)
1961                                         ? -EHOSTUNREACH : -ENETUNREACH;
1962                         rt->dst.output = ip6_pkt_discard_out;
1963                         rt->dst.input = ip6_pkt_discard;
1964                         break;
1965                 }
1966                 goto install_route;
1967         }
1968
1969         if (cfg->fc_flags & RTF_GATEWAY) {
1970                 const struct in6_addr *gw_addr;
1971                 int gwa_type;
1972
1973                 gw_addr = &cfg->fc_gateway;
1974                 gwa_type = ipv6_addr_type(gw_addr);
1975
1976                 /* if gw_addr is local we will fail to detect this in case
1977                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
1978                  * will return already-added prefix route via interface that
1979                  * prefix route was assigned to, which might be non-loopback.
1980                  */
1981                 err = -EINVAL;
1982                 if (ipv6_chk_addr_and_flags(net, gw_addr,
1983                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
1984                                             dev : NULL, 0, 0))
1985                         goto out;
1986
1987                 rt->rt6i_gateway = *gw_addr;
1988
1989                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1990                         struct rt6_info *grt = NULL;
1991
1992                         /* IPv6 strictly inhibits using not link-local
1993                            addresses as nexthop address.
1994                            Otherwise, router will not able to send redirects.
1995                            It is very good, but in some (rare!) circumstances
1996                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1997                            some exceptions. --ANK
1998                          */
1999                         if (!(gwa_type & IPV6_ADDR_UNICAST))
2000                                 goto out;
2001
2002                         if (cfg->fc_table) {
2003                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2004
2005                                 if (grt) {
2006                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2007                                             (dev && dev != grt->dst.dev)) {
2008                                                 ip6_rt_put(grt);
2009                                                 grt = NULL;
2010                                         }
2011                                 }
2012                         }
2013
2014                         if (!grt)
2015                                 grt = rt6_lookup(net, gw_addr, NULL,
2016                                                  cfg->fc_ifindex, 1);
2017
2018                         err = -EHOSTUNREACH;
2019                         if (!grt)
2020                                 goto out;
2021                         if (dev) {
2022                                 if (dev != grt->dst.dev) {
2023                                         ip6_rt_put(grt);
2024                                         goto out;
2025                                 }
2026                         } else {
2027                                 dev = grt->dst.dev;
2028                                 idev = grt->rt6i_idev;
2029                                 dev_hold(dev);
2030                                 in6_dev_hold(grt->rt6i_idev);
2031                         }
2032                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2033                                 err = 0;
2034                         ip6_rt_put(grt);
2035
2036                         if (err)
2037                                 goto out;
2038                 }
2039                 err = -EINVAL;
2040                 if (!dev || (dev->flags & IFF_LOOPBACK))
2041                         goto out;
2042         }
2043
2044         err = -ENODEV;
2045         if (!dev)
2046                 goto out;
2047
2048         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2049                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2050                         err = -EINVAL;
2051                         goto out;
2052                 }
2053                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2054                 rt->rt6i_prefsrc.plen = 128;
2055         } else
2056                 rt->rt6i_prefsrc.plen = 0;
2057
2058         rt->rt6i_flags = cfg->fc_flags;
2059
2060 install_route:
2061         rt->dst.dev = dev;
2062         rt->rt6i_idev = idev;
2063         rt->rt6i_table = table;
2064
2065         cfg->fc_nlinfo.nl_net = dev_net(dev);
2066
2067         return rt;
2068 out:
2069         if (dev)
2070                 dev_put(dev);
2071         if (idev)
2072                 in6_dev_put(idev);
2073         if (rt)
2074                 dst_free(&rt->dst);
2075
2076         return ERR_PTR(err);
2077 }
2078
2079 int ip6_route_add(struct fib6_config *cfg)
2080 {
2081         struct mx6_config mxc = { .mx = NULL, };
2082         struct rt6_info *rt;
2083         int err;
2084
2085         rt = ip6_route_info_create(cfg);
2086         if (IS_ERR(rt)) {
2087                 err = PTR_ERR(rt);
2088                 rt = NULL;
2089                 goto out;
2090         }
2091
2092         err = ip6_convert_metrics(&mxc, cfg);
2093         if (err)
2094                 goto out;
2095
2096         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
2097
2098         kfree(mxc.mx);
2099
2100         return err;
2101 out:
2102         if (rt)
2103                 dst_free(&rt->dst);
2104
2105         return err;
2106 }
2107
2108 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2109 {
2110         int err;
2111         struct fib6_table *table;
2112         struct net *net = dev_net(rt->dst.dev);
2113
2114         if (rt == net->ipv6.ip6_null_entry ||
2115             rt->dst.flags & DST_NOCACHE) {
2116                 err = -ENOENT;
2117                 goto out;
2118         }
2119
2120         table = rt->rt6i_table;
2121         write_lock_bh(&table->tb6_lock);
2122         err = fib6_del(rt, info);
2123         write_unlock_bh(&table->tb6_lock);
2124
2125 out:
2126         ip6_rt_put(rt);
2127         return err;
2128 }
2129
2130 int ip6_del_rt(struct rt6_info *rt)
2131 {
2132         struct nl_info info = {
2133                 .nl_net = dev_net(rt->dst.dev),
2134         };
2135         return __ip6_del_rt(rt, &info);
2136 }
2137
2138 static int ip6_route_del(struct fib6_config *cfg)
2139 {
2140         struct fib6_table *table;
2141         struct fib6_node *fn;
2142         struct rt6_info *rt;
2143         int err = -ESRCH;
2144
2145         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2146         if (!table)
2147                 return err;
2148
2149         read_lock_bh(&table->tb6_lock);
2150
2151         fn = fib6_locate(&table->tb6_root,
2152                          &cfg->fc_dst, cfg->fc_dst_len,
2153                          &cfg->fc_src, cfg->fc_src_len);
2154
2155         if (fn) {
2156                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2157                         if ((rt->rt6i_flags & RTF_CACHE) &&
2158                             !(cfg->fc_flags & RTF_CACHE))
2159                                 continue;
2160                         if (cfg->fc_ifindex &&
2161                             (!rt->dst.dev ||
2162                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2163                                 continue;
2164                         if (cfg->fc_flags & RTF_GATEWAY &&
2165                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2166                                 continue;
2167                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2168                                 continue;
2169                         dst_hold(&rt->dst);
2170                         read_unlock_bh(&table->tb6_lock);
2171
2172                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2173                 }
2174         }
2175         read_unlock_bh(&table->tb6_lock);
2176
2177         return err;
2178 }
2179
2180 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2181 {
2182         struct netevent_redirect netevent;
2183         struct rt6_info *rt, *nrt = NULL;
2184         struct ndisc_options ndopts;
2185         struct inet6_dev *in6_dev;
2186         struct neighbour *neigh;
2187         struct rd_msg *msg;
2188         int optlen, on_link;
2189         u8 *lladdr;
2190
2191         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2192         optlen -= sizeof(*msg);
2193
2194         if (optlen < 0) {
2195                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2196                 return;
2197         }
2198
2199         msg = (struct rd_msg *)icmp6_hdr(skb);
2200
2201         if (ipv6_addr_is_multicast(&msg->dest)) {
2202                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2203                 return;
2204         }
2205
2206         on_link = 0;
2207         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2208                 on_link = 1;
2209         } else if (ipv6_addr_type(&msg->target) !=
2210                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2211                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2212                 return;
2213         }
2214
2215         in6_dev = __in6_dev_get(skb->dev);
2216         if (!in6_dev)
2217                 return;
2218         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2219                 return;
2220
2221         /* RFC2461 8.1:
2222          *      The IP source address of the Redirect MUST be the same as the current
2223          *      first-hop router for the specified ICMP Destination Address.
2224          */
2225
2226         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2227                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2228                 return;
2229         }
2230
2231         lladdr = NULL;
2232         if (ndopts.nd_opts_tgt_lladdr) {
2233                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2234                                              skb->dev);
2235                 if (!lladdr) {
2236                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2237                         return;
2238                 }
2239         }
2240
2241         rt = (struct rt6_info *) dst;
2242         if (rt->rt6i_flags & RTF_REJECT) {
2243                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2244                 return;
2245         }
2246
2247         /* Redirect received -> path was valid.
2248          * Look, redirects are sent only in response to data packets,
2249          * so that this nexthop apparently is reachable. --ANK
2250          */
2251         dst_confirm(&rt->dst);
2252
2253         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2254         if (!neigh)
2255                 return;
2256
2257         /*
2258          *      We have finally decided to accept it.
2259          */
2260
2261         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2262                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2263                      NEIGH_UPDATE_F_OVERRIDE|
2264                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2265                                      NEIGH_UPDATE_F_ISROUTER)),
2266                      NDISC_REDIRECT, &ndopts);
2267
2268         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2269         if (!nrt)
2270                 goto out;
2271
2272         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2273         if (on_link)
2274                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2275
2276         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2277
2278         if (ip6_ins_rt(nrt))
2279                 goto out;
2280
2281         netevent.old = &rt->dst;
2282         netevent.new = &nrt->dst;
2283         netevent.daddr = &msg->dest;
2284         netevent.neigh = neigh;
2285         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2286
2287         if (rt->rt6i_flags & RTF_CACHE) {
2288                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2289                 ip6_del_rt(rt);
2290         }
2291
2292 out:
2293         neigh_release(neigh);
2294 }
2295
2296 /*
2297  *      Misc support functions
2298  */
2299
2300 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2301 {
2302         BUG_ON(from->dst.from);
2303
2304         rt->rt6i_flags &= ~RTF_EXPIRES;
2305         dst_hold(&from->dst);
2306         rt->dst.from = &from->dst;
2307         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2308 }
2309
2310 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2311 {
2312         rt->dst.input = ort->dst.input;
2313         rt->dst.output = ort->dst.output;
2314         rt->rt6i_dst = ort->rt6i_dst;
2315         rt->dst.error = ort->dst.error;
2316         rt->rt6i_idev = ort->rt6i_idev;
2317         if (rt->rt6i_idev)
2318                 in6_dev_hold(rt->rt6i_idev);
2319         rt->dst.lastuse = jiffies;
2320         rt->rt6i_gateway = ort->rt6i_gateway;
2321         rt->rt6i_flags = ort->rt6i_flags;
2322         rt6_set_from(rt, ort);
2323         rt->rt6i_metric = ort->rt6i_metric;
2324 #ifdef CONFIG_IPV6_SUBTREES
2325         rt->rt6i_src = ort->rt6i_src;
2326 #endif
2327         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2328         rt->rt6i_table = ort->rt6i_table;
2329         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2330 }
2331
2332 #ifdef CONFIG_IPV6_ROUTE_INFO
2333 static struct rt6_info *rt6_get_route_info(struct net *net,
2334                                            const struct in6_addr *prefix, int prefixlen,
2335                                            const struct in6_addr *gwaddr,
2336                                            struct net_device *dev)
2337 {
2338         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2339         int ifindex = dev->ifindex;
2340         struct fib6_node *fn;
2341         struct rt6_info *rt = NULL;
2342         struct fib6_table *table;
2343
2344         table = fib6_get_table(net, tb_id);
2345         if (!table)
2346                 return NULL;
2347
2348         read_lock_bh(&table->tb6_lock);
2349         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2350         if (!fn)
2351                 goto out;
2352
2353         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2354                 if (rt->dst.dev->ifindex != ifindex)
2355                         continue;
2356                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2357                         continue;
2358                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2359                         continue;
2360                 dst_hold(&rt->dst);
2361                 break;
2362         }
2363 out:
2364         read_unlock_bh(&table->tb6_lock);
2365         return rt;
2366 }
2367
2368 static struct rt6_info *rt6_add_route_info(struct net *net,
2369                                            const struct in6_addr *prefix, int prefixlen,
2370                                            const struct in6_addr *gwaddr,
2371                                            struct net_device *dev,
2372                                            unsigned int pref)
2373 {
2374         struct fib6_config cfg = {
2375                 .fc_metric      = IP6_RT_PRIO_USER,
2376                 .fc_ifindex     = dev->ifindex,
2377                 .fc_dst_len     = prefixlen,
2378                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2379                                   RTF_UP | RTF_PREF(pref),
2380                 .fc_nlinfo.portid = 0,
2381                 .fc_nlinfo.nlh = NULL,
2382                 .fc_nlinfo.nl_net = net,
2383         };
2384
2385         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
2386         cfg.fc_dst = *prefix;
2387         cfg.fc_gateway = *gwaddr;
2388
2389         /* We should treat it as a default route if prefix length is 0. */
2390         if (!prefixlen)
2391                 cfg.fc_flags |= RTF_DEFAULT;
2392
2393         ip6_route_add(&cfg);
2394
2395         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2396 }
2397 #endif
2398
2399 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2400 {
2401         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
2402         struct rt6_info *rt;
2403         struct fib6_table *table;
2404
2405         table = fib6_get_table(dev_net(dev), tb_id);
2406         if (!table)
2407                 return NULL;
2408
2409         read_lock_bh(&table->tb6_lock);
2410         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2411                 if (dev == rt->dst.dev &&
2412                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2413                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2414                         break;
2415         }
2416         if (rt)
2417                 dst_hold(&rt->dst);
2418         read_unlock_bh(&table->tb6_lock);
2419         return rt;
2420 }
2421
2422 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2423                                      struct net_device *dev,
2424                                      unsigned int pref)
2425 {
2426         struct fib6_config cfg = {
2427                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2428                 .fc_metric      = IP6_RT_PRIO_USER,
2429                 .fc_ifindex     = dev->ifindex,
2430                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2431                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2432                 .fc_nlinfo.portid = 0,
2433                 .fc_nlinfo.nlh = NULL,
2434                 .fc_nlinfo.nl_net = dev_net(dev),
2435         };
2436
2437         cfg.fc_gateway = *gwaddr;
2438
2439         if (!ip6_route_add(&cfg)) {
2440                 struct fib6_table *table;
2441
2442                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
2443                 if (table)
2444                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2445         }
2446
2447         return rt6_get_dflt_router(gwaddr, dev);
2448 }
2449
2450 static void __rt6_purge_dflt_routers(struct fib6_table *table)
2451 {
2452         struct rt6_info *rt;
2453
2454 restart:
2455         read_lock_bh(&table->tb6_lock);
2456         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2457                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2458                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2459                         dst_hold(&rt->dst);
2460                         read_unlock_bh(&table->tb6_lock);
2461                         ip6_del_rt(rt);
2462                         goto restart;
2463                 }
2464         }
2465         read_unlock_bh(&table->tb6_lock);
2466
2467         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
2468 }
2469
2470 void rt6_purge_dflt_routers(struct net *net)
2471 {
2472         struct fib6_table *table;
2473         struct hlist_head *head;
2474         unsigned int h;
2475
2476         rcu_read_lock();
2477
2478         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2479                 head = &net->ipv6.fib_table_hash[h];
2480                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2481                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2482                                 __rt6_purge_dflt_routers(table);
2483                 }
2484         }
2485
2486         rcu_read_unlock();
2487 }
2488
2489 static void rtmsg_to_fib6_config(struct net *net,
2490                                  struct in6_rtmsg *rtmsg,
2491                                  struct fib6_config *cfg)
2492 {
2493         memset(cfg, 0, sizeof(*cfg));
2494
2495         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2496                          : RT6_TABLE_MAIN;
2497         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2498         cfg->fc_metric = rtmsg->rtmsg_metric;
2499         cfg->fc_expires = rtmsg->rtmsg_info;
2500         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2501         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2502         cfg->fc_flags = rtmsg->rtmsg_flags;
2503
2504         cfg->fc_nlinfo.nl_net = net;
2505
2506         cfg->fc_dst = rtmsg->rtmsg_dst;
2507         cfg->fc_src = rtmsg->rtmsg_src;
2508         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2509 }
2510
2511 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2512 {
2513         struct fib6_config cfg;
2514         struct in6_rtmsg rtmsg;
2515         int err;
2516
2517         switch (cmd) {
2518         case SIOCADDRT:         /* Add a route */
2519         case SIOCDELRT:         /* Delete a route */
2520                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2521                         return -EPERM;
2522                 err = copy_from_user(&rtmsg, arg,
2523                                      sizeof(struct in6_rtmsg));
2524                 if (err)
2525                         return -EFAULT;
2526
2527                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2528
2529                 rtnl_lock();
2530                 switch (cmd) {
2531                 case SIOCADDRT:
2532                         err = ip6_route_add(&cfg);
2533                         break;
2534                 case SIOCDELRT:
2535                         err = ip6_route_del(&cfg);
2536                         break;
2537                 default:
2538                         err = -EINVAL;
2539                 }
2540                 rtnl_unlock();
2541
2542                 return err;
2543         }
2544
2545         return -EINVAL;
2546 }
2547
2548 /*
2549  *      Drop the packet on the floor
2550  */
2551
2552 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2553 {
2554         int type;
2555         struct dst_entry *dst = skb_dst(skb);
2556         switch (ipstats_mib_noroutes) {
2557         case IPSTATS_MIB_INNOROUTES:
2558                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2559                 if (type == IPV6_ADDR_ANY) {
2560                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2561                                       IPSTATS_MIB_INADDRERRORS);
2562                         break;
2563                 }
2564                 /* FALLTHROUGH */
2565         case IPSTATS_MIB_OUTNOROUTES:
2566                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2567                               ipstats_mib_noroutes);
2568                 break;
2569         }
2570         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2571         kfree_skb(skb);
2572         return 0;
2573 }
2574
2575 static int ip6_pkt_discard(struct sk_buff *skb)
2576 {
2577         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2578 }
2579
2580 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2581 {
2582         skb->dev = skb_dst(skb)->dev;
2583         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2584 }
2585
2586 static int ip6_pkt_prohibit(struct sk_buff *skb)
2587 {
2588         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2589 }
2590
2591 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2592 {
2593         skb->dev = skb_dst(skb)->dev;
2594         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2595 }
2596
2597 /*
2598  *      Allocate a dst for local (unicast / anycast) address.
2599  */
2600
2601 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2602                                     const struct in6_addr *addr,
2603                                     bool anycast)
2604 {
2605         u32 tb_id;
2606         struct net *net = dev_net(idev->dev);
2607         struct net_device *dev = net->loopback_dev;
2608         struct rt6_info *rt;
2609
2610         /* use L3 Master device as loopback for host routes if device
2611          * is enslaved and address is not link local or multicast
2612          */
2613         if (!rt6_need_strict(addr))
2614                 dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
2615
2616         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2617         if (!rt)
2618                 return ERR_PTR(-ENOMEM);
2619
2620         in6_dev_hold(idev);
2621
2622         rt->dst.flags |= DST_HOST;
2623         rt->dst.input = ip6_input;
2624         rt->dst.output = ip6_output;
2625         rt->rt6i_idev = idev;
2626
2627         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2628         if (anycast)
2629                 rt->rt6i_flags |= RTF_ANYCAST;
2630         else
2631                 rt->rt6i_flags |= RTF_LOCAL;
2632
2633         rt->rt6i_gateway  = *addr;
2634         rt->rt6i_dst.addr = *addr;
2635         rt->rt6i_dst.plen = 128;
2636         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2637         rt->rt6i_table = fib6_get_table(net, tb_id);
2638         rt->dst.flags |= DST_NOCACHE;
2639
2640         atomic_set(&rt->dst.__refcnt, 1);
2641
2642         return rt;
2643 }
2644
2645 /* remove deleted ip from prefsrc entries */
2646 struct arg_dev_net_ip {
2647         struct net_device *dev;
2648         struct net *net;
2649         struct in6_addr *addr;
2650 };
2651
2652 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2653 {
2654         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2655         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2656         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2657
2658         if (((void *)rt->dst.dev == dev || !dev) &&
2659             rt != net->ipv6.ip6_null_entry &&
2660             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2661                 /* remove prefsrc entry */
2662                 rt->rt6i_prefsrc.plen = 0;
2663         }
2664         return 0;
2665 }
2666
2667 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2668 {
2669         struct net *net = dev_net(ifp->idev->dev);
2670         struct arg_dev_net_ip adni = {
2671                 .dev = ifp->idev->dev,
2672                 .net = net,
2673                 .addr = &ifp->addr,
2674         };
2675         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2676 }
2677
2678 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2679 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2680
2681 /* Remove routers and update dst entries when gateway turn into host. */
2682 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2683 {
2684         struct in6_addr *gateway = (struct in6_addr *)arg;
2685
2686         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2687              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2688              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2689                 return -1;
2690         }
2691         return 0;
2692 }
2693
2694 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2695 {
2696         fib6_clean_all(net, fib6_clean_tohost, gateway);
2697 }
2698
2699 struct arg_dev_net {
2700         struct net_device *dev;
2701         struct net *net;
2702 };
2703
2704 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2705 {
2706         const struct arg_dev_net *adn = arg;
2707         const struct net_device *dev = adn->dev;
2708
2709         if ((rt->dst.dev == dev || !dev) &&
2710             rt != adn->net->ipv6.ip6_null_entry)
2711                 return -1;
2712
2713         return 0;
2714 }
2715
2716 void rt6_ifdown(struct net *net, struct net_device *dev)
2717 {
2718         struct arg_dev_net adn = {
2719                 .dev = dev,
2720                 .net = net,
2721         };
2722
2723         fib6_clean_all(net, fib6_ifdown, &adn);
2724         icmp6_clean_all(fib6_ifdown, &adn);
2725         if (dev)
2726                 rt6_uncached_list_flush_dev(net, dev);
2727 }
2728
2729 struct rt6_mtu_change_arg {
2730         struct net_device *dev;
2731         unsigned int mtu;
2732 };
2733
2734 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2735 {
2736         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2737         struct inet6_dev *idev;
2738
2739         /* In IPv6 pmtu discovery is not optional,
2740            so that RTAX_MTU lock cannot disable it.
2741            We still use this lock to block changes
2742            caused by addrconf/ndisc.
2743         */
2744
2745         idev = __in6_dev_get(arg->dev);
2746         if (!idev)
2747                 return 0;
2748
2749         /* For administrative MTU increase, there is no way to discover
2750            IPv6 PMTU increase, so PMTU increase should be updated here.
2751            Since RFC 1981 doesn't include administrative MTU increase
2752            update PMTU increase is a MUST. (i.e. jumbo frame)
2753          */
2754         /*
2755            If new MTU is less than route PMTU, this new MTU will be the
2756            lowest MTU in the path, update the route PMTU to reflect PMTU
2757            decreases; if new MTU is greater than route PMTU, and the
2758            old MTU is the lowest MTU in the path, update the route PMTU
2759            to reflect the increase. In this case if the other nodes' MTU
2760            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2761            PMTU discouvery.
2762          */
2763         if (rt->dst.dev == arg->dev &&
2764             dst_metric_raw(&rt->dst, RTAX_MTU) &&
2765             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2766                 if (rt->rt6i_flags & RTF_CACHE) {
2767                         /* For RTF_CACHE with rt6i_pmtu == 0
2768                          * (i.e. a redirected route),
2769                          * the metrics of its rt->dst.from has already
2770                          * been updated.
2771                          */
2772                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2773                                 rt->rt6i_pmtu = arg->mtu;
2774                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2775                            (dst_mtu(&rt->dst) < arg->mtu &&
2776                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2777                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2778                 }
2779         }
2780         return 0;
2781 }
2782
2783 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2784 {
2785         struct rt6_mtu_change_arg arg = {
2786                 .dev = dev,
2787                 .mtu = mtu,
2788         };
2789
2790         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2791 }
2792
2793 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2794         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2795         [RTA_OIF]               = { .type = NLA_U32 },
2796         [RTA_IIF]               = { .type = NLA_U32 },
2797         [RTA_PRIORITY]          = { .type = NLA_U32 },
2798         [RTA_METRICS]           = { .type = NLA_NESTED },
2799         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2800         [RTA_PREF]              = { .type = NLA_U8 },
2801         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
2802         [RTA_ENCAP]             = { .type = NLA_NESTED },
2803         [RTA_EXPIRES]           = { .type = NLA_U32 },
2804 };
2805
2806 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2807                               struct fib6_config *cfg)
2808 {
2809         struct rtmsg *rtm;
2810         struct nlattr *tb[RTA_MAX+1];
2811         unsigned int pref;
2812         int err;
2813
2814         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2815         if (err < 0)
2816                 goto errout;
2817
2818         err = -EINVAL;
2819         rtm = nlmsg_data(nlh);
2820         memset(cfg, 0, sizeof(*cfg));
2821
2822         cfg->fc_table = rtm->rtm_table;
2823         cfg->fc_dst_len = rtm->rtm_dst_len;
2824         cfg->fc_src_len = rtm->rtm_src_len;
2825         cfg->fc_flags = RTF_UP;
2826         cfg->fc_protocol = rtm->rtm_protocol;
2827         cfg->fc_type = rtm->rtm_type;
2828
2829         if (rtm->rtm_type == RTN_UNREACHABLE ||
2830             rtm->rtm_type == RTN_BLACKHOLE ||
2831             rtm->rtm_type == RTN_PROHIBIT ||
2832             rtm->rtm_type == RTN_THROW)
2833                 cfg->fc_flags |= RTF_REJECT;
2834
2835         if (rtm->rtm_type == RTN_LOCAL)
2836                 cfg->fc_flags |= RTF_LOCAL;
2837
2838         if (rtm->rtm_flags & RTM_F_CLONED)
2839                 cfg->fc_flags |= RTF_CACHE;
2840
2841         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2842         cfg->fc_nlinfo.nlh = nlh;
2843         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2844
2845         if (tb[RTA_GATEWAY]) {
2846                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2847                 cfg->fc_flags |= RTF_GATEWAY;
2848         }
2849
2850         if (tb[RTA_DST]) {
2851                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2852
2853                 if (nla_len(tb[RTA_DST]) < plen)
2854                         goto errout;
2855
2856                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2857         }
2858
2859         if (tb[RTA_SRC]) {
2860                 int plen = (rtm->rtm_src_len + 7) >> 3;
2861
2862                 if (nla_len(tb[RTA_SRC]) < plen)
2863                         goto errout;
2864
2865                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2866         }
2867
2868         if (tb[RTA_PREFSRC])
2869                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2870
2871         if (tb[RTA_OIF])
2872                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2873
2874         if (tb[RTA_PRIORITY])
2875                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2876
2877         if (tb[RTA_METRICS]) {
2878                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2879                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2880         }
2881
2882         if (tb[RTA_TABLE])
2883                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2884
2885         if (tb[RTA_MULTIPATH]) {
2886                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2887                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2888         }
2889
2890         if (tb[RTA_PREF]) {
2891                 pref = nla_get_u8(tb[RTA_PREF]);
2892                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2893                     pref != ICMPV6_ROUTER_PREF_HIGH)
2894                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2895                 cfg->fc_flags |= RTF_PREF(pref);
2896         }
2897
2898         if (tb[RTA_ENCAP])
2899                 cfg->fc_encap = tb[RTA_ENCAP];
2900
2901         if (tb[RTA_ENCAP_TYPE])
2902                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
2903
2904         if (tb[RTA_EXPIRES]) {
2905                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
2906
2907                 if (addrconf_finite_timeout(timeout)) {
2908                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
2909                         cfg->fc_flags |= RTF_EXPIRES;
2910                 }
2911         }
2912
2913         err = 0;
2914 errout:
2915         return err;
2916 }
2917
2918 struct rt6_nh {
2919         struct rt6_info *rt6_info;
2920         struct fib6_config r_cfg;
2921         struct mx6_config mxc;
2922         struct list_head next;
2923 };
2924
2925 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
2926 {
2927         struct rt6_nh *nh;
2928
2929         list_for_each_entry(nh, rt6_nh_list, next) {
2930                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
2931                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
2932                         nh->r_cfg.fc_ifindex);
2933         }
2934 }
2935
2936 static int ip6_route_info_append(struct list_head *rt6_nh_list,
2937                                  struct rt6_info *rt, struct fib6_config *r_cfg)
2938 {
2939         struct rt6_nh *nh;
2940         struct rt6_info *rtnh;
2941         int err = -EEXIST;
2942
2943         list_for_each_entry(nh, rt6_nh_list, next) {
2944                 /* check if rt6_info already exists */
2945                 rtnh = nh->rt6_info;
2946
2947                 if (rtnh->dst.dev == rt->dst.dev &&
2948                     rtnh->rt6i_idev == rt->rt6i_idev &&
2949                     ipv6_addr_equal(&rtnh->rt6i_gateway,
2950                                     &rt->rt6i_gateway))
2951                         return err;
2952         }
2953
2954         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
2955         if (!nh)
2956                 return -ENOMEM;
2957         nh->rt6_info = rt;
2958         err = ip6_convert_metrics(&nh->mxc, r_cfg);
2959         if (err) {
2960                 kfree(nh);
2961                 return err;
2962         }
2963         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
2964         list_add_tail(&nh->next, rt6_nh_list);
2965
2966         return 0;
2967 }
2968
2969 static int ip6_route_multipath_add(struct fib6_config *cfg)
2970 {
2971         struct fib6_config r_cfg;
2972         struct rtnexthop *rtnh;
2973         struct rt6_info *rt;
2974         struct rt6_nh *err_nh;
2975         struct rt6_nh *nh, *nh_safe;
2976         int remaining;
2977         int attrlen;
2978         int err = 1;
2979         int nhn = 0;
2980         int replace = (cfg->fc_nlinfo.nlh &&
2981                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
2982         LIST_HEAD(rt6_nh_list);
2983
2984         remaining = cfg->fc_mp_len;
2985         rtnh = (struct rtnexthop *)cfg->fc_mp;
2986
2987         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
2988          * rt6_info structs per nexthop
2989          */
2990         while (rtnh_ok(rtnh, remaining)) {
2991                 memcpy(&r_cfg, cfg, sizeof(*cfg));
2992                 if (rtnh->rtnh_ifindex)
2993                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2994
2995                 attrlen = rtnh_attrlen(rtnh);
2996                 if (attrlen > 0) {
2997                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2998
2999                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3000                         if (nla) {
3001                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3002                                 r_cfg.fc_flags |= RTF_GATEWAY;
3003                         }
3004                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3005                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3006                         if (nla)
3007                                 r_cfg.fc_encap_type = nla_get_u16(nla);
3008                 }
3009
3010                 rt = ip6_route_info_create(&r_cfg);
3011                 if (IS_ERR(rt)) {
3012                         err = PTR_ERR(rt);
3013                         rt = NULL;
3014                         goto cleanup;
3015                 }
3016
3017                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3018                 if (err) {
3019                         dst_free(&rt->dst);
3020                         goto cleanup;
3021                 }
3022
3023                 rtnh = rtnh_next(rtnh, &remaining);
3024         }
3025
3026         err_nh = NULL;
3027         list_for_each_entry(nh, &rt6_nh_list, next) {
3028                 err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc);
3029                 /* nh->rt6_info is used or freed at this point, reset to NULL*/
3030                 nh->rt6_info = NULL;
3031                 if (err) {
3032                         if (replace && nhn)
3033                                 ip6_print_replace_route_err(&rt6_nh_list);
3034                         err_nh = nh;
3035                         goto add_errout;
3036                 }
3037
3038                 /* Because each route is added like a single route we remove
3039                  * these flags after the first nexthop: if there is a collision,
3040                  * we have already failed to add the first nexthop:
3041                  * fib6_add_rt2node() has rejected it; when replacing, old
3042                  * nexthops have been replaced by first new, the rest should
3043                  * be added to it.
3044                  */
3045                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3046                                                      NLM_F_REPLACE);
3047                 nhn++;
3048         }
3049
3050         goto cleanup;
3051
3052 add_errout:
3053         /* Delete routes that were already added */
3054         list_for_each_entry(nh, &rt6_nh_list, next) {
3055                 if (err_nh == nh)
3056                         break;
3057                 ip6_route_del(&nh->r_cfg);
3058         }
3059
3060 cleanup:
3061         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3062                 if (nh->rt6_info)
3063                         dst_free(&nh->rt6_info->dst);
3064                 kfree(nh->mxc.mx);
3065                 list_del(&nh->next);
3066                 kfree(nh);
3067         }
3068
3069         return err;
3070 }
3071
3072 static int ip6_route_multipath_del(struct fib6_config *cfg)
3073 {
3074         struct fib6_config r_cfg;
3075         struct rtnexthop *rtnh;
3076         int remaining;
3077         int attrlen;
3078         int err = 1, last_err = 0;
3079
3080         remaining = cfg->fc_mp_len;
3081         rtnh = (struct rtnexthop *)cfg->fc_mp;
3082
3083         /* Parse a Multipath Entry */
3084         while (rtnh_ok(rtnh, remaining)) {
3085                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3086                 if (rtnh->rtnh_ifindex)
3087                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3088
3089                 attrlen = rtnh_attrlen(rtnh);
3090                 if (attrlen > 0) {
3091                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3092
3093                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3094                         if (nla) {
3095                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3096                                 r_cfg.fc_flags |= RTF_GATEWAY;
3097                         }
3098                 }
3099                 err = ip6_route_del(&r_cfg);
3100                 if (err)
3101                         last_err = err;
3102
3103                 rtnh = rtnh_next(rtnh, &remaining);
3104         }
3105
3106         return last_err;
3107 }
3108
3109 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3110 {
3111         struct fib6_config cfg;
3112         int err;
3113
3114         err = rtm_to_fib6_config(skb, nlh, &cfg);
3115         if (err < 0)
3116                 return err;
3117
3118         if (cfg.fc_mp)
3119                 return ip6_route_multipath_del(&cfg);
3120         else
3121                 return ip6_route_del(&cfg);
3122 }
3123
3124 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3125 {
3126         struct fib6_config cfg;
3127         int err;
3128
3129         err = rtm_to_fib6_config(skb, nlh, &cfg);
3130         if (err < 0)
3131                 return err;
3132
3133         if (cfg.fc_mp)
3134                 return ip6_route_multipath_add(&cfg);
3135         else
3136                 return ip6_route_add(&cfg);
3137 }
3138
3139 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
3140 {
3141         return NLMSG_ALIGN(sizeof(struct rtmsg))
3142                + nla_total_size(16) /* RTA_SRC */
3143                + nla_total_size(16) /* RTA_DST */
3144                + nla_total_size(16) /* RTA_GATEWAY */
3145                + nla_total_size(16) /* RTA_PREFSRC */
3146                + nla_total_size(4) /* RTA_TABLE */
3147                + nla_total_size(4) /* RTA_IIF */
3148                + nla_total_size(4) /* RTA_OIF */
3149                + nla_total_size(4) /* RTA_PRIORITY */
3150                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3151                + nla_total_size(sizeof(struct rta_cacheinfo))
3152                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3153                + nla_total_size(1) /* RTA_PREF */
3154                + lwtunnel_get_encap_size(rt->dst.lwtstate);
3155 }
3156
3157 static int rt6_fill_node(struct net *net,
3158                          struct sk_buff *skb, struct rt6_info *rt,
3159                          struct in6_addr *dst, struct in6_addr *src,
3160                          int iif, int type, u32 portid, u32 seq,
3161                          int prefix, int nowait, unsigned int flags)
3162 {
3163         u32 metrics[RTAX_MAX];
3164         struct rtmsg *rtm;
3165         struct nlmsghdr *nlh;
3166         long expires;
3167         u32 table;
3168
3169         if (prefix) {   /* user wants prefix routes only */
3170                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
3171                         /* success since this is not a prefix route */
3172                         return 1;
3173                 }
3174         }
3175
3176         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3177         if (!nlh)
3178                 return -EMSGSIZE;
3179
3180         rtm = nlmsg_data(nlh);
3181         rtm->rtm_family = AF_INET6;
3182         rtm->rtm_dst_len = rt->rt6i_dst.plen;
3183         rtm->rtm_src_len = rt->rt6i_src.plen;
3184         rtm->rtm_tos = 0;
3185         if (rt->rt6i_table)
3186                 table = rt->rt6i_table->tb6_id;
3187         else
3188                 table = RT6_TABLE_UNSPEC;
3189         rtm->rtm_table = table;
3190         if (nla_put_u32(skb, RTA_TABLE, table))
3191                 goto nla_put_failure;
3192         if (rt->rt6i_flags & RTF_REJECT) {
3193                 switch (rt->dst.error) {
3194                 case -EINVAL:
3195                         rtm->rtm_type = RTN_BLACKHOLE;
3196                         break;
3197                 case -EACCES:
3198                         rtm->rtm_type = RTN_PROHIBIT;
3199                         break;
3200                 case -EAGAIN:
3201                         rtm->rtm_type = RTN_THROW;
3202                         break;
3203                 default:
3204                         rtm->rtm_type = RTN_UNREACHABLE;
3205                         break;
3206                 }
3207         }
3208         else if (rt->rt6i_flags & RTF_LOCAL)
3209                 rtm->rtm_type = RTN_LOCAL;
3210         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3211                 rtm->rtm_type = RTN_LOCAL;
3212         else
3213                 rtm->rtm_type = RTN_UNICAST;
3214         rtm->rtm_flags = 0;
3215         if (!netif_carrier_ok(rt->dst.dev)) {
3216                 rtm->rtm_flags |= RTNH_F_LINKDOWN;
3217                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3218                         rtm->rtm_flags |= RTNH_F_DEAD;
3219         }
3220         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3221         rtm->rtm_protocol = rt->rt6i_protocol;
3222         if (rt->rt6i_flags & RTF_DYNAMIC)
3223                 rtm->rtm_protocol = RTPROT_REDIRECT;
3224         else if (rt->rt6i_flags & RTF_ADDRCONF) {
3225                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
3226                         rtm->rtm_protocol = RTPROT_RA;
3227                 else
3228                         rtm->rtm_protocol = RTPROT_KERNEL;
3229         }
3230
3231         if (rt->rt6i_flags & RTF_CACHE)
3232                 rtm->rtm_flags |= RTM_F_CLONED;
3233
3234         if (dst) {
3235                 if (nla_put_in6_addr(skb, RTA_DST, dst))
3236                         goto nla_put_failure;
3237                 rtm->rtm_dst_len = 128;
3238         } else if (rtm->rtm_dst_len)
3239                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3240                         goto nla_put_failure;
3241 #ifdef CONFIG_IPV6_SUBTREES
3242         if (src) {
3243                 if (nla_put_in6_addr(skb, RTA_SRC, src))
3244                         goto nla_put_failure;
3245                 rtm->rtm_src_len = 128;
3246         } else if (rtm->rtm_src_len &&
3247                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3248                 goto nla_put_failure;
3249 #endif
3250         if (iif) {
3251 #ifdef CONFIG_IPV6_MROUTE
3252                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3253                         int err = ip6mr_get_route(net, skb, rtm, nowait,
3254                                                   portid);
3255
3256                         if (err <= 0) {
3257                                 if (!nowait) {
3258                                         if (err == 0)
3259                                                 return 0;
3260                                         goto nla_put_failure;
3261                                 } else {
3262                                         if (err == -EMSGSIZE)
3263                                                 goto nla_put_failure;
3264                                 }
3265                         }
3266                 } else
3267 #endif
3268                         if (nla_put_u32(skb, RTA_IIF, iif))
3269                                 goto nla_put_failure;
3270         } else if (dst) {
3271                 struct in6_addr saddr_buf;
3272                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3273                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3274                         goto nla_put_failure;
3275         }
3276
3277         if (rt->rt6i_prefsrc.plen) {
3278                 struct in6_addr saddr_buf;
3279                 saddr_buf = rt->rt6i_prefsrc.addr;
3280                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3281                         goto nla_put_failure;
3282         }
3283
3284         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3285         if (rt->rt6i_pmtu)
3286                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3287         if (rtnetlink_put_metrics(skb, metrics) < 0)
3288                 goto nla_put_failure;
3289
3290         if (rt->rt6i_flags & RTF_GATEWAY) {
3291                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3292                         goto nla_put_failure;
3293         }
3294
3295         if (rt->dst.dev &&
3296             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3297                 goto nla_put_failure;
3298         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3299                 goto nla_put_failure;
3300
3301         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3302
3303         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3304                 goto nla_put_failure;
3305
3306         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3307                 goto nla_put_failure;
3308
3309         lwtunnel_fill_encap(skb, rt->dst.lwtstate);
3310
3311         nlmsg_end(skb, nlh);
3312         return 0;
3313
3314 nla_put_failure:
3315         nlmsg_cancel(skb, nlh);
3316         return -EMSGSIZE;
3317 }
3318
3319 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3320 {
3321         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3322         int prefix;
3323
3324         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3325                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3326                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
3327         } else
3328                 prefix = 0;
3329
3330         return rt6_fill_node(arg->net,
3331                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3332                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3333                      prefix, 0, NLM_F_MULTI);
3334 }
3335
3336 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3337 {
3338         struct net *net = sock_net(in_skb->sk);
3339         struct nlattr *tb[RTA_MAX+1];
3340         struct rt6_info *rt;
3341         struct sk_buff *skb;
3342         struct rtmsg *rtm;
3343         struct flowi6 fl6;
3344         int err, iif = 0, oif = 0;
3345
3346         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3347         if (err < 0)
3348                 goto errout;
3349
3350         err = -EINVAL;
3351         memset(&fl6, 0, sizeof(fl6));
3352         rtm = nlmsg_data(nlh);
3353         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3354
3355         if (tb[RTA_SRC]) {
3356                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3357                         goto errout;
3358
3359                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3360         }
3361
3362         if (tb[RTA_DST]) {
3363                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3364                         goto errout;
3365
3366                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3367         }
3368
3369         if (tb[RTA_IIF])
3370                 iif = nla_get_u32(tb[RTA_IIF]);
3371
3372         if (tb[RTA_OIF])
3373                 oif = nla_get_u32(tb[RTA_OIF]);
3374
3375         if (tb[RTA_MARK])
3376                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3377
3378         if (iif) {
3379                 struct net_device *dev;
3380                 int flags = 0;
3381
3382                 dev = __dev_get_by_index(net, iif);
3383                 if (!dev) {
3384                         err = -ENODEV;
3385                         goto errout;
3386                 }
3387
3388                 fl6.flowi6_iif = iif;
3389
3390                 if (!ipv6_addr_any(&fl6.saddr))
3391                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3392
3393                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3394                                                                flags);
3395         } else {
3396                 fl6.flowi6_oif = oif;
3397
3398                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3399         }
3400
3401         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3402         if (!skb) {
3403                 ip6_rt_put(rt);
3404                 err = -ENOBUFS;
3405                 goto errout;
3406         }
3407
3408         /* Reserve room for dummy headers, this skb can pass
3409            through good chunk of routing engine.
3410          */
3411         skb_reset_mac_header(skb);
3412         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3413
3414         skb_dst_set(skb, &rt->dst);
3415
3416         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3417                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3418                             nlh->nlmsg_seq, 0, 0, 0);
3419         if (err < 0) {
3420                 kfree_skb(skb);
3421                 goto errout;
3422         }
3423
3424         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3425 errout:
3426         return err;
3427 }
3428
3429 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3430                      unsigned int nlm_flags)
3431 {
3432         struct sk_buff *skb;
3433         struct net *net = info->nl_net;
3434         u32 seq;
3435         int err;
3436
3437         err = -ENOBUFS;
3438         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3439
3440         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3441         if (!skb)
3442                 goto errout;
3443
3444         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3445                                 event, info->portid, seq, 0, 0, nlm_flags);
3446         if (err < 0) {
3447                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3448                 WARN_ON(err == -EMSGSIZE);
3449                 kfree_skb(skb);
3450                 goto errout;
3451         }
3452         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3453                     info->nlh, gfp_any());
3454         return;
3455 errout:
3456         if (err < 0)
3457                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3458 }
3459
3460 static int ip6_route_dev_notify(struct notifier_block *this,
3461                                 unsigned long event, void *ptr)
3462 {
3463         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3464         struct net *net = dev_net(dev);
3465
3466         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3467                 net->ipv6.ip6_null_entry->dst.dev = dev;
3468                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3469 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3470                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3471                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3472                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3473                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3474 #endif
3475         }
3476
3477         return NOTIFY_OK;
3478 }
3479
3480 /*
3481  *      /proc
3482  */
3483
3484 #ifdef CONFIG_PROC_FS
3485
3486 static const struct file_operations ipv6_route_proc_fops = {
3487         .owner          = THIS_MODULE,
3488         .open           = ipv6_route_open,
3489         .read           = seq_read,
3490         .llseek         = seq_lseek,
3491         .release        = seq_release_net,
3492 };
3493
3494 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3495 {
3496         struct net *net = (struct net *)seq->private;
3497         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3498                    net->ipv6.rt6_stats->fib_nodes,
3499                    net->ipv6.rt6_stats->fib_route_nodes,
3500                    net->ipv6.rt6_stats->fib_rt_alloc,
3501                    net->ipv6.rt6_stats->fib_rt_entries,
3502                    net->ipv6.rt6_stats->fib_rt_cache,
3503                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3504                    net->ipv6.rt6_stats->fib_discarded_routes);
3505
3506         return 0;
3507 }
3508
3509 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3510 {
3511         return single_open_net(inode, file, rt6_stats_seq_show);
3512 }
3513
3514 static const struct file_operations rt6_stats_seq_fops = {
3515         .owner   = THIS_MODULE,
3516         .open    = rt6_stats_seq_open,
3517         .read    = seq_read,
3518         .llseek  = seq_lseek,
3519         .release = single_release_net,
3520 };
3521 #endif  /* CONFIG_PROC_FS */
3522
3523 #ifdef CONFIG_SYSCTL
3524
3525 static
3526 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3527                               void __user *buffer, size_t *lenp, loff_t *ppos)
3528 {
3529         struct net *net;
3530         int delay;
3531         if (!write)
3532                 return -EINVAL;
3533
3534         net = (struct net *)ctl->extra1;
3535         delay = net->ipv6.sysctl.flush_delay;
3536         proc_dointvec(ctl, write, buffer, lenp, ppos);
3537         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3538         return 0;
3539 }
3540
3541 struct ctl_table ipv6_route_table_template[] = {
3542         {
3543                 .procname       =       "flush",
3544                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3545                 .maxlen         =       sizeof(int),
3546                 .mode           =       0200,
3547                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3548         },
3549         {
3550                 .procname       =       "gc_thresh",
3551                 .data           =       &ip6_dst_ops_template.gc_thresh,
3552                 .maxlen         =       sizeof(int),
3553                 .mode           =       0644,
3554                 .proc_handler   =       proc_dointvec,
3555         },
3556         {
3557                 .procname       =       "max_size",
3558                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3559                 .maxlen         =       sizeof(int),
3560                 .mode           =       0644,
3561                 .proc_handler   =       proc_dointvec,
3562         },
3563         {
3564                 .procname       =       "gc_min_interval",
3565                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3566                 .maxlen         =       sizeof(int),
3567                 .mode           =       0644,
3568                 .proc_handler   =       proc_dointvec_jiffies,
3569         },
3570         {
3571                 .procname       =       "gc_timeout",
3572                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3573                 .maxlen         =       sizeof(int),
3574                 .mode           =       0644,
3575                 .proc_handler   =       proc_dointvec_jiffies,
3576         },
3577         {
3578                 .procname       =       "gc_interval",
3579                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3580                 .maxlen         =       sizeof(int),
3581                 .mode           =       0644,
3582                 .proc_handler   =       proc_dointvec_jiffies,
3583         },
3584         {
3585                 .procname       =       "gc_elasticity",
3586                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3587                 .maxlen         =       sizeof(int),
3588                 .mode           =       0644,
3589                 .proc_handler   =       proc_dointvec,
3590         },
3591         {
3592                 .procname       =       "mtu_expires",
3593                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3594                 .maxlen         =       sizeof(int),
3595                 .mode           =       0644,
3596                 .proc_handler   =       proc_dointvec_jiffies,
3597         },
3598         {
3599                 .procname       =       "min_adv_mss",
3600                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3601                 .maxlen         =       sizeof(int),
3602                 .mode           =       0644,
3603                 .proc_handler   =       proc_dointvec,
3604         },
3605         {
3606                 .procname       =       "gc_min_interval_ms",
3607                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3608                 .maxlen         =       sizeof(int),
3609                 .mode           =       0644,
3610                 .proc_handler   =       proc_dointvec_ms_jiffies,
3611         },
3612         { }
3613 };
3614
3615 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3616 {
3617         struct ctl_table *table;
3618
3619         table = kmemdup(ipv6_route_table_template,
3620                         sizeof(ipv6_route_table_template),
3621                         GFP_KERNEL);
3622
3623         if (table) {
3624                 table[0].data = &net->ipv6.sysctl.flush_delay;
3625                 table[0].extra1 = net;
3626                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3627                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3628                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3629                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3630                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3631                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3632                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3633                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3634                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3635
3636                 /* Don't export sysctls to unprivileged users */
3637                 if (net->user_ns != &init_user_ns)
3638                         table[0].procname = NULL;
3639         }
3640
3641         return table;
3642 }
3643 #endif
3644
3645 static int __net_init ip6_route_net_init(struct net *net)
3646 {
3647         int ret = -ENOMEM;
3648
3649         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3650                sizeof(net->ipv6.ip6_dst_ops));
3651
3652         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3653                 goto out_ip6_dst_ops;
3654
3655         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3656                                            sizeof(*net->ipv6.ip6_null_entry),
3657                                            GFP_KERNEL);
3658         if (!net->ipv6.ip6_null_entry)
3659                 goto out_ip6_dst_entries;
3660         net->ipv6.ip6_null_entry->dst.path =
3661                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3662         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3663         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3664                          ip6_template_metrics, true);
3665
3666 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3667         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3668                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3669                                                GFP_KERNEL);
3670         if (!net->ipv6.ip6_prohibit_entry)
3671                 goto out_ip6_null_entry;
3672         net->ipv6.ip6_prohibit_entry->dst.path =
3673                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3674         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3675         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3676                          ip6_template_metrics, true);
3677
3678         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3679                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3680                                                GFP_KERNEL);
3681         if (!net->ipv6.ip6_blk_hole_entry)
3682                 goto out_ip6_prohibit_entry;
3683         net->ipv6.ip6_blk_hole_entry->dst.path =
3684                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3685         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3686         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3687                          ip6_template_metrics, true);
3688 #endif
3689
3690         net->ipv6.sysctl.flush_delay = 0;
3691         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3692         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3693         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3694         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3695         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3696         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3697         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3698
3699         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3700
3701         ret = 0;
3702 out:
3703         return ret;
3704
3705 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3706 out_ip6_prohibit_entry:
3707         kfree(net->ipv6.ip6_prohibit_entry);
3708 out_ip6_null_entry:
3709         kfree(net->ipv6.ip6_null_entry);
3710 #endif
3711 out_ip6_dst_entries:
3712         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3713 out_ip6_dst_ops:
3714         goto out;
3715 }
3716
3717 static void __net_exit ip6_route_net_exit(struct net *net)
3718 {
3719         kfree(net->ipv6.ip6_null_entry);
3720 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3721         kfree(net->ipv6.ip6_prohibit_entry);
3722         kfree(net->ipv6.ip6_blk_hole_entry);
3723 #endif
3724         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3725 }
3726
3727 static int __net_init ip6_route_net_init_late(struct net *net)
3728 {
3729 #ifdef CONFIG_PROC_FS
3730         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3731         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3732 #endif
3733         return 0;
3734 }
3735
3736 static void __net_exit ip6_route_net_exit_late(struct net *net)
3737 {
3738 #ifdef CONFIG_PROC_FS
3739         remove_proc_entry("ipv6_route", net->proc_net);
3740         remove_proc_entry("rt6_stats", net->proc_net);
3741 #endif
3742 }
3743
3744 static struct pernet_operations ip6_route_net_ops = {
3745         .init = ip6_route_net_init,
3746         .exit = ip6_route_net_exit,
3747 };
3748
3749 static int __net_init ipv6_inetpeer_init(struct net *net)
3750 {
3751         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3752
3753         if (!bp)
3754                 return -ENOMEM;
3755         inet_peer_base_init(bp);
3756         net->ipv6.peers = bp;
3757         return 0;
3758 }
3759
3760 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3761 {
3762         struct inet_peer_base *bp = net->ipv6.peers;
3763
3764         net->ipv6.peers = NULL;
3765         inetpeer_invalidate_tree(bp);
3766         kfree(bp);
3767 }
3768
3769 static struct pernet_operations ipv6_inetpeer_ops = {
3770         .init   =       ipv6_inetpeer_init,
3771         .exit   =       ipv6_inetpeer_exit,
3772 };
3773
3774 static struct pernet_operations ip6_route_net_late_ops = {
3775         .init = ip6_route_net_init_late,
3776         .exit = ip6_route_net_exit_late,
3777 };
3778
3779 static struct notifier_block ip6_route_dev_notifier = {
3780         .notifier_call = ip6_route_dev_notify,
3781         .priority = 0,
3782 };
3783
3784 int __init ip6_route_init(void)
3785 {
3786         int ret;
3787         int cpu;
3788
3789         ret = -ENOMEM;
3790         ip6_dst_ops_template.kmem_cachep =
3791                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3792                                   SLAB_HWCACHE_ALIGN, NULL);
3793         if (!ip6_dst_ops_template.kmem_cachep)
3794                 goto out;
3795
3796         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3797         if (ret)
3798                 goto out_kmem_cache;
3799
3800         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3801         if (ret)
3802                 goto out_dst_entries;
3803
3804         ret = register_pernet_subsys(&ip6_route_net_ops);
3805         if (ret)
3806                 goto out_register_inetpeer;
3807
3808         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3809
3810         /* Registering of the loopback is done before this portion of code,
3811          * the loopback reference in rt6_info will not be taken, do it
3812          * manually for init_net */
3813         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3814         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3815   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3816         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3817         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3818         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3819         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3820   #endif
3821         ret = fib6_init();
3822         if (ret)
3823                 goto out_register_subsys;
3824
3825         ret = xfrm6_init();
3826         if (ret)
3827                 goto out_fib6_init;
3828
3829         ret = fib6_rules_init();
3830         if (ret)
3831                 goto xfrm6_init;
3832
3833         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3834         if (ret)
3835                 goto fib6_rules_init;
3836
3837         ret = -ENOBUFS;
3838         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3839             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3840             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3841                 goto out_register_late_subsys;
3842
3843         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3844         if (ret)
3845                 goto out_register_late_subsys;
3846
3847         for_each_possible_cpu(cpu) {
3848                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3849
3850                 INIT_LIST_HEAD(&ul->head);
3851                 spin_lock_init(&ul->lock);
3852         }
3853
3854 out:
3855         return ret;
3856
3857 out_register_late_subsys:
3858         unregister_pernet_subsys(&ip6_route_net_late_ops);
3859 fib6_rules_init:
3860         fib6_rules_cleanup();
3861 xfrm6_init:
3862         xfrm6_fini();
3863 out_fib6_init:
3864         fib6_gc_cleanup();
3865 out_register_subsys:
3866         unregister_pernet_subsys(&ip6_route_net_ops);
3867 out_register_inetpeer:
3868         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3869 out_dst_entries:
3870         dst_entries_destroy(&ip6_dst_blackhole_ops);
3871 out_kmem_cache:
3872         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3873         goto out;
3874 }
3875
3876 void ip6_route_cleanup(void)
3877 {
3878         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3879         unregister_pernet_subsys(&ip6_route_net_late_ops);
3880         fib6_rules_cleanup();
3881         xfrm6_fini();
3882         fib6_gc_cleanup();
3883         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3884         unregister_pernet_subsys(&ip6_route_net_ops);
3885         dst_entries_destroy(&ip6_dst_blackhole_ops);
3886         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3887 }