[IPV6] ROUTE: Routing by FWMARK.
[linux-2.6-microblaze.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  *      Ville Nuorvala
26  *              Fixed routing subtrees.
27  */
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41
42 #ifdef  CONFIG_PROC_FS
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #endif
46
47 #include <net/snmp.h>
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
53 #include <net/tcp.h>
54 #include <linux/rtnetlink.h>
55 #include <net/dst.h>
56 #include <net/xfrm.h>
57 #include <net/netevent.h>
58 #include <net/netlink.h>
59
60 #include <asm/uaccess.h>
61
62 #ifdef CONFIG_SYSCTL
63 #include <linux/sysctl.h>
64 #endif
65
/*
 * Debug verbosity for this file.  With RT6_DEBUG at 3 or above, RDBG()
 * and RT6_TRACE() expand to printk() calls; below that both are no-ops.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
/* Tracing compiled out: the arguments are discarded. */
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
/* RDBG() takes a complete, parenthesised printk() argument list. */
#define RDBG(x) printk x
/* RT6_TRACE() prefixes the caller's format string with KERN_DEBUG. */
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * When non-zero, the policy lookup functions clone routes that already
 * have a nexthop (via rt6_alloc_clone()); when zero they return such
 * routes unchanged.
 */
#define CLONE_OFFLINK_ROUTE 0
78
79 static int ip6_rt_max_size = 4096;
80 static int ip6_rt_gc_min_interval = HZ / 2;
81 static int ip6_rt_gc_timeout = 60*HZ;
82 int ip6_rt_gc_interval = 30*HZ;
83 static int ip6_rt_gc_elasticity = 9;
84 static int ip6_rt_mtu_expires = 10*60*HZ;
85 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
86
87 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
88 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
89 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
90 static void             ip6_dst_destroy(struct dst_entry *);
91 static void             ip6_dst_ifdown(struct dst_entry *,
92                                        struct net_device *dev, int how);
93 static int               ip6_dst_gc(void);
94
95 static int              ip6_pkt_discard(struct sk_buff *skb);
96 static int              ip6_pkt_discard_out(struct sk_buff *skb);
97 static void             ip6_link_failure(struct sk_buff *skb);
98 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
99
100 #ifdef CONFIG_IPV6_ROUTE_INFO
101 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
102                                            struct in6_addr *gwaddr, int ifindex,
103                                            unsigned pref);
104 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
105                                            struct in6_addr *gwaddr, int ifindex);
106 #endif
107
108 static struct dst_ops ip6_dst_ops = {
109         .family                 =       AF_INET6,
110         .protocol               =       __constant_htons(ETH_P_IPV6),
111         .gc                     =       ip6_dst_gc,
112         .gc_thresh              =       1024,
113         .check                  =       ip6_dst_check,
114         .destroy                =       ip6_dst_destroy,
115         .ifdown                 =       ip6_dst_ifdown,
116         .negative_advice        =       ip6_negative_advice,
117         .link_failure           =       ip6_link_failure,
118         .update_pmtu            =       ip6_rt_update_pmtu,
119         .entry_size             =       sizeof(struct rt6_info),
120 };
121
122 struct rt6_info ip6_null_entry = {
123         .u = {
124                 .dst = {
125                         .__refcnt       = ATOMIC_INIT(1),
126                         .__use          = 1,
127                         .dev            = &loopback_dev,
128                         .obsolete       = -1,
129                         .error          = -ENETUNREACH,
130                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
131                         .input          = ip6_pkt_discard,
132                         .output         = ip6_pkt_discard_out,
133                         .ops            = &ip6_dst_ops,
134                         .path           = (struct dst_entry*)&ip6_null_entry,
135                 }
136         },
137         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
138         .rt6i_metric    = ~(u32) 0,
139         .rt6i_ref       = ATOMIC_INIT(1),
140 };
141
142 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
143
144 struct rt6_info ip6_prohibit_entry = {
145         .u = {
146                 .dst = {
147                         .__refcnt       = ATOMIC_INIT(1),
148                         .__use          = 1,
149                         .dev            = &loopback_dev,
150                         .obsolete       = -1,
151                         .error          = -EACCES,
152                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
153                         .input          = ip6_pkt_discard,
154                         .output         = ip6_pkt_discard_out,
155                         .ops            = &ip6_dst_ops,
156                         .path           = (struct dst_entry*)&ip6_prohibit_entry,
157                 }
158         },
159         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
160         .rt6i_metric    = ~(u32) 0,
161         .rt6i_ref       = ATOMIC_INIT(1),
162 };
163
164 struct rt6_info ip6_blk_hole_entry = {
165         .u = {
166                 .dst = {
167                         .__refcnt       = ATOMIC_INIT(1),
168                         .__use          = 1,
169                         .dev            = &loopback_dev,
170                         .obsolete       = -1,
171                         .error          = -EINVAL,
172                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
173                         .input          = ip6_pkt_discard,
174                         .output         = ip6_pkt_discard_out,
175                         .ops            = &ip6_dst_ops,
176                         .path           = (struct dst_entry*)&ip6_blk_hole_entry,
177                 }
178         },
179         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
180         .rt6i_metric    = ~(u32) 0,
181         .rt6i_ref       = ATOMIC_INIT(1),
182 };
183
184 #endif
185
186 /* allocate dst with ip6_dst_ops */
187 static __inline__ struct rt6_info *ip6_dst_alloc(void)
188 {
189         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
190 }
191
192 static void ip6_dst_destroy(struct dst_entry *dst)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195         struct inet6_dev *idev = rt->rt6i_idev;
196
197         if (idev != NULL) {
198                 rt->rt6i_idev = NULL;
199                 in6_dev_put(idev);
200         }       
201 }
202
203 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
204                            int how)
205 {
206         struct rt6_info *rt = (struct rt6_info *)dst;
207         struct inet6_dev *idev = rt->rt6i_idev;
208
209         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
210                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
211                 if (loopback_idev != NULL) {
212                         rt->rt6i_idev = loopback_idev;
213                         in6_dev_put(idev);
214                 }
215         }
216 }
217
218 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
219 {
220         return (rt->rt6i_flags & RTF_EXPIRES &&
221                 time_after(jiffies, rt->rt6i_expires));
222 }
223
224 static inline int rt6_need_strict(struct in6_addr *daddr)
225 {
226         return (ipv6_addr_type(daddr) &
227                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
228 }
229
230 /*
231  *      Route lookup. Any table->tb6_lock is implied.
232  */
233
234 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
235                                                     int oif,
236                                                     int strict)
237 {
238         struct rt6_info *local = NULL;
239         struct rt6_info *sprt;
240
241         if (oif) {
242                 for (sprt = rt; sprt; sprt = sprt->u.next) {
243                         struct net_device *dev = sprt->rt6i_dev;
244                         if (dev->ifindex == oif)
245                                 return sprt;
246                         if (dev->flags & IFF_LOOPBACK) {
247                                 if (sprt->rt6i_idev == NULL ||
248                                     sprt->rt6i_idev->dev->ifindex != oif) {
249                                         if (strict && oif)
250                                                 continue;
251                                         if (local && (!oif || 
252                                                       local->rt6i_idev->dev->ifindex == oif))
253                                                 continue;
254                                 }
255                                 local = sprt;
256                         }
257                 }
258
259                 if (local)
260                         return local;
261
262                 if (strict)
263                         return &ip6_null_entry;
264         }
265         return rt;
266 }
267
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the nexthop neighbour of @rt is not in
 * a NUD_VALID state and the last update is older than the interface's
 * rtr_probe_interval, send a neighbour solicitation for it to its
 * solicited-node multicast address.  A NULL @rt, or a route without a
 * nexthop, is ignored.
 *
 * The original author's note says probes MUST be rate-limited to no more
 * than one per minute; here the limit is taken from
 * rt->rt6i_idev->cnf.rtr_probe_interval.
 *
 * NOTE(review): neigh->updated is written while only the read side of
 * neigh->lock is held -- confirm this is intended.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh;
        struct in6_addr mcaddr;
        struct in6_addr *target;
        int due;

        if (!rt)
                return;

        neigh = rt->rt6i_nexthop;
        /* Unlocked check, repeated below under the lock. */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;

        read_lock_bh(&neigh->lock);
        due = !(neigh->nud_state & NUD_VALID) &&
              time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
        if (due)
                neigh->updated = jiffies;
        read_unlock_bh(&neigh->lock);

        if (!due)
                return;

        /* The solicitation is sent after the lock has been dropped. */
        target = (struct in6_addr *)&neigh->primary_key;
        addrconf_addr_solict_mult(target, &mcaddr);
        ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
303
304 /*
305  * Default Router Selection (RFC 2461 6.3.6)
306  */
307 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
308 {
309         struct net_device *dev = rt->rt6i_dev;
310         if (!oif || dev->ifindex == oif)
311                 return 2;
312         if ((dev->flags & IFF_LOOPBACK) &&
313             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
314                 return 1;
315         return 0;
316 }
317
318 static int inline rt6_check_neigh(struct rt6_info *rt)
319 {
320         struct neighbour *neigh = rt->rt6i_nexthop;
321         int m = 0;
322         if (rt->rt6i_flags & RTF_NONEXTHOP ||
323             !(rt->rt6i_flags & RTF_GATEWAY))
324                 m = 1;
325         else if (neigh) {
326                 read_lock_bh(&neigh->lock);
327                 if (neigh->nud_state & NUD_VALID)
328                         m = 2;
329                 read_unlock_bh(&neigh->lock);
330         }
331         return m;
332 }
333
334 static int rt6_score_route(struct rt6_info *rt, int oif,
335                            int strict)
336 {
337         int m, n;
338                 
339         m = rt6_check_dev(rt, oif);
340         if (!m && (strict & RT6_LOOKUP_F_IFACE))
341                 return -1;
342 #ifdef CONFIG_IPV6_ROUTER_PREF
343         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
344 #endif
345         n = rt6_check_neigh(rt);
346         if (n > 1)
347                 m |= 16;
348         else if (!n && strict & RT6_LOOKUP_F_REACHABLE)
349                 return -1;
350         return m;
351 }
352
353 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
354                                    int strict)
355 {
356         struct rt6_info *match = NULL, *last = NULL;
357         struct rt6_info *rt, *rt0 = *head;
358         u32 metric;
359         int mpri = -1;
360
361         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
362                   __FUNCTION__, head, head ? *head : NULL, oif);
363
364         for (rt = rt0, metric = rt0->rt6i_metric;
365              rt && rt->rt6i_metric == metric && (!last || rt != rt0);
366              rt = rt->u.next) {
367                 int m;
368
369                 if (rt6_check_expired(rt))
370                         continue;
371
372                 last = rt;
373
374                 m = rt6_score_route(rt, oif, strict);
375                 if (m < 0)
376                         continue;
377
378                 if (m > mpri) {
379                         rt6_probe(match);
380                         match = rt;
381                         mpri = m;
382                 } else {
383                         rt6_probe(rt);
384                 }
385         }
386
387         if (!match &&
388             (strict & RT6_LOOKUP_F_REACHABLE) &&
389             last && last != rt0) {
390                 /* no entries matched; do round-robin */
391                 static DEFINE_SPINLOCK(lock);
392                 spin_lock(&lock);
393                 *head = rt0->u.next;
394                 rt0->u.next = last->u.next;
395                 last->u.next = rt0;
396                 spin_unlock(&lock);
397         }
398
399         RT6_TRACE("%s() => %p, score=%d\n",
400                   __FUNCTION__, match, mpri);
401
402         return (match ? match : &ip6_null_entry);
403 }
404
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - process a received Route Information option
 * @dev:    interface the option arrived on; its ifindex selects the route
 * @opt:    start of the option, interpreted as a struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address passed on to rt6_get_route_info() and
 *          rt6_add_route_info()
 *
 * Validates the option, then looks up the matching route.  A lifetime of
 * zero deletes an existing route; a non-zero lifetime adds the route if
 * it is missing or refreshes the preference bits of an existing one.  A
 * lifetime of 0xffffffff is treated as infinite (RTF_EXPIRES cleared);
 * any other value arms the expiry timer.
 *
 * Returns 0 when the option was processed (even if no route could be
 * added), or -EINVAL when the option is malformed.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  struct in6_addr *gwaddr)
{
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        u32 lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info)) {
                return -EINVAL;
        }

        /*
         * Sanity check for prefix_len and length: the option length (in
         * 8-byte units) must be large enough to hold the prefix bits.
         */
        if (rinfo->length > 3) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 128) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 64) {
                if (rinfo->length < 2) {
                        return -EINVAL;
                }
        } else if (rinfo->prefix_len > 0) {
                if (rinfo->length < 1) {
                        return -EINVAL;
                }
        }

        /* An invalid preference value is treated as medium. */
        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                pref = ICMPV6_ROUTER_PREF_MEDIUM;

        /* The lifetime arrives in network byte order. */
        lifetime = ntohl(rinfo->lifetime);
        if (lifetime == 0xffffffff) {
                /* infinity */
        } else if (lifetime > 0x7fffffff/HZ) {
                /* Avoid arithmetic overflow */
                lifetime = 0x7fffffff/HZ - 1;
        }

        if (rinfo->length == 3)
                prefix = (struct in6_addr *)rinfo->prefix;
        else {
                /*
                 * Shorter option: copy only prefix_len bits into a local
                 * buffer (the original author notes this helper is safe).
                 */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

        /* Zero lifetime withdraws an existing route. */
        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (!rt && lifetime)
                rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        else if (rt)
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

        if (rt) {
                if (lifetime == 0xffffffff) {
                        rt->rt6i_flags &= ~RTF_EXPIRES;
                } else {
                        rt->rt6i_expires = jiffies + HZ * lifetime;
                        rt->rt6i_flags |= RTF_EXPIRES;
                }
                /*
                 * Presumably drops the reference returned by the
                 * get/add helper -- TODO confirm against their bodies.
                 */
                dst_release(&rt->u.dst);
        }
        return 0;
}
#endif
482
/*
 * BACKTRACK(saddr) - climb the FIB tree after a failed match.
 *
 * Must be expanded inside a function that has the locals `rt` (result of
 * the match attempt) and `fn` (current fib6 node) and the labels
 * `restart` and `out`.  It does nothing unless rt is &ip6_null_entry.
 * Otherwise it walks from fn towards the root: at each step it moves to
 * the parent, or into the parent's source subtree (looked up with
 * @saddr) when there is one and fn is not that subtree.  It jumps to
 * `restart` at the first node flagged RTN_RTINFO, or to `out` once fn is
 * flagged RTN_TL_ROOT.
 */
#define BACKTRACK(saddr) \
do { \
        if (rt == &ip6_null_entry) { \
                struct fib6_node *parent; \
                while (fn) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        parent = fn->parent; \
                        fn = (FIB6_SUBTREE(parent) && FIB6_SUBTREE(parent) != fn) ? \
                                fib6_lookup(parent->subtree, NULL, saddr) : \
                                parent; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while(0)
500
501 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
502                                              struct flowi *fl, int flags)
503 {
504         struct fib6_node *fn;
505         struct rt6_info *rt;
506
507         read_lock_bh(&table->tb6_lock);
508         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
509 restart:
510         rt = fn->leaf;
511         rt = rt6_device_match(rt, fl->oif, flags);
512         BACKTRACK(&fl->fl6_src);
513         dst_hold(&rt->u.dst);
514 out:
515         read_unlock_bh(&table->tb6_lock);
516
517         rt->u.dst.lastuse = jiffies;
518         rt->u.dst.__use++;
519
520         return rt;
521
522 }
523
524 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
525                             int oif, int strict)
526 {
527         struct flowi fl = {
528                 .oif = oif,
529                 .nl_u = {
530                         .ip6_u = {
531                                 .daddr = *daddr,
532                                 /* TODO: saddr */
533                         },
534                 },
535         };
536         struct dst_entry *dst;
537         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
538
539         dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
540         if (dst->error == 0)
541                 return (struct rt6_info *) dst;
542
543         dst_release(dst);
544
545         return NULL;
546 }
547
548 /* ip6_ins_rt is called with FREE table->tb6_lock.
549    It takes new route entry, the addition fails by any reason the
550    route is freed. In any case, if caller does not hold it, it may
551    be destroyed.
552  */
553
554 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
555 {
556         int err;
557         struct fib6_table *table;
558
559         table = rt->rt6i_table;
560         write_lock_bh(&table->tb6_lock);
561         err = fib6_add(&table->tb6_root, rt, info);
562         write_unlock_bh(&table->tb6_lock);
563
564         return err;
565 }
566
567 int ip6_ins_rt(struct rt6_info *rt)
568 {
569         return __ip6_ins_rt(rt, NULL);
570 }
571
572 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
573                                       struct in6_addr *saddr)
574 {
575         struct rt6_info *rt;
576
577         /*
578          *      Clone the route.
579          */
580
581         rt = ip6_rt_copy(ort);
582
583         if (rt) {
584                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
585                         if (rt->rt6i_dst.plen != 128 &&
586                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
587                                 rt->rt6i_flags |= RTF_ANYCAST;
588                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
589                 }
590
591                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
592                 rt->rt6i_dst.plen = 128;
593                 rt->rt6i_flags |= RTF_CACHE;
594                 rt->u.dst.flags |= DST_HOST;
595
596 #ifdef CONFIG_IPV6_SUBTREES
597                 if (rt->rt6i_src.plen && saddr) {
598                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
599                         rt->rt6i_src.plen = 128;
600                 }
601 #endif
602
603                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
604
605         }
606
607         return rt;
608 }
609
610 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
611 {
612         struct rt6_info *rt = ip6_rt_copy(ort);
613         if (rt) {
614                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
615                 rt->rt6i_dst.plen = 128;
616                 rt->rt6i_flags |= RTF_CACHE;
617                 if (rt->rt6i_flags & RTF_REJECT)
618                         rt->u.dst.error = ort->u.dst.error;
619                 rt->u.dst.flags |= DST_HOST;
620                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
621         }
622         return rt;
623 }
624
625 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
626                                             struct flowi *fl, int flags)
627 {
628         struct fib6_node *fn;
629         struct rt6_info *rt, *nrt;
630         int strict = 0;
631         int attempts = 3;
632         int err;
633         int reachable = RT6_LOOKUP_F_REACHABLE;
634
635         strict |= flags & RT6_LOOKUP_F_IFACE;
636
637 relookup:
638         read_lock_bh(&table->tb6_lock);
639
640 restart_2:
641         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
642
643 restart:
644         rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
645         BACKTRACK(&fl->fl6_src);
646         if (rt == &ip6_null_entry ||
647             rt->rt6i_flags & RTF_CACHE)
648                 goto out;
649
650         dst_hold(&rt->u.dst);
651         read_unlock_bh(&table->tb6_lock);
652
653         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
654                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
655         else {
656 #if CLONE_OFFLINK_ROUTE
657                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
658 #else
659                 goto out2;
660 #endif
661         }
662
663         dst_release(&rt->u.dst);
664         rt = nrt ? : &ip6_null_entry;
665
666         dst_hold(&rt->u.dst);
667         if (nrt) {
668                 err = ip6_ins_rt(nrt);
669                 if (!err)
670                         goto out2;
671         }
672
673         if (--attempts <= 0)
674                 goto out2;
675
676         /*
677          * Race condition! In the gap, when table->tb6_lock was
678          * released someone could insert this route.  Relookup.
679          */
680         dst_release(&rt->u.dst);
681         goto relookup;
682
683 out:
684         if (reachable) {
685                 reachable = 0;
686                 goto restart_2;
687         }
688         dst_hold(&rt->u.dst);
689         read_unlock_bh(&table->tb6_lock);
690 out2:
691         rt->u.dst.lastuse = jiffies;
692         rt->u.dst.__use++;
693
694         return rt;
695 }
696
697 void ip6_route_input(struct sk_buff *skb)
698 {
699         struct ipv6hdr *iph = skb->nh.ipv6h;
700         struct flowi fl = {
701                 .iif = skb->dev->ifindex,
702                 .nl_u = {
703                         .ip6_u = {
704                                 .daddr = iph->daddr,
705                                 .saddr = iph->saddr,
706                                 .fwmark = skb->nfmark,
707                                 .flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
708                         },
709                 },
710                 .proto = iph->nexthdr,
711         };
712         int flags = rt6_need_strict(&iph->daddr) ? RT6_LOOKUP_F_IFACE : 0;
713
714         skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
715 }
716
717 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
718                                              struct flowi *fl, int flags)
719 {
720         struct fib6_node *fn;
721         struct rt6_info *rt, *nrt;
722         int strict = 0;
723         int attempts = 3;
724         int err;
725         int reachable = RT6_LOOKUP_F_REACHABLE;
726
727         strict |= flags & RT6_LOOKUP_F_IFACE;
728
729 relookup:
730         read_lock_bh(&table->tb6_lock);
731
732 restart_2:
733         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
734
735 restart:
736         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
737         BACKTRACK(&fl->fl6_src);
738         if (rt == &ip6_null_entry ||
739             rt->rt6i_flags & RTF_CACHE)
740                 goto out;
741
742         dst_hold(&rt->u.dst);
743         read_unlock_bh(&table->tb6_lock);
744
745         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
746                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
747         else {
748 #if CLONE_OFFLINK_ROUTE
749                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
750 #else
751                 goto out2;
752 #endif
753         }
754
755         dst_release(&rt->u.dst);
756         rt = nrt ? : &ip6_null_entry;
757
758         dst_hold(&rt->u.dst);
759         if (nrt) {
760                 err = ip6_ins_rt(nrt);
761                 if (!err)
762                         goto out2;
763         }
764
765         if (--attempts <= 0)
766                 goto out2;
767
768         /*
769          * Race condition! In the gap, when table->tb6_lock was
770          * released someone could insert this route.  Relookup.
771          */
772         dst_release(&rt->u.dst);
773         goto relookup;
774
775 out:
776         if (reachable) {
777                 reachable = 0;
778                 goto restart_2;
779         }
780         dst_hold(&rt->u.dst);
781         read_unlock_bh(&table->tb6_lock);
782 out2:
783         rt->u.dst.lastuse = jiffies;
784         rt->u.dst.__use++;
785         return rt;
786 }
787
788 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
789 {
790         int flags = 0;
791
792         if (rt6_need_strict(&fl->fl6_dst))
793                 flags |= RT6_LOOKUP_F_IFACE;
794
795         return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
796 }
797
798
799 /*
800  *      Destination cache support functions
801  */
802
803 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
804 {
805         struct rt6_info *rt;
806
807         rt = (struct rt6_info *) dst;
808
809         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
810                 return dst;
811
812         return NULL;
813 }
814
815 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
816 {
817         struct rt6_info *rt = (struct rt6_info *) dst;
818
819         if (rt) {
820                 if (rt->rt6i_flags & RTF_CACHE)
821                         ip6_del_rt(rt);
822                 else
823                         dst_release(dst);
824         }
825         return NULL;
826 }
827
828 static void ip6_link_failure(struct sk_buff *skb)
829 {
830         struct rt6_info *rt;
831
832         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
833
834         rt = (struct rt6_info *) skb->dst;
835         if (rt) {
836                 if (rt->rt6i_flags&RTF_CACHE) {
837                         dst_set_expires(&rt->u.dst, 0);
838                         rt->rt6i_flags |= RTF_EXPIRES;
839                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
840                         rt->rt6i_node->fn_sernum = -1;
841         }
842 }
843
844 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
845 {
846         struct rt6_info *rt6 = (struct rt6_info*)dst;
847
848         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
849                 rt6->rt6i_flags |= RTF_MODIFIED;
850                 if (mtu < IPV6_MIN_MTU) {
851                         mtu = IPV6_MIN_MTU;
852                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
853                 }
854                 dst->metrics[RTAX_MTU-1] = mtu;
855                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
856         }
857 }
858
859 static int ipv6_get_mtu(struct net_device *dev);
860
861 static inline unsigned int ipv6_advmss(unsigned int mtu)
862 {
863         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
864
865         if (mtu < ip6_rt_min_advmss)
866                 mtu = ip6_rt_min_advmss;
867
868         /*
869          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
870          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
871          * IPV6_MAXPLEN is also valid and means: "any MSS, 
872          * rely only on pmtu discovery"
873          */
874         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
875                 mtu = IPV6_MAXPLEN;
876         return mtu;
877 }
878
/*
 * Singly linked list (through dst->next) of the entries created by
 * ndisc_dst_alloc() and reaped by ndisc_dst_gc().  Both functions take
 * ndisc_lock around their list accesses.
 */
static DEFINE_SPINLOCK(ndisc_lock);
static struct dst_entry *ndisc_dst_gc_list;
881
882 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
883                                   struct neighbour *neigh,
884                                   struct in6_addr *addr,
885                                   int (*output)(struct sk_buff *))
886 {
887         struct rt6_info *rt;
888         struct inet6_dev *idev = in6_dev_get(dev);
889
890         if (unlikely(idev == NULL))
891                 return NULL;
892
893         rt = ip6_dst_alloc();
894         if (unlikely(rt == NULL)) {
895                 in6_dev_put(idev);
896                 goto out;
897         }
898
899         dev_hold(dev);
900         if (neigh)
901                 neigh_hold(neigh);
902         else
903                 neigh = ndisc_get_neigh(dev, addr);
904
905         rt->rt6i_dev      = dev;
906         rt->rt6i_idev     = idev;
907         rt->rt6i_nexthop  = neigh;
908         atomic_set(&rt->u.dst.__refcnt, 1);
909         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
910         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
911         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
912         rt->u.dst.output  = output;
913
914 #if 0   /* there's no chance to use these for ndisc */
915         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
916                                 ? DST_HOST 
917                                 : 0;
918         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
919         rt->rt6i_dst.plen = 128;
920 #endif
921
922         spin_lock_bh(&ndisc_lock);
923         rt->u.dst.next = ndisc_dst_gc_list;
924         ndisc_dst_gc_list = &rt->u.dst;
925         spin_unlock_bh(&ndisc_lock);
926
927         fib6_force_start_gc();
928
929 out:
930         return (struct dst_entry *)rt;
931 }
932
933 int ndisc_dst_gc(int *more)
934 {
935         struct dst_entry *dst, *next, **pprev;
936         int freed;
937
938         next = NULL;
939         freed = 0;
940
941         spin_lock_bh(&ndisc_lock);
942         pprev = &ndisc_dst_gc_list;
943
944         while ((dst = *pprev) != NULL) {
945                 if (!atomic_read(&dst->__refcnt)) {
946                         *pprev = dst->next;
947                         dst_free(dst);
948                         freed++;
949                 } else {
950                         pprev = &dst->next;
951                         (*more)++;
952                 }
953         }
954
955         spin_unlock_bh(&ndisc_lock);
956
957         return freed;
958 }
959
960 static int ip6_dst_gc(void)
961 {
962         static unsigned expire = 30*HZ;
963         static unsigned long last_gc;
964         unsigned long now = jiffies;
965
966         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
967             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
968                 goto out;
969
970         expire++;
971         fib6_run_gc(expire);
972         last_gc = now;
973         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
974                 expire = ip6_rt_gc_timeout>>1;
975
976 out:
977         expire -= expire>>ip6_rt_gc_elasticity;
978         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
979 }
980
981 /* Clean host part of a prefix. Not necessary in radix tree,
982    but results in cleaner routing tables.
983
984    Remove it only when all the things will work!
985  */
986
987 static int ipv6_get_mtu(struct net_device *dev)
988 {
989         int mtu = IPV6_MIN_MTU;
990         struct inet6_dev *idev;
991
992         idev = in6_dev_get(dev);
993         if (idev) {
994                 mtu = idev->cnf.mtu6;
995                 in6_dev_put(idev);
996         }
997         return mtu;
998 }
999
1000 int ipv6_get_hoplimit(struct net_device *dev)
1001 {
1002         int hoplimit = ipv6_devconf.hop_limit;
1003         struct inet6_dev *idev;
1004
1005         idev = in6_dev_get(dev);
1006         if (idev) {
1007                 hoplimit = idev->cnf.hop_limit;
1008                 in6_dev_put(idev);
1009         }
1010         return hoplimit;
1011 }
1012
1013 /*
1014  *
1015  */
1016
1017 int ip6_route_add(struct fib6_config *cfg)
1018 {
1019         int err;
1020         struct rt6_info *rt = NULL;
1021         struct net_device *dev = NULL;
1022         struct inet6_dev *idev = NULL;
1023         struct fib6_table *table;
1024         int addr_type;
1025
1026         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1027                 return -EINVAL;
1028 #ifndef CONFIG_IPV6_SUBTREES
1029         if (cfg->fc_src_len)
1030                 return -EINVAL;
1031 #endif
1032         if (cfg->fc_ifindex) {
1033                 err = -ENODEV;
1034                 dev = dev_get_by_index(cfg->fc_ifindex);
1035                 if (!dev)
1036                         goto out;
1037                 idev = in6_dev_get(dev);
1038                 if (!idev)
1039                         goto out;
1040         }
1041
1042         if (cfg->fc_metric == 0)
1043                 cfg->fc_metric = IP6_RT_PRIO_USER;
1044
1045         table = fib6_new_table(cfg->fc_table);
1046         if (table == NULL) {
1047                 err = -ENOBUFS;
1048                 goto out;
1049         }
1050
1051         rt = ip6_dst_alloc();
1052
1053         if (rt == NULL) {
1054                 err = -ENOMEM;
1055                 goto out;
1056         }
1057
1058         rt->u.dst.obsolete = -1;
1059         rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1060
1061         if (cfg->fc_protocol == RTPROT_UNSPEC)
1062                 cfg->fc_protocol = RTPROT_BOOT;
1063         rt->rt6i_protocol = cfg->fc_protocol;
1064
1065         addr_type = ipv6_addr_type(&cfg->fc_dst);
1066
1067         if (addr_type & IPV6_ADDR_MULTICAST)
1068                 rt->u.dst.input = ip6_mc_input;
1069         else
1070                 rt->u.dst.input = ip6_forward;
1071
1072         rt->u.dst.output = ip6_output;
1073
1074         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1075         rt->rt6i_dst.plen = cfg->fc_dst_len;
1076         if (rt->rt6i_dst.plen == 128)
1077                rt->u.dst.flags = DST_HOST;
1078
1079 #ifdef CONFIG_IPV6_SUBTREES
1080         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1081         rt->rt6i_src.plen = cfg->fc_src_len;
1082 #endif
1083
1084         rt->rt6i_metric = cfg->fc_metric;
1085
1086         /* We cannot add true routes via loopback here,
1087            they would result in kernel looping; promote them to reject routes
1088          */
1089         if ((cfg->fc_flags & RTF_REJECT) ||
1090             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1091                 /* hold loopback dev/idev if we haven't done so. */
1092                 if (dev != &loopback_dev) {
1093                         if (dev) {
1094                                 dev_put(dev);
1095                                 in6_dev_put(idev);
1096                         }
1097                         dev = &loopback_dev;
1098                         dev_hold(dev);
1099                         idev = in6_dev_get(dev);
1100                         if (!idev) {
1101                                 err = -ENODEV;
1102                                 goto out;
1103                         }
1104                 }
1105                 rt->u.dst.output = ip6_pkt_discard_out;
1106                 rt->u.dst.input = ip6_pkt_discard;
1107                 rt->u.dst.error = -ENETUNREACH;
1108                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1109                 goto install_route;
1110         }
1111
1112         if (cfg->fc_flags & RTF_GATEWAY) {
1113                 struct in6_addr *gw_addr;
1114                 int gwa_type;
1115
1116                 gw_addr = &cfg->fc_gateway;
1117                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1118                 gwa_type = ipv6_addr_type(gw_addr);
1119
1120                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1121                         struct rt6_info *grt;
1122
1123                         /* IPv6 strictly inhibits using not link-local
1124                            addresses as nexthop address.
1125                            Otherwise, router will not able to send redirects.
1126                            It is very good, but in some (rare!) circumstances
1127                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1128                            some exceptions. --ANK
1129                          */
1130                         err = -EINVAL;
1131                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1132                                 goto out;
1133
1134                         grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1135
1136                         err = -EHOSTUNREACH;
1137                         if (grt == NULL)
1138                                 goto out;
1139                         if (dev) {
1140                                 if (dev != grt->rt6i_dev) {
1141                                         dst_release(&grt->u.dst);
1142                                         goto out;
1143                                 }
1144                         } else {
1145                                 dev = grt->rt6i_dev;
1146                                 idev = grt->rt6i_idev;
1147                                 dev_hold(dev);
1148                                 in6_dev_hold(grt->rt6i_idev);
1149                         }
1150                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1151                                 err = 0;
1152                         dst_release(&grt->u.dst);
1153
1154                         if (err)
1155                                 goto out;
1156                 }
1157                 err = -EINVAL;
1158                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1159                         goto out;
1160         }
1161
1162         err = -ENODEV;
1163         if (dev == NULL)
1164                 goto out;
1165
1166         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1167                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1168                 if (IS_ERR(rt->rt6i_nexthop)) {
1169                         err = PTR_ERR(rt->rt6i_nexthop);
1170                         rt->rt6i_nexthop = NULL;
1171                         goto out;
1172                 }
1173         }
1174
1175         rt->rt6i_flags = cfg->fc_flags;
1176
1177 install_route:
1178         if (cfg->fc_mx) {
1179                 struct nlattr *nla;
1180                 int remaining;
1181
1182                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1183                         int type = nla->nla_type;
1184
1185                         if (type) {
1186                                 if (type > RTAX_MAX) {
1187                                         err = -EINVAL;
1188                                         goto out;
1189                                 }
1190
1191                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1192                         }
1193                 }
1194         }
1195
1196         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1197                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1198         if (!rt->u.dst.metrics[RTAX_MTU-1])
1199                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1200         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1201                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1202         rt->u.dst.dev = dev;
1203         rt->rt6i_idev = idev;
1204         rt->rt6i_table = table;
1205         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1206
1207 out:
1208         if (dev)
1209                 dev_put(dev);
1210         if (idev)
1211                 in6_dev_put(idev);
1212         if (rt)
1213                 dst_free((struct dst_entry *) rt);
1214         return err;
1215 }
1216
1217 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1218 {
1219         int err;
1220         struct fib6_table *table;
1221
1222         if (rt == &ip6_null_entry)
1223                 return -ENOENT;
1224
1225         table = rt->rt6i_table;
1226         write_lock_bh(&table->tb6_lock);
1227
1228         err = fib6_del(rt, info);
1229         dst_release(&rt->u.dst);
1230
1231         write_unlock_bh(&table->tb6_lock);
1232
1233         return err;
1234 }
1235
1236 int ip6_del_rt(struct rt6_info *rt)
1237 {
1238         return __ip6_del_rt(rt, NULL);
1239 }
1240
1241 static int ip6_route_del(struct fib6_config *cfg)
1242 {
1243         struct fib6_table *table;
1244         struct fib6_node *fn;
1245         struct rt6_info *rt;
1246         int err = -ESRCH;
1247
1248         table = fib6_get_table(cfg->fc_table);
1249         if (table == NULL)
1250                 return err;
1251
1252         read_lock_bh(&table->tb6_lock);
1253
1254         fn = fib6_locate(&table->tb6_root,
1255                          &cfg->fc_dst, cfg->fc_dst_len,
1256                          &cfg->fc_src, cfg->fc_src_len);
1257         
1258         if (fn) {
1259                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1260                         if (cfg->fc_ifindex &&
1261                             (rt->rt6i_dev == NULL ||
1262                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1263                                 continue;
1264                         if (cfg->fc_flags & RTF_GATEWAY &&
1265                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1266                                 continue;
1267                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1268                                 continue;
1269                         dst_hold(&rt->u.dst);
1270                         read_unlock_bh(&table->tb6_lock);
1271
1272                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1273                 }
1274         }
1275         read_unlock_bh(&table->tb6_lock);
1276
1277         return err;
1278 }
1279
1280 /*
1281  *      Handle redirects
1282  */
/*
 * Flow key for redirect lookups: a plain flowi followed by the address of
 * the gateway the redirect came from.  'fl' has to stay the first member,
 * because ip6_route_redirect() passes this structure as a struct flowi *
 * and __ip6_route_redirect() casts that pointer back.
 */
struct ip6rd_flowi {
	struct flowi fl;
	struct in6_addr gateway;	/* compared against rt6i_gateway of candidate routes */
};
1287
1288 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1289                                              struct flowi *fl,
1290                                              int flags)
1291 {
1292         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1293         struct rt6_info *rt;
1294         struct fib6_node *fn;
1295
1296         /*
1297          * Get the "current" route for this destination and
1298          * check if the redirect has come from approriate router.
1299          *
1300          * RFC 2461 specifies that redirects should only be
1301          * accepted if they come from the nexthop to the target.
1302          * Due to the way the routes are chosen, this notion
1303          * is a bit fuzzy and one might need to check all possible
1304          * routes.
1305          */
1306
1307         read_lock_bh(&table->tb6_lock);
1308         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1309 restart:
1310         for (rt = fn->leaf; rt; rt = rt->u.next) {
1311                 /*
1312                  * Current route is on-link; redirect is always invalid.
1313                  *
1314                  * Seems, previous statement is not true. It could
1315                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1316                  * But then router serving it might decide, that we should
1317                  * know truth 8)8) --ANK (980726).
1318                  */
1319                 if (rt6_check_expired(rt))
1320                         continue;
1321                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1322                         continue;
1323                 if (fl->oif != rt->rt6i_dev->ifindex)
1324                         continue;
1325                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1326                         continue;
1327                 break;
1328         }
1329
1330         if (!rt)
1331                 rt = &ip6_null_entry;
1332         BACKTRACK(&fl->fl6_src);
1333 out:
1334         dst_hold(&rt->u.dst);
1335
1336         read_unlock_bh(&table->tb6_lock);
1337
1338         return rt;
1339 };
1340
1341 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1342                                            struct in6_addr *src,
1343                                            struct in6_addr *gateway,
1344                                            struct net_device *dev)
1345 {
1346         struct ip6rd_flowi rdfl = {
1347                 .fl = {
1348                         .oif = dev->ifindex,
1349                         .nl_u = {
1350                                 .ip6_u = {
1351                                         .daddr = *dest,
1352                                         .saddr = *src,
1353                                 },
1354                         },
1355                 },
1356                 .gateway = *gateway,
1357         };
1358         int flags = rt6_need_strict(dest) ? RT6_LOOKUP_F_IFACE : 0;
1359
1360         return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
1361 }
1362
1363 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1364                   struct in6_addr *saddr,
1365                   struct neighbour *neigh, u8 *lladdr, int on_link)
1366 {
1367         struct rt6_info *rt, *nrt = NULL;
1368         struct netevent_redirect netevent;
1369
1370         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1371
1372         if (rt == &ip6_null_entry) {
1373                 if (net_ratelimit())
1374                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1375                                "for redirect target\n");
1376                 goto out;
1377         }
1378
1379         /*
1380          *      We have finally decided to accept it.
1381          */
1382
1383         neigh_update(neigh, lladdr, NUD_STALE, 
1384                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1385                      NEIGH_UPDATE_F_OVERRIDE|
1386                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1387                                      NEIGH_UPDATE_F_ISROUTER))
1388                      );
1389
1390         /*
1391          * Redirect received -> path was valid.
1392          * Look, redirects are sent only in response to data packets,
1393          * so that this nexthop apparently is reachable. --ANK
1394          */
1395         dst_confirm(&rt->u.dst);
1396
1397         /* Duplicate redirect: silently ignore. */
1398         if (neigh == rt->u.dst.neighbour)
1399                 goto out;
1400
1401         nrt = ip6_rt_copy(rt);
1402         if (nrt == NULL)
1403                 goto out;
1404
1405         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1406         if (on_link)
1407                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1408
1409         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1410         nrt->rt6i_dst.plen = 128;
1411         nrt->u.dst.flags |= DST_HOST;
1412
1413         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1414         nrt->rt6i_nexthop = neigh_clone(neigh);
1415         /* Reset pmtu, it may be better */
1416         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1417         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1418
1419         if (ip6_ins_rt(nrt))
1420                 goto out;
1421
1422         netevent.old = &rt->u.dst;
1423         netevent.new = &nrt->u.dst;
1424         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1425
1426         if (rt->rt6i_flags&RTF_CACHE) {
1427                 ip6_del_rt(rt);
1428                 return;
1429         }
1430
1431 out:
1432         dst_release(&rt->u.dst);
1433         return;
1434 }
1435
1436 /*
1437  *      Handle ICMP "packet too big" messages
1438  *      i.e. Path MTU discovery
1439  */
1440
1441 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1442                         struct net_device *dev, u32 pmtu)
1443 {
1444         struct rt6_info *rt, *nrt;
1445         int allfrag = 0;
1446
1447         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1448         if (rt == NULL)
1449                 return;
1450
1451         if (pmtu >= dst_mtu(&rt->u.dst))
1452                 goto out;
1453
1454         if (pmtu < IPV6_MIN_MTU) {
1455                 /*
1456                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1457                  * MTU (1280) and a fragment header should always be included
1458                  * after a node receiving Too Big message reporting PMTU is
1459                  * less than the IPv6 Minimum Link MTU.
1460                  */
1461                 pmtu = IPV6_MIN_MTU;
1462                 allfrag = 1;
1463         }
1464
1465         /* New mtu received -> path was valid.
1466            They are sent only in response to data packets,
1467            so that this nexthop apparently is reachable. --ANK
1468          */
1469         dst_confirm(&rt->u.dst);
1470
1471         /* Host route. If it is static, it would be better
1472            not to override it, but add new one, so that
1473            when cache entry will expire old pmtu
1474            would return automatically.
1475          */
1476         if (rt->rt6i_flags & RTF_CACHE) {
1477                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1478                 if (allfrag)
1479                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1480                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1481                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1482                 goto out;
1483         }
1484
1485         /* Network route.
1486            Two cases are possible:
1487            1. It is connected route. Action: COW
1488            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1489          */
1490         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1491                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1492         else
1493                 nrt = rt6_alloc_clone(rt, daddr);
1494
1495         if (nrt) {
1496                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1497                 if (allfrag)
1498                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1499
1500                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1501                  * happened within 5 mins, the recommended timer is 10 mins.
1502                  * Here this route expiration time is set to ip6_rt_mtu_expires
1503                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1504                  * and detecting PMTU increase will be automatically happened.
1505                  */
1506                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1507                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1508
1509                 ip6_ins_rt(nrt);
1510         }
1511 out:
1512         dst_release(&rt->u.dst);
1513 }
1514
1515 /*
1516  *      Misc support functions
1517  */
1518
1519 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1520 {
1521         struct rt6_info *rt = ip6_dst_alloc();
1522
1523         if (rt) {
1524                 rt->u.dst.input = ort->u.dst.input;
1525                 rt->u.dst.output = ort->u.dst.output;
1526
1527                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1528                 rt->u.dst.dev = ort->u.dst.dev;
1529                 if (rt->u.dst.dev)
1530                         dev_hold(rt->u.dst.dev);
1531                 rt->rt6i_idev = ort->rt6i_idev;
1532                 if (rt->rt6i_idev)
1533                         in6_dev_hold(rt->rt6i_idev);
1534                 rt->u.dst.lastuse = jiffies;
1535                 rt->rt6i_expires = 0;
1536
1537                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1538                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1539                 rt->rt6i_metric = 0;
1540
1541                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1542 #ifdef CONFIG_IPV6_SUBTREES
1543                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1544 #endif
1545                 rt->rt6i_table = ort->rt6i_table;
1546         }
1547         return rt;
1548 }
1549
1550 #ifdef CONFIG_IPV6_ROUTE_INFO
1551 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1552                                            struct in6_addr *gwaddr, int ifindex)
1553 {
1554         struct fib6_node *fn;
1555         struct rt6_info *rt = NULL;
1556         struct fib6_table *table;
1557
1558         table = fib6_get_table(RT6_TABLE_INFO);
1559         if (table == NULL)
1560                 return NULL;
1561
1562         write_lock_bh(&table->tb6_lock);
1563         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1564         if (!fn)
1565                 goto out;
1566
1567         for (rt = fn->leaf; rt; rt = rt->u.next) {
1568                 if (rt->rt6i_dev->ifindex != ifindex)
1569                         continue;
1570                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1571                         continue;
1572                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1573                         continue;
1574                 dst_hold(&rt->u.dst);
1575                 break;
1576         }
1577 out:
1578         write_unlock_bh(&table->tb6_lock);
1579         return rt;
1580 }
1581
1582 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1583                                            struct in6_addr *gwaddr, int ifindex,
1584                                            unsigned pref)
1585 {
1586         struct fib6_config cfg = {
1587                 .fc_table       = RT6_TABLE_INFO,
1588                 .fc_metric      = 1024,
1589                 .fc_ifindex     = ifindex,
1590                 .fc_dst_len     = prefixlen,
1591                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1592                                   RTF_UP | RTF_PREF(pref),
1593         };
1594
1595         ipv6_addr_copy(&cfg.fc_dst, prefix);
1596         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1597
1598         /* We should treat it as a default route if prefix length is 0. */
1599         if (!prefixlen)
1600                 cfg.fc_flags |= RTF_DEFAULT;
1601
1602         ip6_route_add(&cfg);
1603
1604         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1605 }
1606 #endif
1607
1608 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1609 {       
1610         struct rt6_info *rt;
1611         struct fib6_table *table;
1612
1613         table = fib6_get_table(RT6_TABLE_DFLT);
1614         if (table == NULL)
1615                 return NULL;
1616
1617         write_lock_bh(&table->tb6_lock);
1618         for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1619                 if (dev == rt->rt6i_dev &&
1620                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1621                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1622                         break;
1623         }
1624         if (rt)
1625                 dst_hold(&rt->u.dst);
1626         write_unlock_bh(&table->tb6_lock);
1627         return rt;
1628 }
1629
1630 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1631                                      struct net_device *dev,
1632                                      unsigned int pref)
1633 {
1634         struct fib6_config cfg = {
1635                 .fc_table       = RT6_TABLE_DFLT,
1636                 .fc_metric      = 1024,
1637                 .fc_ifindex     = dev->ifindex,
1638                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1639                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1640         };
1641
1642         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1643
1644         ip6_route_add(&cfg);
1645
1646         return rt6_get_dflt_router(gwaddr, dev);
1647 }
1648
1649 void rt6_purge_dflt_routers(void)
1650 {
1651         struct rt6_info *rt;
1652         struct fib6_table *table;
1653
1654         /* NOTE: Keep consistent with rt6_get_dflt_router */
1655         table = fib6_get_table(RT6_TABLE_DFLT);
1656         if (table == NULL)
1657                 return;
1658
1659 restart:
1660         read_lock_bh(&table->tb6_lock);
1661         for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1662                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1663                         dst_hold(&rt->u.dst);
1664                         read_unlock_bh(&table->tb6_lock);
1665                         ip6_del_rt(rt);
1666                         goto restart;
1667                 }
1668         }
1669         read_unlock_bh(&table->tb6_lock);
1670 }
1671
1672 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1673                                  struct fib6_config *cfg)
1674 {
1675         memset(cfg, 0, sizeof(*cfg));
1676
1677         cfg->fc_table = RT6_TABLE_MAIN;
1678         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1679         cfg->fc_metric = rtmsg->rtmsg_metric;
1680         cfg->fc_expires = rtmsg->rtmsg_info;
1681         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1682         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1683         cfg->fc_flags = rtmsg->rtmsg_flags;
1684
1685         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1686         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1687         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1688 }
1689
1690 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1691 {
1692         struct fib6_config cfg;
1693         struct in6_rtmsg rtmsg;
1694         int err;
1695
1696         switch(cmd) {
1697         case SIOCADDRT:         /* Add a route */
1698         case SIOCDELRT:         /* Delete a route */
1699                 if (!capable(CAP_NET_ADMIN))
1700                         return -EPERM;
1701                 err = copy_from_user(&rtmsg, arg,
1702                                      sizeof(struct in6_rtmsg));
1703                 if (err)
1704                         return -EFAULT;
1705
1706                 rtmsg_to_fib6_config(&rtmsg, &cfg);
1707
1708                 rtnl_lock();
1709                 switch (cmd) {
1710                 case SIOCADDRT:
1711                         err = ip6_route_add(&cfg);
1712                         break;
1713                 case SIOCDELRT:
1714                         err = ip6_route_del(&cfg);
1715                         break;
1716                 default:
1717                         err = -EINVAL;
1718                 }
1719                 rtnl_unlock();
1720
1721                 return err;
1722         };
1723
1724         return -EINVAL;
1725 }
1726
1727 /*
1728  *      Drop the packet on the floor
1729  */
1730
1731 static int ip6_pkt_discard(struct sk_buff *skb)
1732 {
1733         int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1734         if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1735                 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1736
1737         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1738         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1739         kfree_skb(skb);
1740         return 0;
1741 }
1742
1743 static int ip6_pkt_discard_out(struct sk_buff *skb)
1744 {
1745         skb->dev = skb->dst->dev;
1746         return ip6_pkt_discard(skb);
1747 }
1748
1749 /*
1750  *      Allocate a dst for local (unicast / anycast) address.
1751  */
1752
1753 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1754                                     const struct in6_addr *addr,
1755                                     int anycast)
1756 {
1757         struct rt6_info *rt = ip6_dst_alloc();
1758
1759         if (rt == NULL)
1760                 return ERR_PTR(-ENOMEM);
1761
1762         dev_hold(&loopback_dev);
1763         in6_dev_hold(idev);
1764
1765         rt->u.dst.flags = DST_HOST;
1766         rt->u.dst.input = ip6_input;
1767         rt->u.dst.output = ip6_output;
1768         rt->rt6i_dev = &loopback_dev;
1769         rt->rt6i_idev = idev;
1770         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1771         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1772         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1773         rt->u.dst.obsolete = -1;
1774
1775         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1776         if (anycast)
1777                 rt->rt6i_flags |= RTF_ANYCAST;
1778         else
1779                 rt->rt6i_flags |= RTF_LOCAL;
1780         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1781         if (rt->rt6i_nexthop == NULL) {
1782                 dst_free((struct dst_entry *) rt);
1783                 return ERR_PTR(-ENOMEM);
1784         }
1785
1786         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1787         rt->rt6i_dst.plen = 128;
1788         rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1789
1790         atomic_set(&rt->u.dst.__refcnt, 1);
1791
1792         return rt;
1793 }
1794
1795 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1796 {
1797         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1798             rt != &ip6_null_entry) {
1799                 RT6_TRACE("deleted by ifdown %p\n", rt);
1800                 return -1;
1801         }
1802         return 0;
1803 }
1804
1805 void rt6_ifdown(struct net_device *dev)
1806 {
1807         fib6_clean_all(fib6_ifdown, 0, dev);
1808 }
1809
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
1815
1816 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1817 {
1818         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1819         struct inet6_dev *idev;
1820
1821         /* In IPv6 pmtu discovery is not optional,
1822            so that RTAX_MTU lock cannot disable it.
1823            We still use this lock to block changes
1824            caused by addrconf/ndisc.
1825         */
1826
1827         idev = __in6_dev_get(arg->dev);
1828         if (idev == NULL)
1829                 return 0;
1830
1831         /* For administrative MTU increase, there is no way to discover
1832            IPv6 PMTU increase, so PMTU increase should be updated here.
1833            Since RFC 1981 doesn't include administrative MTU increase
1834            update PMTU increase is a MUST. (i.e. jumbo frame)
1835          */
1836         /*
1837            If new MTU is less than route PMTU, this new MTU will be the
1838            lowest MTU in the path, update the route PMTU to reflect PMTU
1839            decreases; if new MTU is greater than route PMTU, and the
1840            old MTU is the lowest MTU in the path, update the route PMTU
1841            to reflect the increase. In this case if the other nodes' MTU
1842            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1843            PMTU discouvery.
1844          */
1845         if (rt->rt6i_dev == arg->dev &&
1846             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1847             (dst_mtu(&rt->u.dst) > arg->mtu ||
1848              (dst_mtu(&rt->u.dst) < arg->mtu &&
1849               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1850                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1851         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1852         return 0;
1853 }
1854
1855 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1856 {
1857         struct rt6_mtu_change_arg arg = {
1858                 .dev = dev,
1859                 .mtu = mtu,
1860         };
1861
1862         fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1863 }
1864
1865 static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
1866         [RTA_GATEWAY]           = { .minlen = sizeof(struct in6_addr) },
1867         [RTA_OIF]               = { .type = NLA_U32 },
1868         [RTA_IIF]               = { .type = NLA_U32 },
1869         [RTA_PRIORITY]          = { .type = NLA_U32 },
1870         [RTA_METRICS]           = { .type = NLA_NESTED },
1871 };
1872
1873 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1874                               struct fib6_config *cfg)
1875 {
1876         struct rtmsg *rtm;
1877         struct nlattr *tb[RTA_MAX+1];
1878         int err;
1879
1880         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1881         if (err < 0)
1882                 goto errout;
1883
1884         err = -EINVAL;
1885         rtm = nlmsg_data(nlh);
1886         memset(cfg, 0, sizeof(*cfg));
1887
1888         cfg->fc_table = rtm->rtm_table;
1889         cfg->fc_dst_len = rtm->rtm_dst_len;
1890         cfg->fc_src_len = rtm->rtm_src_len;
1891         cfg->fc_flags = RTF_UP;
1892         cfg->fc_protocol = rtm->rtm_protocol;
1893
1894         if (rtm->rtm_type == RTN_UNREACHABLE)
1895                 cfg->fc_flags |= RTF_REJECT;
1896
1897         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1898         cfg->fc_nlinfo.nlh = nlh;
1899
1900         if (tb[RTA_GATEWAY]) {
1901                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1902                 cfg->fc_flags |= RTF_GATEWAY;
1903         }
1904
1905         if (tb[RTA_DST]) {
1906                 int plen = (rtm->rtm_dst_len + 7) >> 3;
1907
1908                 if (nla_len(tb[RTA_DST]) < plen)
1909                         goto errout;
1910
1911                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1912         }
1913
1914         if (tb[RTA_SRC]) {
1915                 int plen = (rtm->rtm_src_len + 7) >> 3;
1916
1917                 if (nla_len(tb[RTA_SRC]) < plen)
1918                         goto errout;
1919
1920                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1921         }
1922
1923         if (tb[RTA_OIF])
1924                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1925
1926         if (tb[RTA_PRIORITY])
1927                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
1928
1929         if (tb[RTA_METRICS]) {
1930                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
1931                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1932         }
1933
1934         if (tb[RTA_TABLE])
1935                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
1936
1937         err = 0;
1938 errout:
1939         return err;
1940 }
1941
1942 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1943 {
1944         struct fib6_config cfg;
1945         int err;
1946
1947         err = rtm_to_fib6_config(skb, nlh, &cfg);
1948         if (err < 0)
1949                 return err;
1950
1951         return ip6_route_del(&cfg);
1952 }
1953
1954 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1955 {
1956         struct fib6_config cfg;
1957         int err;
1958
1959         err = rtm_to_fib6_config(skb, nlh, &cfg);
1960         if (err < 0)
1961                 return err;
1962
1963         return ip6_route_add(&cfg);
1964 }
1965
1966 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1967                          struct in6_addr *dst, struct in6_addr *src,
1968                          int iif, int type, u32 pid, u32 seq,
1969                          int prefix, unsigned int flags)
1970 {
1971         struct rtmsg *rtm;
1972         struct nlmsghdr *nlh;
1973         struct rta_cacheinfo ci;
1974         u32 table;
1975
1976         if (prefix) {   /* user wants prefix routes only */
1977                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1978                         /* success since this is not a prefix route */
1979                         return 1;
1980                 }
1981         }
1982
1983         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
1984         if (nlh == NULL)
1985                 return -ENOBUFS;
1986
1987         rtm = nlmsg_data(nlh);
1988         rtm->rtm_family = AF_INET6;
1989         rtm->rtm_dst_len = rt->rt6i_dst.plen;
1990         rtm->rtm_src_len = rt->rt6i_src.plen;
1991         rtm->rtm_tos = 0;
1992         if (rt->rt6i_table)
1993                 table = rt->rt6i_table->tb6_id;
1994         else
1995                 table = RT6_TABLE_UNSPEC;
1996         rtm->rtm_table = table;
1997         NLA_PUT_U32(skb, RTA_TABLE, table);
1998         if (rt->rt6i_flags&RTF_REJECT)
1999                 rtm->rtm_type = RTN_UNREACHABLE;
2000         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2001                 rtm->rtm_type = RTN_LOCAL;
2002         else
2003                 rtm->rtm_type = RTN_UNICAST;
2004         rtm->rtm_flags = 0;
2005         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2006         rtm->rtm_protocol = rt->rt6i_protocol;
2007         if (rt->rt6i_flags&RTF_DYNAMIC)
2008                 rtm->rtm_protocol = RTPROT_REDIRECT;
2009         else if (rt->rt6i_flags & RTF_ADDRCONF)
2010                 rtm->rtm_protocol = RTPROT_KERNEL;
2011         else if (rt->rt6i_flags&RTF_DEFAULT)
2012                 rtm->rtm_protocol = RTPROT_RA;
2013
2014         if (rt->rt6i_flags&RTF_CACHE)
2015                 rtm->rtm_flags |= RTM_F_CLONED;
2016
2017         if (dst) {
2018                 NLA_PUT(skb, RTA_DST, 16, dst);
2019                 rtm->rtm_dst_len = 128;
2020         } else if (rtm->rtm_dst_len)
2021                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2022 #ifdef CONFIG_IPV6_SUBTREES
2023         if (src) {
2024                 NLA_PUT(skb, RTA_SRC, 16, src);
2025                 rtm->rtm_src_len = 128;
2026         } else if (rtm->rtm_src_len)
2027                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2028 #endif
2029         if (iif)
2030                 NLA_PUT_U32(skb, RTA_IIF, iif);
2031         else if (dst) {
2032                 struct in6_addr saddr_buf;
2033                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2034                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2035         }
2036
2037         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2038                 goto nla_put_failure;
2039
2040         if (rt->u.dst.neighbour)
2041                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2042
2043         if (rt->u.dst.dev)
2044                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2045
2046         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2047         ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2048         if (rt->rt6i_expires)
2049                 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
2050         else
2051                 ci.rta_expires = 0;
2052         ci.rta_used = rt->u.dst.__use;
2053         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2054         ci.rta_error = rt->u.dst.error;
2055         ci.rta_id = 0;
2056         ci.rta_ts = 0;
2057         ci.rta_tsage = 0;
2058         NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2059
2060         return nlmsg_end(skb, nlh);
2061
2062 nla_put_failure:
2063         return nlmsg_cancel(skb, nlh);
2064 }
2065
2066 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2067 {
2068         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2069         int prefix;
2070
2071         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2072                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2073                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2074         } else
2075                 prefix = 0;
2076
2077         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2078                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2079                      prefix, NLM_F_MULTI);
2080 }
2081
2082 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2083 {
2084         struct nlattr *tb[RTA_MAX+1];
2085         struct rt6_info *rt;
2086         struct sk_buff *skb;
2087         struct rtmsg *rtm;
2088         struct flowi fl;
2089         int err, iif = 0;
2090
2091         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2092         if (err < 0)
2093                 goto errout;
2094
2095         err = -EINVAL;
2096         memset(&fl, 0, sizeof(fl));
2097
2098         if (tb[RTA_SRC]) {
2099                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2100                         goto errout;
2101
2102                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2103         }
2104
2105         if (tb[RTA_DST]) {
2106                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2107                         goto errout;
2108
2109                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2110         }
2111
2112         if (tb[RTA_IIF])
2113                 iif = nla_get_u32(tb[RTA_IIF]);
2114
2115         if (tb[RTA_OIF])
2116                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2117
2118         if (iif) {
2119                 struct net_device *dev;
2120                 dev = __dev_get_by_index(iif);
2121                 if (!dev) {
2122                         err = -ENODEV;
2123                         goto errout;
2124                 }
2125         }
2126
2127         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2128         if (skb == NULL) {
2129                 err = -ENOBUFS;
2130                 goto errout;
2131         }
2132
2133         /* Reserve room for dummy headers, this skb can pass
2134            through good chunk of routing engine.
2135          */
2136         skb->mac.raw = skb->data;
2137         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2138
2139         rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2140         skb->dst = &rt->u.dst;
2141
2142         err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2143                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2144                             nlh->nlmsg_seq, 0, 0);
2145         if (err < 0) {
2146                 kfree_skb(skb);
2147                 goto errout;
2148         }
2149
2150         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2151 errout:
2152         return err;
2153 }
2154
2155 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2156 {
2157         struct sk_buff *skb;
2158         u32 pid = 0, seq = 0;
2159         struct nlmsghdr *nlh = NULL;
2160         int payload = sizeof(struct rtmsg) + 256;
2161         int err = -ENOBUFS;
2162
2163         if (info) {
2164                 pid = info->pid;
2165                 nlh = info->nlh;
2166                 if (nlh)
2167                         seq = nlh->nlmsg_seq;
2168         }
2169
2170         skb = nlmsg_new(nlmsg_total_size(payload), gfp_any());
2171         if (skb == NULL)
2172                 goto errout;
2173
2174         err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2175         if (err < 0) {
2176                 kfree_skb(skb);
2177                 goto errout;
2178         }
2179
2180         err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2181 errout:
2182         if (err < 0)
2183                 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2184 }
2185
2186 /*
2187  *      /proc
2188  */
2189
2190 #ifdef CONFIG_PROC_FS
2191
/*
 * Size in bytes (150) of one /proc/net/ipv6_route record as written by
 * rt6_info_route(); used there and in rt6_proc_info() to turn a read
 * offset into a number of records to skip.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2193
/* Cursor shared between rt6_proc_info() and its rt6_info_route() callback. */
struct rt6_proc_arg
{
	char *buffer;	/* output buffer supplied by the proc read */
	int offset;	/* requested read offset, in bytes */
	int length;	/* no new record is started once len reaches this */
	int skip;	/* records skipped so far to reach offset */
	int len;	/* bytes written to buffer so far */
};
2202
2203 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2204 {
2205         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2206         int i;
2207
2208         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2209                 arg->skip++;
2210                 return 0;
2211         }
2212
2213         if (arg->len >= arg->length)
2214                 return 0;
2215
2216         for (i=0; i<16; i++) {
2217                 sprintf(arg->buffer + arg->len, "%02x",
2218                         rt->rt6i_dst.addr.s6_addr[i]);
2219                 arg->len += 2;
2220         }
2221         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2222                             rt->rt6i_dst.plen);
2223
2224 #ifdef CONFIG_IPV6_SUBTREES
2225         for (i=0; i<16; i++) {
2226                 sprintf(arg->buffer + arg->len, "%02x",
2227                         rt->rt6i_src.addr.s6_addr[i]);
2228                 arg->len += 2;
2229         }
2230         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2231                             rt->rt6i_src.plen);
2232 #else
2233         sprintf(arg->buffer + arg->len,
2234                 "00000000000000000000000000000000 00 ");
2235         arg->len += 36;
2236 #endif
2237
2238         if (rt->rt6i_nexthop) {
2239                 for (i=0; i<16; i++) {
2240                         sprintf(arg->buffer + arg->len, "%02x",
2241                                 rt->rt6i_nexthop->primary_key[i]);
2242                         arg->len += 2;
2243                 }
2244         } else {
2245                 sprintf(arg->buffer + arg->len,
2246                         "00000000000000000000000000000000");
2247                 arg->len += 32;
2248         }
2249         arg->len += sprintf(arg->buffer + arg->len,
2250                             " %08x %08x %08x %08x %8s\n",
2251                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2252                             rt->u.dst.__use, rt->rt6i_flags, 
2253                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2254         return 0;
2255 }
2256
2257 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2258 {
2259         struct rt6_proc_arg arg = {
2260                 .buffer = buffer,
2261                 .offset = offset,
2262                 .length = length,
2263         };
2264
2265         fib6_clean_all(rt6_info_route, 0, &arg);
2266
2267         *start = buffer;
2268         if (offset)
2269                 *start += offset % RT6_INFO_LEN;
2270
2271         arg.len -= offset % RT6_INFO_LEN;
2272
2273         if (arg.len > length)
2274                 arg.len = length;
2275         if (arg.len < 0)
2276                 arg.len = 0;
2277
2278         return arg.len;
2279 }
2280
2281 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2282 {
2283         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2284                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2285                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2286                       rt6_stats.fib_rt_cache,
2287                       atomic_read(&ip6_dst_ops.entries),
2288                       rt6_stats.fib_discarded_routes);
2289
2290         return 0;
2291 }
2292
2293 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2294 {
2295         return single_open(file, rt6_stats_seq_show, NULL);
2296 }
2297
2298 static struct file_operations rt6_stats_seq_fops = {
2299         .owner   = THIS_MODULE,
2300         .open    = rt6_stats_seq_open,
2301         .read    = seq_read,
2302         .llseek  = seq_lseek,
2303         .release = single_release,
2304 };
2305 #endif  /* CONFIG_PROC_FS */
2306
2307 #ifdef CONFIG_SYSCTL
2308
2309 static int flush_delay;
2310
2311 static
2312 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2313                               void __user *buffer, size_t *lenp, loff_t *ppos)
2314 {
2315         if (write) {
2316                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2317                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2318                 return 0;
2319         } else
2320                 return -EINVAL;
2321 }
2322
2323 ctl_table ipv6_route_table[] = {
2324         {
2325                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2326                 .procname       =       "flush",
2327                 .data           =       &flush_delay,
2328                 .maxlen         =       sizeof(int),
2329                 .mode           =       0200,
2330                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2331         },
2332         {
2333                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2334                 .procname       =       "gc_thresh",
2335                 .data           =       &ip6_dst_ops.gc_thresh,
2336                 .maxlen         =       sizeof(int),
2337                 .mode           =       0644,
2338                 .proc_handler   =       &proc_dointvec,
2339         },
2340         {
2341                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2342                 .procname       =       "max_size",
2343                 .data           =       &ip6_rt_max_size,
2344                 .maxlen         =       sizeof(int),
2345                 .mode           =       0644,
2346                 .proc_handler   =       &proc_dointvec,
2347         },
2348         {
2349                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2350                 .procname       =       "gc_min_interval",
2351                 .data           =       &ip6_rt_gc_min_interval,
2352                 .maxlen         =       sizeof(int),
2353                 .mode           =       0644,
2354                 .proc_handler   =       &proc_dointvec_jiffies,
2355                 .strategy       =       &sysctl_jiffies,
2356         },
2357         {
2358                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2359                 .procname       =       "gc_timeout",
2360                 .data           =       &ip6_rt_gc_timeout,
2361                 .maxlen         =       sizeof(int),
2362                 .mode           =       0644,
2363                 .proc_handler   =       &proc_dointvec_jiffies,
2364                 .strategy       =       &sysctl_jiffies,
2365         },
2366         {
2367                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2368                 .procname       =       "gc_interval",
2369                 .data           =       &ip6_rt_gc_interval,
2370                 .maxlen         =       sizeof(int),
2371                 .mode           =       0644,
2372                 .proc_handler   =       &proc_dointvec_jiffies,
2373                 .strategy       =       &sysctl_jiffies,
2374         },
2375         {
2376                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2377                 .procname       =       "gc_elasticity",
2378                 .data           =       &ip6_rt_gc_elasticity,
2379                 .maxlen         =       sizeof(int),
2380                 .mode           =       0644,
2381                 .proc_handler   =       &proc_dointvec_jiffies,
2382                 .strategy       =       &sysctl_jiffies,
2383         },
2384         {
2385                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2386                 .procname       =       "mtu_expires",
2387                 .data           =       &ip6_rt_mtu_expires,
2388                 .maxlen         =       sizeof(int),
2389                 .mode           =       0644,
2390                 .proc_handler   =       &proc_dointvec_jiffies,
2391                 .strategy       =       &sysctl_jiffies,
2392         },
2393         {
2394                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2395                 .procname       =       "min_adv_mss",
2396                 .data           =       &ip6_rt_min_advmss,
2397                 .maxlen         =       sizeof(int),
2398                 .mode           =       0644,
2399                 .proc_handler   =       &proc_dointvec_jiffies,
2400                 .strategy       =       &sysctl_jiffies,
2401         },
2402         {
2403                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2404                 .procname       =       "gc_min_interval_ms",
2405                 .data           =       &ip6_rt_gc_min_interval,
2406                 .maxlen         =       sizeof(int),
2407                 .mode           =       0644,
2408                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2409                 .strategy       =       &sysctl_ms_jiffies,
2410         },
2411         { .ctl_name = 0 }
2412 };
2413
2414 #endif
2415
2416 void __init ip6_route_init(void)
2417 {
2418         struct proc_dir_entry *p;
2419
2420         ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2421                                                      sizeof(struct rt6_info),
2422                                                      0, SLAB_HWCACHE_ALIGN,
2423                                                      NULL, NULL);
2424         if (!ip6_dst_ops.kmem_cachep)
2425                 panic("cannot create ip6_dst_cache");
2426
2427         fib6_init();
2428 #ifdef  CONFIG_PROC_FS
2429         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2430         if (p)
2431                 p->owner = THIS_MODULE;
2432
2433         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2434 #endif
2435 #ifdef CONFIG_XFRM
2436         xfrm6_init();
2437 #endif
2438 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2439         fib6_rules_init();
2440 #endif
2441 }
2442
2443 void ip6_route_cleanup(void)
2444 {
2445 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2446         fib6_rules_cleanup();
2447 #endif
2448 #ifdef CONFIG_PROC_FS
2449         proc_net_remove("ipv6_route");
2450         proc_net_remove("rt6_stats");
2451 #endif
2452 #ifdef CONFIG_XFRM
2453         xfrm6_fini();
2454 #endif
2455         rt6_ifdown(NULL);
2456         fib6_gc_cleanup();
2457         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2458 }