net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/workqueue.h>
  84 #include <linux/skbuff.h>
  85 #include <linux/inetdevice.h>
  86 #include <linux/igmp.h>
  87 #include <linux/pkt_sched.h>
  88 #include <linux/mroute.h>
  89 #include <linux/netfilter_ipv4.h>
  90 #include <linux/random.h>
  91 #include <linux/jhash.h>
  92 #include <linux/rcupdate.h>
  93 #include <linux/times.h>
  94 #include <linux/slab.h>
  95 #include <linux/prefetch.h>
  96 #include <net/dst.h>
  97 #include <net/net_namespace.h>
  98 #include <net/protocol.h>
  99 #include <net/ip.h>
 100 #include <net/route.h>
 101 #include <net/inetpeer.h>
 102 #include <net/sock.h>
 103 #include <net/ip_fib.h>
 104 #include <net/arp.h>
 105 #include <net/tcp.h>
 106 #include <net/icmp.h>
 107 #include <net/xfrm.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115
 116 #define RT_FL_TOS(oldflp4) \
 117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 118
 119 #define IP_MAX_MTU      0xFFF0
 120
 121 #define RT_GC_TIMEOUT (300*HZ)
 122
 123 static int ip_rt_max_size;
 124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 127 static int ip_rt_redirect_number __read_mostly  = 9;
 128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 130 static int ip_rt_error_cost __read_mostly       = HZ;
 131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 132 static int ip_rt_gc_elasticity __read_mostly    = 8;
 133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 135 static int ip_rt_min_advmss __read_mostly       = 256;
 136 static int rt_chain_length_max __read_mostly    = 20;
 137
 138 static struct delayed_work expires_work;
 139 static unsigned long expires_ljiffies;
 140
 141 /*
 142  *      Interface to generic destination cache.
 143  */
 144
 145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 148 static void              ipv4_dst_destroy(struct dst_entry *dst);
 149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 150 static void              ipv4_link_failure(struct sk_buff *skb);
 151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 152 static int rt_garbage_collect(struct dst_ops *ops);
 153
 154 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 155                             int how)
 156 {
 157 }
 158
 159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 160 {
 161         struct rtable *rt = (struct rtable *) dst;
 162         struct inet_peer *peer;
 163         u32 *p = NULL;
 164
 165         peer = rt_get_peer_create(rt, rt->rt_dst);
 166         if (peer) {
 167                 u32 *old_p = __DST_METRICS_PTR(old);
 168                 unsigned long prev, new;
 169
 170                 p = peer->metrics;
 171                 if (inet_metrics_new(peer))
 172                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 173
 174                 new = (unsigned long) p;
 175                 prev = cmpxchg(&dst->_metrics, old, new);
 176
 177                 if (prev != old) {
 178                         p = __DST_METRICS_PTR(prev);
 179                         if (prev & DST_METRICS_READ_ONLY)
 180                                 p = NULL;
 181                 } else {
 182                         if (rt->fi) {
 183                                 fib_info_put(rt->fi);
 184                                 rt->fi = NULL;
 185                         }
 186                 }
 187         }
 188         return p;
 189 }
 190
 191 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 192
 193 static struct dst_ops ipv4_dst_ops = {
 194         .family =               AF_INET,
 195         .protocol =             cpu_to_be16(ETH_P_IP),
 196         .gc =                   rt_garbage_collect,
 197         .check =                ipv4_dst_check,
 198         .default_advmss =       ipv4_default_advmss,
 199         .mtu =                  ipv4_mtu,
 200         .cow_metrics =          ipv4_cow_metrics,
 201         .destroy =              ipv4_dst_destroy,
 202         .ifdown =               ipv4_dst_ifdown,
 203         .negative_advice =      ipv4_negative_advice,
 204         .link_failure =         ipv4_link_failure,
 205         .update_pmtu =          ip_rt_update_pmtu,
 206         .local_out =            __ip_local_out,
 207         .neigh_lookup =         ipv4_neigh_lookup,
 208 };
 209
 210 #define ECN_OR_COST(class)      TC_PRIO_##class
 211
 212 const __u8 ip_tos2prio[16] = {
 213         TC_PRIO_BESTEFFORT,
 214         ECN_OR_COST(BESTEFFORT),
 215         TC_PRIO_BESTEFFORT,
 216         ECN_OR_COST(BESTEFFORT),
 217         TC_PRIO_BULK,
 218         ECN_OR_COST(BULK),
 219         TC_PRIO_BULK,
 220         ECN_OR_COST(BULK),
 221         TC_PRIO_INTERACTIVE,
 222         ECN_OR_COST(INTERACTIVE),
 223         TC_PRIO_INTERACTIVE,
 224         ECN_OR_COST(INTERACTIVE),
 225         TC_PRIO_INTERACTIVE_BULK,
 226         ECN_OR_COST(INTERACTIVE_BULK),
 227         TC_PRIO_INTERACTIVE_BULK,
 228         ECN_OR_COST(INTERACTIVE_BULK)
 229 };
 230 EXPORT_SYMBOL(ip_tos2prio);
 231
 232 /*
 233  * Route cache.
 234  */
 235
 236 /* The locking scheme is rather straight forward:
 237  *
 238  * 1) Read-Copy Update protects the buckets of the central route hash.
 239  * 2) Only writers remove entries, and they hold the lock
 240  *    as they look at rtable reference counts.
 241  * 3) Only readers acquire references to rtable entries,
 242  *    they do so with atomic increments and with the
 243  *    lock held.
 244  */
 245
 246 struct rt_hash_bucket {
 247         struct rtable __rcu     *chain;
 248 };
 249
 250 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 251         defined(CONFIG_PROVE_LOCKING)
 252 /*
 253  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 254  * The size of this table is a power of two and depends on the number of CPUS.
 255  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 256  */
 257 #ifdef CONFIG_LOCKDEP
 258 # define RT_HASH_LOCK_SZ        256
 259 #else
 260 # if NR_CPUS >= 32
 261 #  define RT_HASH_LOCK_SZ       4096
 262 # elif NR_CPUS >= 16
 263 #  define RT_HASH_LOCK_SZ       2048
 264 # elif NR_CPUS >= 8
 265 #  define RT_HASH_LOCK_SZ       1024
 266 # elif NR_CPUS >= 4
 267 #  define RT_HASH_LOCK_SZ       512
 268 # else
 269 #  define RT_HASH_LOCK_SZ       256
 270 # endif
 271 #endif
 272
 273 static spinlock_t       *rt_hash_locks;
 274 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 275
 276 static __init void rt_hash_lock_init(void)
 277 {
 278         int i;
 279
 280         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 281                         GFP_KERNEL);
 282         if (!rt_hash_locks)
 283                 panic("IP: failed to allocate rt_hash_locks\n");
 284
 285         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 286                 spin_lock_init(&rt_hash_locks[i]);
 287 }
 288 #else
 289 # define rt_hash_lock_addr(slot) NULL
 290
 291 static inline void rt_hash_lock_init(void)
 292 {
 293 }
 294 #endif
 295
 296 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 297 static unsigned int             rt_hash_mask __read_mostly;
 298 static unsigned int             rt_hash_log  __read_mostly;
 299
 300 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 301 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 302
 303 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 304                                    int genid)
 305 {
 306         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 307                             idx, genid)
 308                 & rt_hash_mask;
 309 }
 310
 311 static inline int rt_genid(struct net *net)
 312 {
 313         return atomic_read(&net->ipv4.rt_genid);
 314 }
 315
 316 #ifdef CONFIG_PROC_FS
 317 struct rt_cache_iter_state {
 318         struct seq_net_private p;
 319         int bucket;
 320         int genid;
 321 };
 322
 323 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 324 {
 325         struct rt_cache_iter_state *st = seq->private;
 326         struct rtable *r = NULL;
 327
 328         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 329                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 330                         continue;
 331                 rcu_read_lock_bh();
 332                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 333                 while (r) {
 334                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 335                             r->rt_genid == st->genid)
 336                                 return r;
 337                         r = rcu_dereference_bh(r->dst.rt_next);
 338                 }
 339                 rcu_read_unlock_bh();
 340         }
 341         return r;
 342 }
 343
 344 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 345                                           struct rtable *r)
 346 {
 347         struct rt_cache_iter_state *st = seq->private;
 348
 349         r = rcu_dereference_bh(r->dst.rt_next);
 350         while (!r) {
 351                 rcu_read_unlock_bh();
 352                 do {
 353                         if (--st->bucket < 0)
 354                                 return NULL;
 355                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 356                 rcu_read_lock_bh();
 357                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 358         }
 359         return r;
 360 }
 361
 362 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 363                                         struct rtable *r)
 364 {
 365         struct rt_cache_iter_state *st = seq->private;
 366         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 367                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 368                         continue;
 369                 if (r->rt_genid == st->genid)
 370                         break;
 371         }
 372         return r;
 373 }
 374
 375 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 376 {
 377         struct rtable *r = rt_cache_get_first(seq);
 378
 379         if (r)
 380                 while (pos && (r = rt_cache_get_next(seq, r)))
 381                         --pos;
 382         return pos ? NULL : r;
 383 }
 384
 385 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 386 {
 387         struct rt_cache_iter_state *st = seq->private;
 388         if (*pos)
 389                 return rt_cache_get_idx(seq, *pos - 1);
 390         st->genid = rt_genid(seq_file_net(seq));
 391         return SEQ_START_TOKEN;
 392 }
 393
 394 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 395 {
 396         struct rtable *r;
 397
 398         if (v == SEQ_START_TOKEN)
 399                 r = rt_cache_get_first(seq);
 400         else
 401                 r = rt_cache_get_next(seq, v);
 402         ++*pos;
 403         return r;
 404 }
 405
 406 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 407 {
 408         if (v && v != SEQ_START_TOKEN)
 409                 rcu_read_unlock_bh();
 410 }
 411
 412 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 413 {
 414         if (v == SEQ_START_TOKEN)
 415                 seq_printf(seq, "%-127s\n",
 416                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 417                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 418                            "HHUptod\tSpecDst");
 419         else {
 420                 struct rtable *r = v;
 421                 struct neighbour *n;
 422                 int len, HHUptod;
 423
 424                 rcu_read_lock();
 425                 n = dst_get_neighbour_noref(&r->dst);
 426                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
 427                 rcu_read_unlock();
 428
 429                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 430                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 431                         r->dst.dev ? r->dst.dev->name : "*",
 432                         (__force u32)r->rt_dst,
 433                         (__force u32)r->rt_gateway,
 434                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 435                         r->dst.__use, 0, (__force u32)r->rt_src,
 436                         dst_metric_advmss(&r->dst) + 40,
 437                         dst_metric(&r->dst, RTAX_WINDOW),
 438                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 439                               dst_metric(&r->dst, RTAX_RTTVAR)),
 440                         r->rt_key_tos,
 441                         -1,
 442                         HHUptod,
 443                         r->rt_spec_dst, &len);
 444
 445                 seq_printf(seq, "%*s\n", 127 - len, "");
 446         }
 447         return 0;
 448 }
 449
 450 static const struct seq_operations rt_cache_seq_ops = {
 451         .start  = rt_cache_seq_start,
 452         .next   = rt_cache_seq_next,
 453         .stop   = rt_cache_seq_stop,
 454         .show   = rt_cache_seq_show,
 455 };
 456
 457 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 458 {
 459         return seq_open_net(inode, file, &rt_cache_seq_ops,
 460                         sizeof(struct rt_cache_iter_state));
 461 }
 462
 463 static const struct file_operations rt_cache_seq_fops = {
 464         .owner   = THIS_MODULE,
 465         .open    = rt_cache_seq_open,
 466         .read    = seq_read,
 467         .llseek  = seq_lseek,
 468         .release = seq_release_net,
 469 };
 470
 471
 472 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 473 {
 474         int cpu;
 475
 476         if (*pos == 0)
 477                 return SEQ_START_TOKEN;
 478
 479         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 480                 if (!cpu_possible(cpu))
 481                         continue;
 482                 *pos = cpu+1;
 483                 return &per_cpu(rt_cache_stat, cpu);
 484         }
 485         return NULL;
 486 }
 487
 488 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 489 {
 490         int cpu;
 491
 492         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 493                 if (!cpu_possible(cpu))
 494                         continue;
 495                 *pos = cpu+1;
 496                 return &per_cpu(rt_cache_stat, cpu);
 497         }
 498         return NULL;
 499
 500 }
 501
 502 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 503 {
 504
 505 }
 506
 507 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 508 {
 509         struct rt_cache_stat *st = v;
 510
 511         if (v == SEQ_START_TOKEN) {
 512                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 513                 return 0;
 514         }
 515
 516         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 517                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 518                    dst_entries_get_slow(&ipv4_dst_ops),
 519                    st->in_hit,
 520                    st->in_slow_tot,
 521                    st->in_slow_mc,
 522                    st->in_no_route,
 523                    st->in_brd,
 524                    st->in_martian_dst,
 525                    st->in_martian_src,
 526
 527                    st->out_hit,
 528                    st->out_slow_tot,
 529                    st->out_slow_mc,
 530
 531                    st->gc_total,
 532                    st->gc_ignored,
 533                    st->gc_goal_miss,
 534                    st->gc_dst_overflow,
 535                    st->in_hlist_search,
 536                    st->out_hlist_search
 537                 );
 538         return 0;
 539 }
 540
 541 static const struct seq_operations rt_cpu_seq_ops = {
 542         .start  = rt_cpu_seq_start,
 543         .next   = rt_cpu_seq_next,
 544         .stop   = rt_cpu_seq_stop,
 545         .show   = rt_cpu_seq_show,
 546 };
 547
 548
 549 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 550 {
 551         return seq_open(file, &rt_cpu_seq_ops);
 552 }
 553
 554 static const struct file_operations rt_cpu_seq_fops = {
 555         .owner   = THIS_MODULE,
 556         .open    = rt_cpu_seq_open,
 557         .read    = seq_read,
 558         .llseek  = seq_lseek,
 559         .release = seq_release,
 560 };
 561
 562 #ifdef CONFIG_IP_ROUTE_CLASSID
 563 static int rt_acct_proc_show(struct seq_file *m, void *v)
 564 {
 565         struct ip_rt_acct *dst, *src;
 566         unsigned int i, j;
 567
 568         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 569         if (!dst)
 570                 return -ENOMEM;
 571
 572         for_each_possible_cpu(i) {
 573                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 574                 for (j = 0; j < 256; j++) {
 575                         dst[j].o_bytes   += src[j].o_bytes;
 576                         dst[j].o_packets += src[j].o_packets;
 577                         dst[j].i_bytes   += src[j].i_bytes;
 578                         dst[j].i_packets += src[j].i_packets;
 579                 }
 580         }
 581
 582         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 583         kfree(dst);
 584         return 0;
 585 }
 586
 587 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 588 {
 589         return single_open(file, rt_acct_proc_show, NULL);
 590 }
 591
 592 static const struct file_operations rt_acct_proc_fops = {
 593         .owner          = THIS_MODULE,
 594         .open           = rt_acct_proc_open,
 595         .read           = seq_read,
 596         .llseek         = seq_lseek,
 597         .release        = single_release,
 598 };
 599 #endif
 600
 601 static int __net_init ip_rt_do_proc_init(struct net *net)
 602 {
 603         struct proc_dir_entry *pde;
 604
 605         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 606                         &rt_cache_seq_fops);
 607         if (!pde)
 608                 goto err1;
 609
 610         pde = proc_create("rt_cache", S_IRUGO,
 611                           net->proc_net_stat, &rt_cpu_seq_fops);
 612         if (!pde)
 613                 goto err2;
 614
 615 #ifdef CONFIG_IP_ROUTE_CLASSID
 616         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 617         if (!pde)
 618                 goto err3;
 619 #endif
 620         return 0;
 621
 622 #ifdef CONFIG_IP_ROUTE_CLASSID
 623 err3:
 624         remove_proc_entry("rt_cache", net->proc_net_stat);
 625 #endif
 626 err2:
 627         remove_proc_entry("rt_cache", net->proc_net);
 628 err1:
 629         return -ENOMEM;
 630 }
 631
 632 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 633 {
 634         remove_proc_entry("rt_cache", net->proc_net_stat);
 635         remove_proc_entry("rt_cache", net->proc_net);
 636 #ifdef CONFIG_IP_ROUTE_CLASSID
 637         remove_proc_entry("rt_acct", net->proc_net);
 638 #endif
 639 }
 640
 641 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 642         .init = ip_rt_do_proc_init,
 643         .exit = ip_rt_do_proc_exit,
 644 };
 645
 646 static int __init ip_rt_proc_init(void)
 647 {
 648         return register_pernet_subsys(&ip_rt_proc_ops);
 649 }
 650
 651 #else
 652 static inline int ip_rt_proc_init(void)
 653 {
 654         return 0;
 655 }
 656 #endif /* CONFIG_PROC_FS */
 657
 658 static inline void rt_free(struct rtable *rt)
 659 {
 660         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 661 }
 662
 663 static inline void rt_drop(struct rtable *rt)
 664 {
 665         ip_rt_put(rt);
 666         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 667 }
 668
 669 static inline int rt_fast_clean(struct rtable *rth)
 670 {
 671         /* Kill broadcast/multicast entries very aggresively, if they
 672            collide in hash table with more useful entries */
 673         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 674                 rt_is_input_route(rth) && rth->dst.rt_next;
 675 }
 676
 677 static inline int rt_valuable(struct rtable *rth)
 678 {
 679         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 680                 (rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
 681 }
 682
 683 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 684 {
 685         unsigned long age;
 686         int ret = 0;
 687
 688         if (atomic_read(&rth->dst.__refcnt))
 689                 goto out;
 690
 691         age = jiffies - rth->dst.lastuse;
 692         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 693             (age <= tmo2 && rt_valuable(rth)))
 694                 goto out;
 695         ret = 1;
 696 out:    return ret;
 697 }
 698
 699 /* Bits of score are:
 700  * 31: very valuable
 701  * 30: not quite useless
 702  * 29..0: usage counter
 703  */
 704 static inline u32 rt_score(struct rtable *rt)
 705 {
 706         u32 score = jiffies - rt->dst.lastuse;
 707
 708         score = ~score & ~(3<<30);
 709
 710         if (rt_valuable(rt))
 711                 score |= (1<<31);
 712
 713         if (rt_is_output_route(rt) ||
 714             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 715                 score |= (1<<30);
 716
 717         return score;
 718 }
 719
 720 static inline bool rt_caching(const struct net *net)
 721 {
 722         return net->ipv4.current_rt_cache_rebuild_count <=
 723                 net->ipv4.sysctl_rt_cache_rebuild_count;
 724 }
 725
 726 static inline bool compare_hash_inputs(const struct rtable *rt1,
 727                                        const struct rtable *rt2)
 728 {
 729         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 730                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 731                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 732 }
 733
 734 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 735 {
 736         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 737                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 738                 (rt1->rt_mark ^ rt2->rt_mark) |
 739                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 740                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 741                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 742 }
 743
 744 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 745 {
 746         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 747 }
 748
 749 static inline int rt_is_expired(struct rtable *rth)
 750 {
 751         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 752 }
 753
 754 /*
 755  * Perform a full scan of hash table and free all entries.
 756  * Can be called by a softirq or a process.
 757  * In the later case, we want to be reschedule if necessary
 758  */
 759 static void rt_do_flush(struct net *net, int process_context)
 760 {
 761         unsigned int i;
 762         struct rtable *rth, *next;
 763
 764         for (i = 0; i <= rt_hash_mask; i++) {
 765                 struct rtable __rcu **pprev;
 766                 struct rtable *list;
 767
 768                 if (process_context && need_resched())
 769                         cond_resched();
 770                 rth = rcu_access_pointer(rt_hash_table[i].chain);
 771                 if (!rth)
 772                         continue;
 773
 774                 spin_lock_bh(rt_hash_lock_addr(i));
 775
 776                 list = NULL;
 777                 pprev = &rt_hash_table[i].chain;
 778                 rth = rcu_dereference_protected(*pprev,
 779                         lockdep_is_held(rt_hash_lock_addr(i)));
 780
 781                 while (rth) {
 782                         next = rcu_dereference_protected(rth->dst.rt_next,
 783                                 lockdep_is_held(rt_hash_lock_addr(i)));
 784
 785                         if (!net ||
 786                             net_eq(dev_net(rth->dst.dev), net)) {
 787                                 rcu_assign_pointer(*pprev, next);
 788                                 rcu_assign_pointer(rth->dst.rt_next, list);
 789                                 list = rth;
 790                         } else {
 791                                 pprev = &rth->dst.rt_next;
 792                         }
 793                         rth = next;
 794                 }
 795
 796                 spin_unlock_bh(rt_hash_lock_addr(i));
 797
 798                 for (; list; list = next) {
 799                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 800                         rt_free(list);
 801                 }
 802         }
 803 }
 804
 805 /*
 806  * While freeing expired entries, we compute average chain length
 807  * and standard deviation, using fixed-point arithmetic.
 808  * This to have an estimation of rt_chain_length_max
 809  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 810  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 811  */
 812
 813 #define FRACT_BITS 3
 814 #define ONE (1UL << FRACT_BITS)
 815
 816 /*
 817  * Given a hash chain and an item in this hash chain,
 818  * find if a previous entry has the same hash_inputs
 819  * (but differs on tos, mark or oif)
 820  * Returns 0 if an alias is found.
 821  * Returns ONE if rth has no alias before itself.
 822  */
 823 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 824 {
 825         const struct rtable *aux = head;
 826
 827         while (aux != rth) {
 828                 if (compare_hash_inputs(aux, rth))
 829                         return 0;
 830                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 831         }
 832         return ONE;
 833 }
 834
 835 static void rt_check_expire(void)
 836 {
 837         static unsigned int rover;
 838         unsigned int i = rover, goal;
 839         struct rtable *rth;
 840         struct rtable __rcu **rthp;
 841         unsigned long samples = 0;
 842         unsigned long sum = 0, sum2 = 0;
 843         unsigned long delta;
 844         u64 mult;
 845
 846         delta = jiffies - expires_ljiffies;
 847         expires_ljiffies = jiffies;
 848         mult = ((u64)delta) << rt_hash_log;
 849         if (ip_rt_gc_timeout > 1)
 850                 do_div(mult, ip_rt_gc_timeout);
 851         goal = (unsigned int)mult;
 852         if (goal > rt_hash_mask)
 853                 goal = rt_hash_mask + 1;
 854         for (; goal > 0; goal--) {
 855                 unsigned long tmo = ip_rt_gc_timeout;
 856                 unsigned long length;
 857
 858                 i = (i + 1) & rt_hash_mask;
 859                 rthp = &rt_hash_table[i].chain;
 860
 861                 if (need_resched())
 862                         cond_resched();
 863
 864                 samples++;
 865
 866                 if (rcu_dereference_raw(*rthp) == NULL)
 867                         continue;
 868                 length = 0;
 869                 spin_lock_bh(rt_hash_lock_addr(i));
 870                 while ((rth = rcu_dereference_protected(*rthp,
 871                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 872                         prefetch(rth->dst.rt_next);
 873                         if (rt_is_expired(rth)) {
 874                                 *rthp = rth->dst.rt_next;
 875                                 rt_free(rth);
 876                                 continue;
 877                         }
 878                         if (rth->dst.expires) {
 879                                 /* Entry is expired even if it is in use */
 880                                 if (time_before_eq(jiffies, rth->dst.expires)) {
 881 nofree:
 882                                         tmo >>= 1;
 883                                         rthp = &rth->dst.rt_next;
 884                                         /*
 885                                          * We only count entries on
 886                                          * a chain with equal hash inputs once
 887                                          * so that entries for different QOS
 888                                          * levels, and other non-hash input
 889                                          * attributes don't unfairly skew
 890                                          * the length computation
 891                                          */
 892                                         length += has_noalias(rt_hash_table[i].chain, rth);
 893                                         continue;
 894                                 }
 895                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 896                                 goto nofree;
 897
 898                         /* Cleanup aged off entries. */
 899                         *rthp = rth->dst.rt_next;
 900                         rt_free(rth);
 901                 }
 902                 spin_unlock_bh(rt_hash_lock_addr(i));
 903                 sum += length;
 904                 sum2 += length*length;
 905         }
 906         if (samples) {
 907                 unsigned long avg = sum / samples;
 908                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 909                 rt_chain_length_max = max_t(unsigned long,
 910                                         ip_rt_gc_elasticity,
 911                                         (avg + 4*sd) >> FRACT_BITS);
 912         }
 913         rover = i;
 914 }
 915
 916 /*
 917  * rt_worker_func() is run in process context.
 918  * we call rt_check_expire() to scan part of the hash table
 919  */
 920 static void rt_worker_func(struct work_struct *work)
 921 {
 922         rt_check_expire();
 923         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 924 }
 925
 926 /*
 927  * Perturbation of rt_genid by a small quantity [1..256]
 928  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 929  * many times (2^24) without giving recent rt_genid.
 930  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 931  */
 932 static void rt_cache_invalidate(struct net *net)
 933 {
 934         unsigned char shuffle;
 935
 936         get_random_bytes(&shuffle, sizeof(shuffle));
 937         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 938         inetpeer_invalidate_tree(net->ipv4.peers);
 939 }
 940
 941 /*
 942  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 943  * delay >= 0 : invalidate & flush cache (can be long)
 944  */
 945 void rt_cache_flush(struct net *net, int delay)
 946 {
 947         rt_cache_invalidate(net);
 948         if (delay >= 0)
 949                 rt_do_flush(net, !in_softirq());
 950 }
 951
 952 /* Flush previous cache invalidated entries from the cache */
 953 void rt_cache_flush_batch(struct net *net)
 954 {
 955         rt_do_flush(net, !in_softirq());
 956 }
 957
 958 static void rt_emergency_hash_rebuild(struct net *net)
 959 {
 960         net_warn_ratelimited("Route hash chain too long!\n");
 961         rt_cache_invalidate(net);
 962 }
 963
 964 /*
 965    Short description of GC goals.
 966
 967    We want to build algorithm, which will keep routing cache
 968    at some equilibrium point, when number of aged off entries
 969    is kept approximately equal to newly generated ones.
 970
 971    Current expiration strength is variable "expire".
 972    We try to adjust it dynamically, so that if networking
 973    is idle expires is large enough to keep enough of warm entries,
 974    and when load increases it reduces to limit cache size.
 975  */
 976
 977 static int rt_garbage_collect(struct dst_ops *ops)
 978 {
 979         static unsigned long expire = RT_GC_TIMEOUT;
 980         static unsigned long last_gc;
 981         static int rover;
 982         static int equilibrium;
 983         struct rtable *rth;
 984         struct rtable __rcu **rthp;
 985         unsigned long now = jiffies;
 986         int goal;
 987         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 988
 989         /*
 990          * Garbage collection is pretty expensive,
 991          * do not make it too frequently.
 992          */
 993
 994         RT_CACHE_STAT_INC(gc_total);
 995
 996         if (now - last_gc < ip_rt_gc_min_interval &&
 997             entries < ip_rt_max_size) {
 998                 RT_CACHE_STAT_INC(gc_ignored);
 999                 goto out;
1000         }
1001
1002         entries = dst_entries_get_slow(&ipv4_dst_ops);
1003         /* Calculate number of entries, which we want to expire now. */
1004         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1005         if (goal <= 0) {
1006                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1007                         equilibrium = ipv4_dst_ops.gc_thresh;
1008                 goal = entries - equilibrium;
1009                 if (goal > 0) {
1010                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1011                         goal = entries - equilibrium;
1012                 }
1013         } else {
1014                 /* We are in dangerous area. Try to reduce cache really
1015                  * aggressively.
1016                  */
1017                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1018                 equilibrium = entries - goal;
1019         }
1020
1021         if (now - last_gc >= ip_rt_gc_min_interval)
1022                 last_gc = now;
1023
1024         if (goal <= 0) {
1025                 equilibrium += goal;
1026                 goto work_done;
1027         }
1028
1029         do {
1030                 int i, k;
1031
1032                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1033                         unsigned long tmo = expire;
1034
1035                         k = (k + 1) & rt_hash_mask;
1036                         rthp = &rt_hash_table[k].chain;
1037                         spin_lock_bh(rt_hash_lock_addr(k));
1038                         while ((rth = rcu_dereference_protected(*rthp,
1039                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1040                                 if (!rt_is_expired(rth) &&
1041                                         !rt_may_expire(rth, tmo, expire)) {
1042                                         tmo >>= 1;
1043                                         rthp = &rth->dst.rt_next;
1044                                         continue;
1045                                 }
1046                                 *rthp = rth->dst.rt_next;
1047                                 rt_free(rth);
1048                                 goal--;
1049                         }
1050                         spin_unlock_bh(rt_hash_lock_addr(k));
1051                         if (goal <= 0)
1052                                 break;
1053                 }
1054                 rover = k;
1055
1056                 if (goal <= 0)
1057                         goto work_done;
1058
1059                 /* Goal is not achieved. We stop process if:
1060
1061                    - if expire reduced to zero. Otherwise, expire is halfed.
1062                    - if table is not full.
1063                    - if we are called from interrupt.
1064                    - jiffies check is just fallback/debug loop breaker.
1065                      We will not spin here for long time in any case.
1066                  */
1067
1068                 RT_CACHE_STAT_INC(gc_goal_miss);
1069
1070                 if (expire == 0)
1071                         break;
1072
1073                 expire >>= 1;
1074
1075                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1076                         goto out;
1077         } while (!in_softirq() && time_before_eq(jiffies, now));
1078
1079         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1080                 goto out;
1081         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1082                 goto out;
1083         net_warn_ratelimited("dst cache overflow\n");
1084         RT_CACHE_STAT_INC(gc_dst_overflow);
1085         return 1;
1086
1087 work_done:
1088         expire += ip_rt_gc_min_interval;
1089         if (expire > ip_rt_gc_timeout ||
1090             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1091             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1092                 expire = ip_rt_gc_timeout;
1093 out:    return 0;
1094 }
1095
1096 /*
1097  * Returns number of entries in a hash chain that have different hash_inputs
1098  */
1099 static int slow_chain_length(const struct rtable *head)
1100 {
1101         int length = 0;
1102         const struct rtable *rth = head;
1103
1104         while (rth) {
1105                 length += has_noalias(head, rth);
1106                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1107         }
1108         return length >> FRACT_BITS;
1109 }
1110
1111 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1112 {
1113         static const __be32 inaddr_any = 0;
1114         struct net_device *dev = dst->dev;
1115         const __be32 *pkey = daddr;
1116         const struct rtable *rt;
1117         struct neighbour *n;
1118
1119         rt = (const struct rtable *) dst;
1120
1121         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1122                 pkey = &inaddr_any;
1123         else if (rt->rt_gateway)
1124                 pkey = (const __be32 *) &rt->rt_gateway;
1125
1126         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1127         if (n)
1128                 return n;
1129         return neigh_create(&arp_tbl, pkey, dev);
1130 }
1131
1132 static int rt_bind_neighbour(struct rtable *rt)
1133 {
1134         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1135         if (IS_ERR(n))
1136                 return PTR_ERR(n);
1137         dst_set_neighbour(&rt->dst, n);
1138
1139         return 0;
1140 }
1141
1142 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1143                                      struct sk_buff *skb, int ifindex)
1144 {
1145         struct rtable   *rth, *cand;
1146         struct rtable __rcu **rthp, **candp;
1147         unsigned long   now;
1148         u32             min_score;
1149         int             chain_length;
1150         int attempts = !in_softirq();
1151
1152 restart:
1153         chain_length = 0;
1154         min_score = ~(u32)0;
1155         cand = NULL;
1156         candp = NULL;
1157         now = jiffies;
1158
1159         if (!rt_caching(dev_net(rt->dst.dev))) {
1160                 /*
1161                  * If we're not caching, just tell the caller we
1162                  * were successful and don't touch the route.  The
1163                  * caller hold the sole reference to the cache entry, and
1164                  * it will be released when the caller is done with it.
1165                  * If we drop it here, the callers have no way to resolve routes
1166                  * when we're not caching.  Instead, just point *rp at rt, so
1167                  * the caller gets a single use out of the route
1168                  * Note that we do rt_free on this new route entry, so that
1169                  * once its refcount hits zero, we are still able to reap it
1170                  * (Thanks Alexey)
1171                  * Note: To avoid expensive rcu stuff for this uncached dst,
1172                  * we set DST_NOCACHE so that dst_release() can free dst without
1173                  * waiting a grace period.
1174                  */
1175
1176                 rt->dst.flags |= DST_NOCACHE;
1177                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1178                         int err = rt_bind_neighbour(rt);
1179                         if (err) {
1180                                 net_warn_ratelimited("Neighbour table failure & not caching routes\n");
1181                                 ip_rt_put(rt);
1182                                 return ERR_PTR(err);
1183                         }
1184                 }
1185
1186                 goto skip_hashing;
1187         }
1188
1189         rthp = &rt_hash_table[hash].chain;
1190
1191         spin_lock_bh(rt_hash_lock_addr(hash));
1192         while ((rth = rcu_dereference_protected(*rthp,
1193                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1194                 if (rt_is_expired(rth)) {
1195                         *rthp = rth->dst.rt_next;
1196                         rt_free(rth);
1197                         continue;
1198                 }
1199                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1200                         /* Put it first */
1201                         *rthp = rth->dst.rt_next;
1202                         /*
1203                          * Since lookup is lockfree, the deletion
1204                          * must be visible to another weakly ordered CPU before
1205                          * the insertion at the start of the hash chain.
1206                          */
1207                         rcu_assign_pointer(rth->dst.rt_next,
1208                                            rt_hash_table[hash].chain);
1209                         /*
1210                          * Since lookup is lockfree, the update writes
1211                          * must be ordered for consistency on SMP.
1212                          */
1213                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1214
1215                         dst_use(&rth->dst, now);
1216                         spin_unlock_bh(rt_hash_lock_addr(hash));
1217
1218                         rt_drop(rt);
1219                         if (skb)
1220                                 skb_dst_set(skb, &rth->dst);
1221                         return rth;
1222                 }
1223
1224                 if (!atomic_read(&rth->dst.__refcnt)) {
1225                         u32 score = rt_score(rth);
1226
1227                         if (score <= min_score) {
1228                                 cand = rth;
1229                                 candp = rthp;
1230                                 min_score = score;
1231                         }
1232                 }
1233
1234                 chain_length++;
1235
1236                 rthp = &rth->dst.rt_next;
1237         }
1238
1239         if (cand) {
1240                 /* ip_rt_gc_elasticity used to be average length of chain
1241                  * length, when exceeded gc becomes really aggressive.
1242                  *
1243                  * The second limit is less certain. At the moment it allows
1244                  * only 2 entries per bucket. We will see.
1245                  */
1246                 if (chain_length > ip_rt_gc_elasticity) {
1247                         *candp = cand->dst.rt_next;
1248                         rt_free(cand);
1249                 }
1250         } else {
1251                 if (chain_length > rt_chain_length_max &&
1252                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1253                         struct net *net = dev_net(rt->dst.dev);
1254                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1255                         if (!rt_caching(net)) {
1256                                 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1257                                         rt->dst.dev->name, num);
1258                         }
1259                         rt_emergency_hash_rebuild(net);
1260                         spin_unlock_bh(rt_hash_lock_addr(hash));
1261
1262                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1263                                         ifindex, rt_genid(net));
1264                         goto restart;
1265                 }
1266         }
1267
1268         /* Try to bind route to arp only if it is output
1269            route or unicast forwarding path.
1270          */
1271         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1272                 int err = rt_bind_neighbour(rt);
1273                 if (err) {
1274                         spin_unlock_bh(rt_hash_lock_addr(hash));
1275
1276                         if (err != -ENOBUFS) {
1277                                 rt_drop(rt);
1278                                 return ERR_PTR(err);
1279                         }
1280
1281                         /* Neighbour tables are full and nothing
1282                            can be released. Try to shrink route cache,
1283                            it is most likely it holds some neighbour records.
1284                          */
1285                         if (attempts-- > 0) {
1286                                 int saved_elasticity = ip_rt_gc_elasticity;
1287                                 int saved_int = ip_rt_gc_min_interval;
1288                                 ip_rt_gc_elasticity     = 1;
1289                                 ip_rt_gc_min_interval   = 0;
1290                                 rt_garbage_collect(&ipv4_dst_ops);
1291                                 ip_rt_gc_min_interval   = saved_int;
1292                                 ip_rt_gc_elasticity     = saved_elasticity;
1293                                 goto restart;
1294                         }
1295
1296                         net_warn_ratelimited("Neighbour table overflow\n");
1297                         rt_drop(rt);
1298                         return ERR_PTR(-ENOBUFS);
1299                 }
1300         }
1301
1302         rt->dst.rt_next = rt_hash_table[hash].chain;
1303
1304         /*
1305          * Since lookup is lockfree, we must make sure
1306          * previous writes to rt are committed to memory
1307          * before making rt visible to other CPUS.
1308          */
1309         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1310
1311         spin_unlock_bh(rt_hash_lock_addr(hash));
1312
1313 skip_hashing:
1314         if (skb)
1315                 skb_dst_set(skb, &rt->dst);
1316         return rt;
1317 }
1318
1319 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1320
1321 static u32 rt_peer_genid(void)
1322 {
1323         return atomic_read(&__rt_peer_genid);
1324 }
1325
1326 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1327 {
1328         struct inet_peer_base *base;
1329         struct inet_peer *peer;
1330
1331         base = inetpeer_base_ptr(rt->_peer);
1332         if (!base)
1333                 return;
1334
1335         peer = inet_getpeer_v4(base, daddr, create);
1336
1337         if (!rt_set_peer(rt, peer))
1338                 inet_putpeer(peer);
1339         else
1340                 rt->rt_peer_genid = rt_peer_genid();
1341 }
1342
1343 /*
1344  * Peer allocation may fail only in serious out-of-memory conditions.  However
1345  * we still can generate some output.
1346  * Random ID selection looks a bit dangerous because we have no chances to
1347  * select ID being unique in a reasonable period of time.
1348  * But broken packet identifier may be better than no packet at all.
1349  */
1350 static void ip_select_fb_ident(struct iphdr *iph)
1351 {
1352         static DEFINE_SPINLOCK(ip_fb_id_lock);
1353         static u32 ip_fallback_id;
1354         u32 salt;
1355
1356         spin_lock_bh(&ip_fb_id_lock);
1357         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1358         iph->id = htons(salt & 0xFFFF);
1359         ip_fallback_id = salt;
1360         spin_unlock_bh(&ip_fb_id_lock);
1361 }
1362
1363 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1364 {
1365         struct rtable *rt = (struct rtable *) dst;
1366
1367         if (rt && !(rt->dst.flags & DST_NOPEER)) {
1368                 struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);
1369
1370                 /* If peer is attached to destination, it is never detached,
1371                    so that we need not to grab a lock to dereference it.
1372                  */
1373                 if (peer) {
1374                         iph->id = htons(inet_getid(peer, more));
1375                         return;
1376                 }
1377         } else if (!rt)
1378                 pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
1379
1380         ip_select_fb_ident(iph);
1381 }
1382 EXPORT_SYMBOL(__ip_select_ident);
1383
1384 static void rt_del(unsigned int hash, struct rtable *rt)
1385 {
1386         struct rtable __rcu **rthp;
1387         struct rtable *aux;
1388
1389         rthp = &rt_hash_table[hash].chain;
1390         spin_lock_bh(rt_hash_lock_addr(hash));
1391         ip_rt_put(rt);
1392         while ((aux = rcu_dereference_protected(*rthp,
1393                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1394                 if (aux == rt || rt_is_expired(aux)) {
1395                         *rthp = aux->dst.rt_next;
1396                         rt_free(aux);
1397                         continue;
1398                 }
1399                 rthp = &aux->dst.rt_next;
1400         }
1401         spin_unlock_bh(rt_hash_lock_addr(hash));
1402 }
1403
1404 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1405 {
1406         struct rtable *rt = (struct rtable *) dst;
1407         __be32 orig_gw = rt->rt_gateway;
1408         struct neighbour *n, *old_n;
1409
1410         dst_confirm(&rt->dst);
1411
1412         rt->rt_gateway = peer->redirect_learned.a4;
1413
1414         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1415         if (IS_ERR(n)) {
1416                 rt->rt_gateway = orig_gw;
1417                 return;
1418         }
1419         old_n = xchg(&rt->dst._neighbour, n);
1420         if (old_n)
1421                 neigh_release(old_n);
1422         if (!(n->nud_state & NUD_VALID)) {
1423                 neigh_event_send(n, NULL);
1424         } else {
1425                 rt->rt_flags |= RTCF_REDIRECTED;
1426                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1427         }
1428 }
1429
1430 /* called in rcu_read_lock() section */
1431 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1432                     __be32 saddr, struct net_device *dev)
1433 {
1434         int s, i;
1435         struct in_device *in_dev = __in_dev_get_rcu(dev);
1436         __be32 skeys[2] = { saddr, 0 };
1437         int    ikeys[2] = { dev->ifindex, 0 };
1438         struct inet_peer *peer;
1439         struct net *net;
1440
1441         if (!in_dev)
1442                 return;
1443
1444         net = dev_net(dev);
1445         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1446             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1447             ipv4_is_zeronet(new_gw))
1448                 goto reject_redirect;
1449
1450         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1451                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1452                         goto reject_redirect;
1453                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1454                         goto reject_redirect;
1455         } else {
1456                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1457                         goto reject_redirect;
1458         }
1459
1460         for (s = 0; s < 2; s++) {
1461                 for (i = 0; i < 2; i++) {
1462                         unsigned int hash;
1463                         struct rtable __rcu **rthp;
1464                         struct rtable *rt;
1465
1466                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1467
1468                         rthp = &rt_hash_table[hash].chain;
1469
1470                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1471                                 rthp = &rt->dst.rt_next;
1472
1473                                 if (rt->rt_key_dst != daddr ||
1474                                     rt->rt_key_src != skeys[s] ||
1475                                     rt->rt_oif != ikeys[i] ||
1476                                     rt_is_input_route(rt) ||
1477                                     rt_is_expired(rt) ||
1478                                     !net_eq(dev_net(rt->dst.dev), net) ||
1479                                     rt->dst.error ||
1480                                     rt->dst.dev != dev ||
1481                                     rt->rt_gateway != old_gw)
1482                                         continue;
1483
1484                                 peer = rt_get_peer_create(rt, rt->rt_dst);
1485                                 if (peer) {
1486                                         if (peer->redirect_learned.a4 != new_gw) {
1487                                                 peer->redirect_learned.a4 = new_gw;
1488                                                 atomic_inc(&__rt_peer_genid);
1489                                         }
1490                                         check_peer_redir(&rt->dst, peer);
1491                                 }
1492                         }
1493                 }
1494         }
1495         return;
1496
1497 reject_redirect:
1498 #ifdef CONFIG_IP_ROUTE_VERBOSE
1499         if (IN_DEV_LOG_MARTIANS(in_dev))
1500                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1501                                      "  Advised path = %pI4 -> %pI4\n",
1502                                      &old_gw, dev->name, &new_gw,
1503                                      &saddr, &daddr);
1504 #endif
1505         ;
1506 }
1507
1508 static bool peer_pmtu_expired(struct inet_peer *peer)
1509 {
1510         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1511
1512         return orig &&
1513                time_after_eq(jiffies, orig) &&
1514                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1515 }
1516
1517 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1518 {
1519         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1520
1521         return orig &&
1522                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1523 }
1524
1525 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1526 {
1527         struct rtable *rt = (struct rtable *)dst;
1528         struct dst_entry *ret = dst;
1529
1530         if (rt) {
1531                 if (dst->obsolete > 0) {
1532                         ip_rt_put(rt);
1533                         ret = NULL;
1534                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1535                         unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1536                                                 rt->rt_oif,
1537                                                 rt_genid(dev_net(dst->dev)));
1538                         rt_del(hash, rt);
1539                         ret = NULL;
1540                 } else if (rt_has_peer(rt)) {
1541                         struct inet_peer *peer = rt_peer_ptr(rt);
1542                         if (peer_pmtu_expired(peer))
1543                                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1544                 }
1545         }
1546         return ret;
1547 }
1548
1549 /*
1550  * Algorithm:
1551  *      1. The first ip_rt_redirect_number redirects are sent
1552  *         with exponential backoff, then we stop sending them at all,
1553  *         assuming that the host ignores our redirects.
1554  *      2. If we did not see packets requiring redirects
1555  *         during ip_rt_redirect_silence, we assume that the host
1556  *         forgot redirected route and start to send redirects again.
1557  *
1558  * This algorithm is much cheaper and more intelligent than dumb load limiting
1559  * in icmp.c.
1560  *
1561  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1562  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1563  */
1564
1565 void ip_rt_send_redirect(struct sk_buff *skb)
1566 {
1567         struct rtable *rt = skb_rtable(skb);
1568         struct in_device *in_dev;
1569         struct inet_peer *peer;
1570         int log_martians;
1571
1572         rcu_read_lock();
1573         in_dev = __in_dev_get_rcu(rt->dst.dev);
1574         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1575                 rcu_read_unlock();
1576                 return;
1577         }
1578         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1579         rcu_read_unlock();
1580
1581         peer = rt_get_peer_create(rt, rt->rt_dst);
1582         if (!peer) {
1583                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1584                 return;
1585         }
1586
1587         /* No redirected packets during ip_rt_redirect_silence;
1588          * reset the algorithm.
1589          */
1590         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1591                 peer->rate_tokens = 0;
1592
1593         /* Too many ignored redirects; do not send anything
1594          * set dst.rate_last to the last seen redirected packet.
1595          */
1596         if (peer->rate_tokens >= ip_rt_redirect_number) {
1597                 peer->rate_last = jiffies;
1598                 return;
1599         }
1600
1601         /* Check for load limit; set rate_last to the latest sent
1602          * redirect.
1603          */
1604         if (peer->rate_tokens == 0 ||
1605             time_after(jiffies,
1606                        (peer->rate_last +
1607                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1608                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1609                 peer->rate_last = jiffies;
1610                 ++peer->rate_tokens;
1611 #ifdef CONFIG_IP_ROUTE_VERBOSE
1612                 if (log_martians &&
1613                     peer->rate_tokens == ip_rt_redirect_number)
1614                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1615                                              &ip_hdr(skb)->saddr, rt->rt_iif,
1616                                              &rt->rt_dst, &rt->rt_gateway);
1617 #endif
1618         }
1619 }
1620
1621 static int ip_error(struct sk_buff *skb)
1622 {
1623         struct rtable *rt = skb_rtable(skb);
1624         struct inet_peer *peer;
1625         unsigned long now;
1626         bool send;
1627         int code;
1628
1629         switch (rt->dst.error) {
1630         case EINVAL:
1631         default:
1632                 goto out;
1633         case EHOSTUNREACH:
1634                 code = ICMP_HOST_UNREACH;
1635                 break;
1636         case ENETUNREACH:
1637                 code = ICMP_NET_UNREACH;
1638                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1639                                 IPSTATS_MIB_INNOROUTES);
1640                 break;
1641         case EACCES:
1642                 code = ICMP_PKT_FILTERED;
1643                 break;
1644         }
1645
1646         peer = rt_get_peer_create(rt, rt->rt_dst);
1647
1648         send = true;
1649         if (peer) {
1650                 now = jiffies;
1651                 peer->rate_tokens += now - peer->rate_last;
1652                 if (peer->rate_tokens > ip_rt_error_burst)
1653                         peer->rate_tokens = ip_rt_error_burst;
1654                 peer->rate_last = now;
1655                 if (peer->rate_tokens >= ip_rt_error_cost)
1656                         peer->rate_tokens -= ip_rt_error_cost;
1657                 else
1658                         send = false;
1659         }
1660         if (send)
1661                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1662
1663 out:    kfree_skb(skb);
1664         return 0;
1665 }
1666
1667 /*
1668  *      The last two values are not from the RFC but
1669  *      are needed for AMPRnet AX.25 paths.
1670  */
1671
1672 static const unsigned short mtu_plateau[] =
1673 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1674
1675 static inline unsigned short guess_mtu(unsigned short old_mtu)
1676 {
1677         int i;
1678
1679         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1680                 if (old_mtu > mtu_plateau[i])
1681                         return mtu_plateau[i];
1682         return 68;
1683 }
1684
1685 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1686                                  unsigned short new_mtu,
1687                                  struct net_device *dev)
1688 {
1689         unsigned short old_mtu = ntohs(iph->tot_len);
1690         unsigned short est_mtu = 0;
1691         struct inet_peer *peer;
1692
1693         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
1694         if (peer) {
1695                 unsigned short mtu = new_mtu;
1696
1697                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1698                         /* BSD 4.2 derived systems incorrectly adjust
1699                          * tot_len by the IP header length, and report
1700                          * a zero MTU in the ICMP message.
1701                          */
1702                         if (mtu == 0 &&
1703                             old_mtu >= 68 + (iph->ihl << 2))
1704                                 old_mtu -= iph->ihl << 2;
1705                         mtu = guess_mtu(old_mtu);
1706                 }
1707
1708                 if (mtu < ip_rt_min_pmtu)
1709                         mtu = ip_rt_min_pmtu;
1710                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1711                         unsigned long pmtu_expires;
1712
1713                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1714                         if (!pmtu_expires)
1715                                 pmtu_expires = 1UL;
1716
1717                         est_mtu = mtu;
1718                         peer->pmtu_learned = mtu;
1719                         peer->pmtu_expires = pmtu_expires;
1720                         atomic_inc(&__rt_peer_genid);
1721                 }
1722
1723                 inet_putpeer(peer);
1724         }
1725         return est_mtu ? : new_mtu;
1726 }
1727
1728 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1729 {
1730         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1731
1732         if (!expires)
1733                 return;
1734         if (time_before(jiffies, expires)) {
1735                 u32 orig_dst_mtu = dst_mtu(dst);
1736                 if (peer->pmtu_learned < orig_dst_mtu) {
1737                         if (!peer->pmtu_orig)
1738                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1739                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1740                 }
1741         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1742                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1743 }
1744
1745 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1746 {
1747         struct rtable *rt = (struct rtable *) dst;
1748         struct inet_peer *peer;
1749
1750         dst_confirm(dst);
1751
1752         peer = rt_get_peer_create(rt, rt->rt_dst);
1753         if (peer) {
1754                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1755
1756                 if (mtu < ip_rt_min_pmtu)
1757                         mtu = ip_rt_min_pmtu;
1758                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1759
1760                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1761                         if (!pmtu_expires)
1762                                 pmtu_expires = 1UL;
1763
1764                         peer->pmtu_learned = mtu;
1765                         peer->pmtu_expires = pmtu_expires;
1766
1767                         atomic_inc(&__rt_peer_genid);
1768                         rt->rt_peer_genid = rt_peer_genid();
1769                 }
1770                 check_peer_pmtu(dst, peer);
1771         }
1772 }
1773
1774
1775 static void ipv4_validate_peer(struct rtable *rt)
1776 {
1777         if (rt->rt_peer_genid != rt_peer_genid()) {
1778                 struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);
1779
1780                 if (peer) {
1781                         check_peer_pmtu(&rt->dst, peer);
1782
1783                         if (peer->redirect_learned.a4 &&
1784                             peer->redirect_learned.a4 != rt->rt_gateway)
1785                                 check_peer_redir(&rt->dst, peer);
1786                 }
1787
1788                 rt->rt_peer_genid = rt_peer_genid();
1789         }
1790 }
1791
1792 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1793 {
1794         struct rtable *rt = (struct rtable *) dst;
1795
1796         if (rt_is_expired(rt))
1797                 return NULL;
1798         ipv4_validate_peer(rt);
1799         return dst;
1800 }
1801
1802 static void ipv4_dst_destroy(struct dst_entry *dst)
1803 {
1804         struct rtable *rt = (struct rtable *) dst;
1805
1806         if (rt->fi) {
1807                 fib_info_put(rt->fi);
1808                 rt->fi = NULL;
1809         }
1810         if (rt_has_peer(rt)) {
1811                 struct inet_peer *peer = rt_peer_ptr(rt);
1812                 inet_putpeer(peer);
1813         }
1814 }
1815
1816
1817 static void ipv4_link_failure(struct sk_buff *skb)
1818 {
1819         struct rtable *rt;
1820
1821         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1822
1823         rt = skb_rtable(skb);
1824         if (rt && rt_has_peer(rt)) {
1825                 struct inet_peer *peer = rt_peer_ptr(rt);
1826                 if (peer_pmtu_cleaned(peer))
1827                         dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig);
1828         }
1829 }
1830
1831 static int ip_rt_bug(struct sk_buff *skb)
1832 {
1833         pr_debug("%s: %pI4 -> %pI4, %s\n",
1834                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1835                  skb->dev ? skb->dev->name : "?");
1836         kfree_skb(skb);
1837         WARN_ON(1);
1838         return 0;
1839 }
1840
1841 /*
1842    We do not cache source address of outgoing interface,
1843    because it is used only by IP RR, TS and SRR options,
1844    so that it out of fast path.
1845
1846    BTW remember: "addr" is allowed to be not aligned
1847    in IP options!
1848  */
1849
1850 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1851 {
1852         __be32 src;
1853
1854         if (rt_is_output_route(rt))
1855                 src = ip_hdr(skb)->saddr;
1856         else {
1857                 struct fib_result res;
1858                 struct flowi4 fl4;
1859                 struct iphdr *iph;
1860
1861                 iph = ip_hdr(skb);
1862
1863                 memset(&fl4, 0, sizeof(fl4));
1864                 fl4.daddr = iph->daddr;
1865                 fl4.saddr = iph->saddr;
1866                 fl4.flowi4_tos = RT_TOS(iph->tos);
1867                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1868                 fl4.flowi4_iif = skb->dev->ifindex;
1869                 fl4.flowi4_mark = skb->mark;
1870
1871                 rcu_read_lock();
1872                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1873                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1874                 else
1875                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1876                                         RT_SCOPE_UNIVERSE);
1877                 rcu_read_unlock();
1878         }
1879         memcpy(addr, &src, 4);
1880 }
1881
1882 #ifdef CONFIG_IP_ROUTE_CLASSID
1883 static void set_class_tag(struct rtable *rt, u32 tag)
1884 {
1885         if (!(rt->dst.tclassid & 0xFFFF))
1886                 rt->dst.tclassid |= tag & 0xFFFF;
1887         if (!(rt->dst.tclassid & 0xFFFF0000))
1888                 rt->dst.tclassid |= tag & 0xFFFF0000;
1889 }
1890 #endif
1891
1892 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1893 {
1894         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1895
1896         if (advmss == 0) {
1897                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1898                                ip_rt_min_advmss);
1899                 if (advmss > 65535 - 40)
1900                         advmss = 65535 - 40;
1901         }
1902         return advmss;
1903 }
1904
1905 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1906 {
1907         const struct rtable *rt = (const struct rtable *) dst;
1908         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1909
1910         if (mtu && rt_is_output_route(rt))
1911                 return mtu;
1912
1913         mtu = dst->dev->mtu;
1914
1915         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1916
1917                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1918                         mtu = 576;
1919         }
1920
1921         if (mtu > IP_MAX_MTU)
1922                 mtu = IP_MAX_MTU;
1923
1924         return mtu;
1925 }
1926
1927 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1928                             struct fib_info *fi)
1929 {
1930         struct inet_peer_base *base;
1931         struct inet_peer *peer;
1932         int create = 0;
1933
1934         /* If a peer entry exists for this destination, we must hook
1935          * it up in order to get at cached metrics.
1936          */
1937         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1938                 create = 1;
1939
1940         base = inetpeer_base_ptr(rt->_peer);
1941         BUG_ON(!base);
1942
1943         peer = inet_getpeer_v4(base, rt->rt_dst, create);
1944         if (peer) {
1945                 __rt_set_peer(rt, peer);
1946                 rt->rt_peer_genid = rt_peer_genid();
1947                 if (inet_metrics_new(peer))
1948                         memcpy(peer->metrics, fi->fib_metrics,
1949                                sizeof(u32) * RTAX_MAX);
1950                 dst_init_metrics(&rt->dst, peer->metrics, false);
1951
1952                 check_peer_pmtu(&rt->dst, peer);
1953
1954                 if (peer->redirect_learned.a4 &&
1955                     peer->redirect_learned.a4 != rt->rt_gateway) {
1956                         rt->rt_gateway = peer->redirect_learned.a4;
1957                         rt->rt_flags |= RTCF_REDIRECTED;
1958                 }
1959         } else {
1960                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1961                         rt->fi = fi;
1962                         atomic_inc(&fi->fib_clntref);
1963                 }
1964                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1965         }
1966 }
1967
1968 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1969                            const struct fib_result *res,
1970                            struct fib_info *fi, u16 type, u32 itag)
1971 {
1972         struct dst_entry *dst = &rt->dst;
1973
1974         if (fi) {
1975                 if (FIB_RES_GW(*res) &&
1976                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1977                         rt->rt_gateway = FIB_RES_GW(*res);
1978                 rt_init_metrics(rt, fl4, fi);
1979 #ifdef CONFIG_IP_ROUTE_CLASSID
1980                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1981 #endif
1982         }
1983
1984         if (dst_mtu(dst) > IP_MAX_MTU)
1985                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1986         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1987                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1988
1989 #ifdef CONFIG_IP_ROUTE_CLASSID
1990 #ifdef CONFIG_IP_MULTIPLE_TABLES
1991         set_class_tag(rt, fib_rules_tclass(res));
1992 #endif
1993         set_class_tag(rt, itag);
1994 #endif
1995 }
1996
1997 static struct rtable *rt_dst_alloc(struct net_device *dev,
1998                                    bool nopolicy, bool noxfrm)
1999 {
2000         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2001                          DST_HOST |
2002                          (nopolicy ? DST_NOPOLICY : 0) |
2003                          (noxfrm ? DST_NOXFRM : 0));
2004 }
2005
2006 /* called in rcu_read_lock() section */
2007 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2008                                 u8 tos, struct net_device *dev, int our)
2009 {
2010         unsigned int hash;
2011         struct rtable *rth;
2012         __be32 spec_dst;
2013         struct in_device *in_dev = __in_dev_get_rcu(dev);
2014         u32 itag = 0;
2015         int err;
2016
2017         /* Primary sanity checks. */
2018
2019         if (in_dev == NULL)
2020                 return -EINVAL;
2021
2022         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2023             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2024                 goto e_inval;
2025
2026         if (ipv4_is_zeronet(saddr)) {
2027                 if (!ipv4_is_local_multicast(daddr))
2028                         goto e_inval;
2029                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2030         } else {
2031                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2032                                           &itag);
2033                 if (err < 0)
2034                         goto e_err;
2035         }
2036         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
2037                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2038         if (!rth)
2039                 goto e_nobufs;
2040
2041 #ifdef CONFIG_IP_ROUTE_CLASSID
2042         rth->dst.tclassid = itag;
2043 #endif
2044         rth->dst.output = ip_rt_bug;
2045
2046         rth->rt_key_dst = daddr;
2047         rth->rt_key_src = saddr;
2048         rth->rt_genid   = rt_genid(dev_net(dev));
2049         rth->rt_flags   = RTCF_MULTICAST;
2050         rth->rt_type    = RTN_MULTICAST;
2051         rth->rt_key_tos = tos;
2052         rth->rt_dst     = daddr;
2053         rth->rt_src     = saddr;
2054         rth->rt_route_iif = dev->ifindex;
2055         rth->rt_iif     = dev->ifindex;
2056         rth->rt_oif     = 0;
2057         rth->rt_mark    = skb->mark;
2058         rth->rt_gateway = daddr;
2059         rth->rt_spec_dst= spec_dst;
2060         rth->rt_peer_genid = 0;
2061         rt_init_peer(rth, dev_net(dev)->ipv4.peers);
2062         rth->fi = NULL;
2063         if (our) {
2064                 rth->dst.input= ip_local_deliver;
2065                 rth->rt_flags |= RTCF_LOCAL;
2066         }
2067
2068 #ifdef CONFIG_IP_MROUTE
2069         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2070                 rth->dst.input = ip_mr_input;
2071 #endif
2072         RT_CACHE_STAT_INC(in_slow_mc);
2073
2074         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2075         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2076         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2077
2078 e_nobufs:
2079         return -ENOBUFS;
2080 e_inval:
2081         return -EINVAL;
2082 e_err:
2083         return err;
2084 }
2085
2086
2087 static void ip_handle_martian_source(struct net_device *dev,
2088                                      struct in_device *in_dev,
2089                                      struct sk_buff *skb,
2090                                      __be32 daddr,
2091                                      __be32 saddr)
2092 {
2093         RT_CACHE_STAT_INC(in_martian_src);
2094 #ifdef CONFIG_IP_ROUTE_VERBOSE
2095         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2096                 /*
2097                  *      RFC1812 recommendation, if source is martian,
2098                  *      the only hint is MAC header.
2099                  */
2100                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2101                         &daddr, &saddr, dev->name);
2102                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2103                         print_hex_dump(KERN_WARNING, "ll header: ",
2104                                        DUMP_PREFIX_OFFSET, 16, 1,
2105                                        skb_mac_header(skb),
2106                                        dev->hard_header_len, true);
2107                 }
2108         }
2109 #endif
2110 }
2111
2112 /* called in rcu_read_lock() section */
2113 static int __mkroute_input(struct sk_buff *skb,
2114                            const struct fib_result *res,
2115                            struct in_device *in_dev,
2116                            __be32 daddr, __be32 saddr, u32 tos,
2117                            struct rtable **result)
2118 {
2119         struct rtable *rth;
2120         int err;
2121         struct in_device *out_dev;
2122         unsigned int flags = 0;
2123         __be32 spec_dst;
2124         u32 itag;
2125
2126         /* get a working reference to the output device */
2127         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2128         if (out_dev == NULL) {
2129                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2130                 return -EINVAL;
2131         }
2132
2133
2134         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2135                                   in_dev->dev, &spec_dst, &itag);
2136         if (err < 0) {
2137                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2138                                          saddr);
2139
2140                 goto cleanup;
2141         }
2142
2143         if (err)
2144                 flags |= RTCF_DIRECTSRC;
2145
2146         if (out_dev == in_dev && err &&
2147             (IN_DEV_SHARED_MEDIA(out_dev) ||
2148              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2149                 flags |= RTCF_DOREDIRECT;
2150
2151         if (skb->protocol != htons(ETH_P_IP)) {
2152                 /* Not IP (i.e. ARP). Do not create route, if it is
2153                  * invalid for proxy arp. DNAT routes are always valid.
2154                  *
2155                  * Proxy arp feature have been extended to allow, ARP
2156                  * replies back to the same interface, to support
2157                  * Private VLAN switch technologies. See arp.c.
2158                  */
2159                 if (out_dev == in_dev &&
2160                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2161                         err = -EINVAL;
2162                         goto cleanup;
2163                 }
2164         }
2165
2166         rth = rt_dst_alloc(out_dev->dev,
2167                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2168                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2169         if (!rth) {
2170                 err = -ENOBUFS;
2171                 goto cleanup;
2172         }
2173
2174         rth->rt_key_dst = daddr;
2175         rth->rt_key_src = saddr;
2176         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2177         rth->rt_flags = flags;
2178         rth->rt_type = res->type;
2179         rth->rt_key_tos = tos;
2180         rth->rt_dst     = daddr;
2181         rth->rt_src     = saddr;
2182         rth->rt_route_iif = in_dev->dev->ifindex;
2183         rth->rt_iif     = in_dev->dev->ifindex;
2184         rth->rt_oif     = 0;
2185         rth->rt_mark    = skb->mark;
2186         rth->rt_gateway = daddr;
2187         rth->rt_spec_dst= spec_dst;
2188         rth->rt_peer_genid = 0;
2189         rt_init_peer(rth, dev_net(rth->dst.dev)->ipv4.peers);
2190         rth->fi = NULL;
2191
2192         rth->dst.input = ip_forward;
2193         rth->dst.output = ip_output;
2194
2195         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2196
2197         *result = rth;
2198         err = 0;
2199  cleanup:
2200         return err;
2201 }
2202
2203 static int ip_mkroute_input(struct sk_buff *skb,
2204                             struct fib_result *res,
2205                             const struct flowi4 *fl4,
2206                             struct in_device *in_dev,
2207                             __be32 daddr, __be32 saddr, u32 tos)
2208 {
2209         struct rtable *rth = NULL;
2210         int err;
2211         unsigned int hash;
2212
2213 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2214         if (res->fi && res->fi->fib_nhs > 1)
2215                 fib_select_multipath(res);
2216 #endif
2217
2218         /* create a routing cache entry */
2219         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2220         if (err)
2221                 return err;
2222
2223         /* put it into the cache */
2224         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2225                        rt_genid(dev_net(rth->dst.dev)));
2226         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2227         if (IS_ERR(rth))
2228                 return PTR_ERR(rth);
2229         return 0;
2230 }
2231
2232 /*
2233  *      NOTE. We drop all the packets that has local source
2234  *      addresses, because every properly looped back packet
2235  *      must have correct destination already attached by output routine.
2236  *
2237  *      Such approach solves two big problems:
2238  *      1. Not simplex devices are handled properly.
2239  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2240  *      called with rcu_read_lock()
2241  */
2242
2243 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2244                                u8 tos, struct net_device *dev)
2245 {
2246         struct fib_result res;
2247         struct in_device *in_dev = __in_dev_get_rcu(dev);
2248         struct flowi4   fl4;
2249         unsigned int    flags = 0;
2250         u32             itag = 0;
2251         struct rtable   *rth;
2252         unsigned int    hash;
2253         __be32          spec_dst;
2254         int             err = -EINVAL;
2255         struct net    *net = dev_net(dev);
2256
2257         /* IP on this device is disabled. */
2258
2259         if (!in_dev)
2260                 goto out;
2261
2262         /* Check for the most weird martians, which can be not detected
2263            by fib_lookup.
2264          */
2265
2266         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2267             ipv4_is_loopback(saddr))
2268                 goto martian_source;
2269
2270         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2271                 goto brd_input;
2272
2273         /* Accept zero addresses only to limited broadcast;
2274          * I even do not know to fix it or not. Waiting for complains :-)
2275          */
2276         if (ipv4_is_zeronet(saddr))
2277                 goto martian_source;
2278
2279         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2280                 goto martian_destination;
2281
2282         /*
2283          *      Now we are ready to route packet.
2284          */
2285         fl4.flowi4_oif = 0;
2286         fl4.flowi4_iif = dev->ifindex;
2287         fl4.flowi4_mark = skb->mark;
2288         fl4.flowi4_tos = tos;
2289         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2290         fl4.daddr = daddr;
2291         fl4.saddr = saddr;
2292         err = fib_lookup(net, &fl4, &res);
2293         if (err != 0) {
2294                 if (!IN_DEV_FORWARD(in_dev))
2295                         goto e_hostunreach;
2296                 goto no_route;
2297         }
2298
2299         RT_CACHE_STAT_INC(in_slow_tot);
2300
2301         if (res.type == RTN_BROADCAST)
2302                 goto brd_input;
2303
2304         if (res.type == RTN_LOCAL) {
2305                 err = fib_validate_source(skb, saddr, daddr, tos,
2306                                           net->loopback_dev->ifindex,
2307                                           dev, &spec_dst, &itag);
2308                 if (err < 0)
2309                         goto martian_source_keep_err;
2310                 if (err)
2311                         flags |= RTCF_DIRECTSRC;
2312                 spec_dst = daddr;
2313                 goto local_input;
2314         }
2315
2316         if (!IN_DEV_FORWARD(in_dev))
2317                 goto e_hostunreach;
2318         if (res.type != RTN_UNICAST)
2319                 goto martian_destination;
2320
2321         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2322 out:    return err;
2323
2324 brd_input:
2325         if (skb->protocol != htons(ETH_P_IP))
2326                 goto e_inval;
2327
2328         if (ipv4_is_zeronet(saddr))
2329                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2330         else {
2331                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2332                                           &itag);
2333                 if (err < 0)
2334                         goto martian_source_keep_err;
2335                 if (err)
2336                         flags |= RTCF_DIRECTSRC;
2337         }
2338         flags |= RTCF_BROADCAST;
2339         res.type = RTN_BROADCAST;
2340         RT_CACHE_STAT_INC(in_brd);
2341
2342 local_input:
2343         rth = rt_dst_alloc(net->loopback_dev,
2344                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2345         if (!rth)
2346                 goto e_nobufs;
2347
2348         rth->dst.input= ip_local_deliver;
2349         rth->dst.output= ip_rt_bug;
2350 #ifdef CONFIG_IP_ROUTE_CLASSID
2351         rth->dst.tclassid = itag;
2352 #endif
2353
2354         rth->rt_key_dst = daddr;
2355         rth->rt_key_src = saddr;
2356         rth->rt_genid = rt_genid(net);
2357         rth->rt_flags   = flags|RTCF_LOCAL;
2358         rth->rt_type    = res.type;
2359         rth->rt_key_tos = tos;
2360         rth->rt_dst     = daddr;
2361         rth->rt_src     = saddr;
2362 #ifdef CONFIG_IP_ROUTE_CLASSID
2363         rth->dst.tclassid = itag;
2364 #endif
2365         rth->rt_route_iif = dev->ifindex;
2366         rth->rt_iif     = dev->ifindex;
2367         rth->rt_oif     = 0;
2368         rth->rt_mark    = skb->mark;
2369         rth->rt_gateway = daddr;
2370         rth->rt_spec_dst= spec_dst;
2371         rth->rt_peer_genid = 0;
2372         rt_init_peer(rth, net->ipv4.peers);
2373         rth->fi = NULL;
2374         if (res.type == RTN_UNREACHABLE) {
2375                 rth->dst.input= ip_error;
2376                 rth->dst.error= -err;
2377                 rth->rt_flags   &= ~RTCF_LOCAL;
2378         }
2379         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2380         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2381         err = 0;
2382         if (IS_ERR(rth))
2383                 err = PTR_ERR(rth);
2384         goto out;
2385
2386 no_route:
2387         RT_CACHE_STAT_INC(in_no_route);
2388         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2389         res.type = RTN_UNREACHABLE;
2390         if (err == -ESRCH)
2391                 err = -ENETUNREACH;
2392         goto local_input;
2393
2394         /*
2395          *      Do not cache martian addresses: they should be logged (RFC1812)
2396          */
2397 martian_destination:
2398         RT_CACHE_STAT_INC(in_martian_dst);
2399 #ifdef CONFIG_IP_ROUTE_VERBOSE
2400         if (IN_DEV_LOG_MARTIANS(in_dev))
2401                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2402                                      &daddr, &saddr, dev->name);
2403 #endif
2404
2405 e_hostunreach:
2406         err = -EHOSTUNREACH;
2407         goto out;
2408
2409 e_inval:
2410         err = -EINVAL;
2411         goto out;
2412
2413 e_nobufs:
2414         err = -ENOBUFS;
2415         goto out;
2416
2417 martian_source:
2418         err = -EINVAL;
2419 martian_source_keep_err:
2420         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2421         goto out;
2422 }
2423
2424 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2425                            u8 tos, struct net_device *dev, bool noref)
2426 {
2427         struct rtable   *rth;
2428         unsigned int    hash;
2429         int iif = dev->ifindex;
2430         struct net *net;
2431         int res;
2432
2433         net = dev_net(dev);
2434
2435         rcu_read_lock();
2436
2437         if (!rt_caching(net))
2438                 goto skip_cache;
2439
2440         tos &= IPTOS_RT_MASK;
2441         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2442
2443         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2444              rth = rcu_dereference(rth->dst.rt_next)) {
2445                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2446                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2447                      (rth->rt_route_iif ^ iif) |
2448                      (rth->rt_key_tos ^ tos)) == 0 &&
2449                     rth->rt_mark == skb->mark &&
2450                     net_eq(dev_net(rth->dst.dev), net) &&
2451                     !rt_is_expired(rth)) {
2452                         ipv4_validate_peer(rth);
2453                         if (noref) {
2454                                 dst_use_noref(&rth->dst, jiffies);
2455                                 skb_dst_set_noref(skb, &rth->dst);
2456                         } else {
2457                                 dst_use(&rth->dst, jiffies);
2458                                 skb_dst_set(skb, &rth->dst);
2459                         }
2460                         RT_CACHE_STAT_INC(in_hit);
2461                         rcu_read_unlock();
2462                         return 0;
2463                 }
2464                 RT_CACHE_STAT_INC(in_hlist_search);
2465         }
2466
2467 skip_cache:
2468         /* Multicast recognition logic is moved from route cache to here.
2469            The problem was that too many Ethernet cards have broken/missing
2470            hardware multicast filters :-( As result the host on multicasting
2471            network acquires a lot of useless route cache entries, sort of
2472            SDR messages from all the world. Now we try to get rid of them.
2473            Really, provided software IP multicast filter is organized
2474            reasonably (at least, hashed), it does not result in a slowdown
2475            comparing with route cache reject entries.
2476            Note, that multicast routers are not affected, because
2477            route cache entry is created eventually.
2478          */
2479         if (ipv4_is_multicast(daddr)) {
2480                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2481
2482                 if (in_dev) {
2483                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2484                                                   ip_hdr(skb)->protocol);
2485                         if (our
2486 #ifdef CONFIG_IP_MROUTE
2487                                 ||
2488                             (!ipv4_is_local_multicast(daddr) &&
2489                              IN_DEV_MFORWARD(in_dev))
2490 #endif
2491                            ) {
2492                                 int res = ip_route_input_mc(skb, daddr, saddr,
2493                                                             tos, dev, our);
2494                                 rcu_read_unlock();
2495                                 return res;
2496                         }
2497                 }
2498                 rcu_read_unlock();
2499                 return -EINVAL;
2500         }
2501         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2502         rcu_read_unlock();
2503         return res;
2504 }
2505 EXPORT_SYMBOL(ip_route_input_common);
2506
2507 /* called with rcu_read_lock() */
2508 static struct rtable *__mkroute_output(const struct fib_result *res,
2509                                        const struct flowi4 *fl4,
2510                                        __be32 orig_daddr, __be32 orig_saddr,
2511                                        int orig_oif, __u8 orig_rtos,
2512                                        struct net_device *dev_out,
2513                                        unsigned int flags)
2514 {
2515         struct fib_info *fi = res->fi;
2516         struct in_device *in_dev;
2517         u16 type = res->type;
2518         struct rtable *rth;
2519
2520         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2521                 return ERR_PTR(-EINVAL);
2522
2523         if (ipv4_is_lbcast(fl4->daddr))
2524                 type = RTN_BROADCAST;
2525         else if (ipv4_is_multicast(fl4->daddr))
2526                 type = RTN_MULTICAST;
2527         else if (ipv4_is_zeronet(fl4->daddr))
2528                 return ERR_PTR(-EINVAL);
2529
2530         if (dev_out->flags & IFF_LOOPBACK)
2531                 flags |= RTCF_LOCAL;
2532
2533         in_dev = __in_dev_get_rcu(dev_out);
2534         if (!in_dev)
2535                 return ERR_PTR(-EINVAL);
2536
2537         if (type == RTN_BROADCAST) {
2538                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2539                 fi = NULL;
2540         } else if (type == RTN_MULTICAST) {
2541                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2542                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2543                                      fl4->flowi4_proto))
2544                         flags &= ~RTCF_LOCAL;
2545                 /* If multicast route do not exist use
2546                  * default one, but do not gateway in this case.
2547                  * Yes, it is hack.
2548                  */
2549                 if (fi && res->prefixlen < 4)
2550                         fi = NULL;
2551         }
2552
2553         rth = rt_dst_alloc(dev_out,
2554                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2555                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2556         if (!rth)
2557                 return ERR_PTR(-ENOBUFS);
2558
2559         rth->dst.output = ip_output;
2560
2561         rth->rt_key_dst = orig_daddr;
2562         rth->rt_key_src = orig_saddr;
2563         rth->rt_genid = rt_genid(dev_net(dev_out));
2564         rth->rt_flags   = flags;
2565         rth->rt_type    = type;
2566         rth->rt_key_tos = orig_rtos;
2567         rth->rt_dst     = fl4->daddr;
2568         rth->rt_src     = fl4->saddr;
2569         rth->rt_route_iif = 0;
2570         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2571         rth->rt_oif     = orig_oif;
2572         rth->rt_mark    = fl4->flowi4_mark;
2573         rth->rt_gateway = fl4->daddr;
2574         rth->rt_spec_dst= fl4->saddr;
2575         rth->rt_peer_genid = 0;
2576         rt_init_peer(rth, dev_net(dev_out)->ipv4.peers);
2577         rth->fi = NULL;
2578
2579         RT_CACHE_STAT_INC(out_slow_tot);
2580
2581         if (flags & RTCF_LOCAL) {
2582                 rth->dst.input = ip_local_deliver;
2583                 rth->rt_spec_dst = fl4->daddr;
2584         }
2585         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2586                 rth->rt_spec_dst = fl4->saddr;
2587                 if (flags & RTCF_LOCAL &&
2588                     !(dev_out->flags & IFF_LOOPBACK)) {
2589                         rth->dst.output = ip_mc_output;
2590                         RT_CACHE_STAT_INC(out_slow_mc);
2591                 }
2592 #ifdef CONFIG_IP_MROUTE
2593                 if (type == RTN_MULTICAST) {
2594                         if (IN_DEV_MFORWARD(in_dev) &&
2595                             !ipv4_is_local_multicast(fl4->daddr)) {
2596                                 rth->dst.input = ip_mr_input;
2597                                 rth->dst.output = ip_mc_output;
2598                         }
2599                 }
2600 #endif
2601         }
2602
2603         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2604
2605         return rth;
2606 }
2607
2608 /*
2609  * Major route resolver routine.
2610  * called with rcu_read_lock();
2611  */
2612
2613 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2614 {
2615         struct net_device *dev_out = NULL;
2616         __u8 tos = RT_FL_TOS(fl4);
2617         unsigned int flags = 0;
2618         struct fib_result res;
2619         struct rtable *rth;
2620         __be32 orig_daddr;
2621         __be32 orig_saddr;
2622         int orig_oif;
2623
2624         res.fi          = NULL;
2625 #ifdef CONFIG_IP_MULTIPLE_TABLES
2626         res.r           = NULL;
2627 #endif
2628
2629         orig_daddr = fl4->daddr;
2630         orig_saddr = fl4->saddr;
2631         orig_oif = fl4->flowi4_oif;
2632
2633         fl4->flowi4_iif = net->loopback_dev->ifindex;
2634         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2635         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2636                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2637
2638         rcu_read_lock();
2639         if (fl4->saddr) {
2640                 rth = ERR_PTR(-EINVAL);
2641                 if (ipv4_is_multicast(fl4->saddr) ||
2642                     ipv4_is_lbcast(fl4->saddr) ||
2643                     ipv4_is_zeronet(fl4->saddr))
2644                         goto out;
2645
2646                 /* I removed check for oif == dev_out->oif here.
2647                    It was wrong for two reasons:
2648                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2649                       is assigned to multiple interfaces.
2650                    2. Moreover, we are allowed to send packets with saddr
2651                       of another iface. --ANK
2652                  */
2653
2654                 if (fl4->flowi4_oif == 0 &&
2655                     (ipv4_is_multicast(fl4->daddr) ||
2656                      ipv4_is_lbcast(fl4->daddr))) {
2657                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2658                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2659                         if (dev_out == NULL)
2660                                 goto out;
2661
2662                         /* Special hack: user can direct multicasts
2663                            and limited broadcast via necessary interface
2664                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2665                            This hack is not just for fun, it allows
2666                            vic,vat and friends to work.
2667                            They bind socket to loopback, set ttl to zero
2668                            and expect that it will work.
2669                            From the viewpoint of routing cache they are broken,
2670                            because we are not allowed to build multicast path
2671                            with loopback source addr (look, routing cache
2672                            cannot know, that ttl is zero, so that packet
2673                            will not leave this host and route is valid).
2674                            Luckily, this hack is good workaround.
2675                          */
2676
2677                         fl4->flowi4_oif = dev_out->ifindex;
2678                         goto make_route;
2679                 }
2680
2681                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2682                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2683                         if (!__ip_dev_find(net, fl4->saddr, false))
2684                                 goto out;
2685                 }
2686         }
2687
2688
2689         if (fl4->flowi4_oif) {
2690                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2691                 rth = ERR_PTR(-ENODEV);
2692                 if (dev_out == NULL)
2693                         goto out;
2694
2695                 /* RACE: Check return value of inet_select_addr instead. */
2696                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2697                         rth = ERR_PTR(-ENETUNREACH);
2698                         goto out;
2699                 }
2700                 if (ipv4_is_local_multicast(fl4->daddr) ||
2701                     ipv4_is_lbcast(fl4->daddr)) {
2702                         if (!fl4->saddr)
2703                                 fl4->saddr = inet_select_addr(dev_out, 0,
2704                                                               RT_SCOPE_LINK);
2705                         goto make_route;
2706                 }
2707                 if (fl4->saddr) {
2708                         if (ipv4_is_multicast(fl4->daddr))
2709                                 fl4->saddr = inet_select_addr(dev_out, 0,
2710                                                               fl4->flowi4_scope);
2711                         else if (!fl4->daddr)
2712                                 fl4->saddr = inet_select_addr(dev_out, 0,
2713                                                               RT_SCOPE_HOST);
2714                 }
2715         }
2716
2717         if (!fl4->daddr) {
2718                 fl4->daddr = fl4->saddr;
2719                 if (!fl4->daddr)
2720                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2721                 dev_out = net->loopback_dev;
2722                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2723                 res.type = RTN_LOCAL;
2724                 flags |= RTCF_LOCAL;
2725                 goto make_route;
2726         }
2727
2728         if (fib_lookup(net, fl4, &res)) {
2729                 res.fi = NULL;
2730                 if (fl4->flowi4_oif) {
2731                         /* Apparently, routing tables are wrong. Assume,
2732                            that the destination is on link.
2733
2734                            WHY? DW.
2735                            Because we are allowed to send to iface
2736                            even if it has NO routes and NO assigned
2737                            addresses. When oif is specified, routing
2738                            tables are looked up with only one purpose:
2739                            to catch if destination is gatewayed, rather than
2740                            direct. Moreover, if MSG_DONTROUTE is set,
2741                            we send packet, ignoring both routing tables
2742                            and ifaddr state. --ANK
2743
2744
2745                            We could make it even if oif is unknown,
2746                            likely IPv6, but we do not.
2747                          */
2748
2749                         if (fl4->saddr == 0)
2750                                 fl4->saddr = inet_select_addr(dev_out, 0,
2751                                                               RT_SCOPE_LINK);
2752                         res.type = RTN_UNICAST;
2753                         goto make_route;
2754                 }
2755                 rth = ERR_PTR(-ENETUNREACH);
2756                 goto out;
2757         }
2758
2759         if (res.type == RTN_LOCAL) {
2760                 if (!fl4->saddr) {
2761                         if (res.fi->fib_prefsrc)
2762                                 fl4->saddr = res.fi->fib_prefsrc;
2763                         else
2764                                 fl4->saddr = fl4->daddr;
2765                 }
2766                 dev_out = net->loopback_dev;
2767                 fl4->flowi4_oif = dev_out->ifindex;
2768                 res.fi = NULL;
2769                 flags |= RTCF_LOCAL;
2770                 goto make_route;
2771         }
2772
2773 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2774         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2775                 fib_select_multipath(&res);
2776         else
2777 #endif
2778         if (!res.prefixlen &&
2779             res.table->tb_num_default > 1 &&
2780             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2781                 fib_select_default(&res);
2782
2783         if (!fl4->saddr)
2784                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2785
2786         dev_out = FIB_RES_DEV(res);
2787         fl4->flowi4_oif = dev_out->ifindex;
2788
2789
2790 make_route:
2791         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2792                                tos, dev_out, flags);
2793         if (!IS_ERR(rth)) {
2794                 unsigned int hash;
2795
2796                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2797                                rt_genid(dev_net(dev_out)));
2798                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2799         }
2800
2801 out:
2802         rcu_read_unlock();
2803         return rth;
2804 }
2805
2806 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2807 {
2808         struct rtable *rth;
2809         unsigned int hash;
2810
2811         if (!rt_caching(net))
2812                 goto slow_output;
2813
2814         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2815
2816         rcu_read_lock_bh();
2817         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2818                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2819                 if (rth->rt_key_dst == flp4->daddr &&
2820                     rth->rt_key_src == flp4->saddr &&
2821                     rt_is_output_route(rth) &&
2822                     rth->rt_oif == flp4->flowi4_oif &&
2823                     rth->rt_mark == flp4->flowi4_mark &&
2824                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2825                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2826                     net_eq(dev_net(rth->dst.dev), net) &&
2827                     !rt_is_expired(rth)) {
2828                         ipv4_validate_peer(rth);
2829                         dst_use(&rth->dst, jiffies);
2830                         RT_CACHE_STAT_INC(out_hit);
2831                         rcu_read_unlock_bh();
2832                         if (!flp4->saddr)
2833                                 flp4->saddr = rth->rt_src;
2834                         if (!flp4->daddr)
2835                                 flp4->daddr = rth->rt_dst;
2836                         return rth;
2837                 }
2838                 RT_CACHE_STAT_INC(out_hlist_search);
2839         }
2840         rcu_read_unlock_bh();
2841
2842 slow_output:
2843         return ip_route_output_slow(net, flp4);
2844 }
2845 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2846
2847 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2848 {
2849         return NULL;
2850 }
2851
2852 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2853 {
2854         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2855
2856         return mtu ? : dst->dev->mtu;
2857 }
2858
2859 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2860 {
2861 }
2862
2863 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2864                                           unsigned long old)
2865 {
2866         return NULL;
2867 }
2868
2869 static struct dst_ops ipv4_dst_blackhole_ops = {
2870         .family                 =       AF_INET,
2871         .protocol               =       cpu_to_be16(ETH_P_IP),
2872         .destroy                =       ipv4_dst_destroy,
2873         .check                  =       ipv4_blackhole_dst_check,
2874         .mtu                    =       ipv4_blackhole_mtu,
2875         .default_advmss         =       ipv4_default_advmss,
2876         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2877         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2878         .neigh_lookup           =       ipv4_neigh_lookup,
2879 };
2880
2881 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2882 {
2883         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2884         struct rtable *ort = (struct rtable *) dst_orig;
2885
2886         if (rt) {
2887                 struct dst_entry *new = &rt->dst;
2888
2889                 new->__use = 1;
2890                 new->input = dst_discard;
2891                 new->output = dst_discard;
2892                 dst_copy_metrics(new, &ort->dst);
2893
2894                 new->dev = ort->dst.dev;
2895                 if (new->dev)
2896                         dev_hold(new->dev);
2897
2898                 rt->rt_key_dst = ort->rt_key_dst;
2899                 rt->rt_key_src = ort->rt_key_src;
2900                 rt->rt_key_tos = ort->rt_key_tos;
2901                 rt->rt_route_iif = ort->rt_route_iif;
2902                 rt->rt_iif = ort->rt_iif;
2903                 rt->rt_oif = ort->rt_oif;
2904                 rt->rt_mark = ort->rt_mark;
2905
2906                 rt->rt_genid = rt_genid(net);
2907                 rt->rt_flags = ort->rt_flags;
2908                 rt->rt_type = ort->rt_type;
2909                 rt->rt_dst = ort->rt_dst;
2910                 rt->rt_src = ort->rt_src;
2911                 rt->rt_gateway = ort->rt_gateway;
2912                 rt->rt_spec_dst = ort->rt_spec_dst;
2913                 rt_transfer_peer(rt, ort);
2914                 rt->fi = ort->fi;
2915                 if (rt->fi)
2916                         atomic_inc(&rt->fi->fib_clntref);
2917
2918                 dst_free(new);
2919         }
2920
2921         dst_release(dst_orig);
2922
2923         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2924 }
2925
2926 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2927                                     struct sock *sk)
2928 {
2929         struct rtable *rt = __ip_route_output_key(net, flp4);
2930
2931         if (IS_ERR(rt))
2932                 return rt;
2933
2934         if (flp4->flowi4_proto)
2935                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2936                                                    flowi4_to_flowi(flp4),
2937                                                    sk, 0);
2938
2939         return rt;
2940 }
2941 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2942
2943 static int rt_fill_info(struct net *net,
2944                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2945                         int nowait, unsigned int flags)
2946 {
2947         struct rtable *rt = skb_rtable(skb);
2948         struct rtmsg *r;
2949         struct nlmsghdr *nlh;
2950         unsigned long expires = 0;
2951         u32 id = 0, ts = 0, tsage = 0, error;
2952
2953         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2954         if (nlh == NULL)
2955                 return -EMSGSIZE;
2956
2957         r = nlmsg_data(nlh);
2958         r->rtm_family    = AF_INET;
2959         r->rtm_dst_len  = 32;
2960         r->rtm_src_len  = 0;
2961         r->rtm_tos      = rt->rt_key_tos;
2962         r->rtm_table    = RT_TABLE_MAIN;
2963         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2964                 goto nla_put_failure;
2965         r->rtm_type     = rt->rt_type;
2966         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2967         r->rtm_protocol = RTPROT_UNSPEC;
2968         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2969         if (rt->rt_flags & RTCF_NOTIFY)
2970                 r->rtm_flags |= RTM_F_NOTIFY;
2971
2972         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2973                 goto nla_put_failure;
2974         if (rt->rt_key_src) {
2975                 r->rtm_src_len = 32;
2976                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2977                         goto nla_put_failure;
2978         }
2979         if (rt->dst.dev &&
2980             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2981                 goto nla_put_failure;
2982 #ifdef CONFIG_IP_ROUTE_CLASSID
2983         if (rt->dst.tclassid &&
2984             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2985                 goto nla_put_failure;
2986 #endif
2987         if (rt_is_input_route(rt)) {
2988                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst))
2989                         goto nla_put_failure;
2990         } else if (rt->rt_src != rt->rt_key_src) {
2991                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2992                         goto nla_put_failure;
2993         }
2994         if (rt->rt_dst != rt->rt_gateway &&
2995             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2996                 goto nla_put_failure;
2997
2998         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2999                 goto nla_put_failure;
3000
3001         if (rt->rt_mark &&
3002             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
3003                 goto nla_put_failure;
3004
3005         error = rt->dst.error;
3006         if (rt_has_peer(rt)) {
3007                 const struct inet_peer *peer = rt_peer_ptr(rt);
3008                 inet_peer_refcheck(peer);
3009                 id = atomic_read(&peer->ip_id_count) & 0xffff;
3010                 if (peer->tcp_ts_stamp) {
3011                         ts = peer->tcp_ts;
3012                         tsage = get_seconds() - peer->tcp_ts_stamp;
3013                 }
3014                 expires = ACCESS_ONCE(peer->pmtu_expires);
3015                 if (expires) {
3016                         if (time_before(jiffies, expires))
3017                                 expires -= jiffies;
3018                         else
3019                                 expires = 0;
3020                 }
3021         }
3022
3023         if (rt_is_input_route(rt)) {
3024 #ifdef CONFIG_IP_MROUTE
3025                 __be32 dst = rt->rt_dst;
3026
3027                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3028                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3029                         int err = ipmr_get_route(net, skb,
3030                                                  rt->rt_src, rt->rt_dst,
3031                                                  r, nowait);
3032                         if (err <= 0) {
3033                                 if (!nowait) {
3034                                         if (err == 0)
3035                                                 return 0;
3036                                         goto nla_put_failure;
3037                                 } else {
3038                                         if (err == -EMSGSIZE)
3039                                                 goto nla_put_failure;
3040                                         error = err;
3041                                 }
3042                         }
3043                 } else
3044 #endif
3045                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3046                                 goto nla_put_failure;
3047         }
3048
3049         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3050                                expires, error) < 0)
3051                 goto nla_put_failure;
3052
3053         return nlmsg_end(skb, nlh);
3054
3055 nla_put_failure:
3056         nlmsg_cancel(skb, nlh);
3057         return -EMSGSIZE;
3058 }
3059
3060 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3061 {
3062         struct net *net = sock_net(in_skb->sk);
3063         struct rtmsg *rtm;
3064         struct nlattr *tb[RTA_MAX+1];
3065         struct rtable *rt = NULL;
3066         __be32 dst = 0;
3067         __be32 src = 0;
3068         u32 iif;
3069         int err;
3070         int mark;
3071         struct sk_buff *skb;
3072
3073         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3074         if (err < 0)
3075                 goto errout;
3076
3077         rtm = nlmsg_data(nlh);
3078
3079         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3080         if (skb == NULL) {
3081                 err = -ENOBUFS;
3082                 goto errout;
3083         }
3084
3085         /* Reserve room for dummy headers, this skb can pass
3086            through good chunk of routing engine.
3087          */
3088         skb_reset_mac_header(skb);
3089         skb_reset_network_header(skb);
3090
3091         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3092         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3093         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3094
3095         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3096         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3097         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3098         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3099
3100         if (iif) {
3101                 struct net_device *dev;
3102
3103                 dev = __dev_get_by_index(net, iif);
3104                 if (dev == NULL) {
3105                         err = -ENODEV;
3106                         goto errout_free;
3107                 }
3108
3109                 skb->protocol   = htons(ETH_P_IP);
3110                 skb->dev        = dev;
3111                 skb->mark       = mark;
3112                 local_bh_disable();
3113                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3114                 local_bh_enable();
3115
3116                 rt = skb_rtable(skb);
3117                 if (err == 0 && rt->dst.error)
3118                         err = -rt->dst.error;
3119         } else {
3120                 struct flowi4 fl4 = {
3121                         .daddr = dst,
3122                         .saddr = src,
3123                         .flowi4_tos = rtm->rtm_tos,
3124                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3125                         .flowi4_mark = mark,
3126                 };
3127                 rt = ip_route_output_key(net, &fl4);
3128
3129                 err = 0;
3130                 if (IS_ERR(rt))
3131                         err = PTR_ERR(rt);
3132         }
3133
3134         if (err)
3135                 goto errout_free;
3136
3137         skb_dst_set(skb, &rt->dst);
3138         if (rtm->rtm_flags & RTM_F_NOTIFY)
3139                 rt->rt_flags |= RTCF_NOTIFY;
3140
3141         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3142                            RTM_NEWROUTE, 0, 0);
3143         if (err <= 0)
3144                 goto errout_free;
3145
3146         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3147 errout:
3148         return err;
3149
3150 errout_free:
3151         kfree_skb(skb);
3152         goto errout;
3153 }
3154
3155 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3156 {
3157         struct rtable *rt;
3158         int h, s_h;
3159         int idx, s_idx;
3160         struct net *net;
3161
3162         net = sock_net(skb->sk);
3163
3164         s_h = cb->args[0];
3165         if (s_h < 0)
3166                 s_h = 0;
3167         s_idx = idx = cb->args[1];
3168         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3169                 if (!rt_hash_table[h].chain)
3170                         continue;
3171                 rcu_read_lock_bh();
3172                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3173                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3174                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3175                                 continue;
3176                         if (rt_is_expired(rt))
3177                                 continue;
3178                         skb_dst_set_noref(skb, &rt->dst);
3179                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3180                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3181                                          1, NLM_F_MULTI) <= 0) {
3182                                 skb_dst_drop(skb);
3183                                 rcu_read_unlock_bh();
3184                                 goto done;
3185                         }
3186                         skb_dst_drop(skb);
3187                 }
3188                 rcu_read_unlock_bh();
3189         }
3190
3191 done:
3192         cb->args[0] = h;
3193         cb->args[1] = idx;
3194         return skb->len;
3195 }
3196
3197 void ip_rt_multicast_event(struct in_device *in_dev)
3198 {
3199         rt_cache_flush(dev_net(in_dev->dev), 0);
3200 }
3201
3202 #ifdef CONFIG_SYSCTL
3203 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3204                                         void __user *buffer,
3205                                         size_t *lenp, loff_t *ppos)
3206 {
3207         if (write) {
3208                 int flush_delay;
3209                 ctl_table ctl;
3210                 struct net *net;
3211
3212                 memcpy(&ctl, __ctl, sizeof(ctl));
3213                 ctl.data = &flush_delay;
3214                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3215
3216                 net = (struct net *)__ctl->extra1;
3217                 rt_cache_flush(net, flush_delay);
3218                 return 0;
3219         }
3220
3221         return -EINVAL;
3222 }
3223
3224 static ctl_table ipv4_route_table[] = {
3225         {
3226                 .procname       = "gc_thresh",
3227                 .data           = &ipv4_dst_ops.gc_thresh,
3228                 .maxlen         = sizeof(int),
3229                 .mode           = 0644,
3230                 .proc_handler   = proc_dointvec,
3231         },
3232         {
3233                 .procname       = "max_size",
3234                 .data           = &ip_rt_max_size,
3235                 .maxlen         = sizeof(int),
3236                 .mode           = 0644,
3237                 .proc_handler   = proc_dointvec,
3238         },
3239         {
3240                 /*  Deprecated. Use gc_min_interval_ms */
3241
3242                 .procname       = "gc_min_interval",
3243                 .data           = &ip_rt_gc_min_interval,
3244                 .maxlen         = sizeof(int),
3245                 .mode           = 0644,
3246                 .proc_handler   = proc_dointvec_jiffies,
3247         },
3248         {
3249                 .procname       = "gc_min_interval_ms",
3250                 .data           = &ip_rt_gc_min_interval,
3251                 .maxlen         = sizeof(int),
3252                 .mode           = 0644,
3253                 .proc_handler   = proc_dointvec_ms_jiffies,
3254         },
3255         {
3256                 .procname       = "gc_timeout",
3257                 .data           = &ip_rt_gc_timeout,
3258                 .maxlen         = sizeof(int),
3259                 .mode           = 0644,
3260                 .proc_handler   = proc_dointvec_jiffies,
3261         },
3262         {
3263                 .procname       = "gc_interval",
3264                 .data           = &ip_rt_gc_interval,
3265                 .maxlen         = sizeof(int),
3266                 .mode           = 0644,
3267                 .proc_handler   = proc_dointvec_jiffies,
3268         },
3269         {
3270                 .procname       = "redirect_load",
3271                 .data           = &ip_rt_redirect_load,
3272                 .maxlen         = sizeof(int),
3273                 .mode           = 0644,
3274                 .proc_handler   = proc_dointvec,
3275         },
3276         {
3277                 .procname       = "redirect_number",
3278                 .data           = &ip_rt_redirect_number,
3279                 .maxlen         = sizeof(int),
3280                 .mode           = 0644,
3281                 .proc_handler   = proc_dointvec,
3282         },
3283         {
3284                 .procname       = "redirect_silence",
3285                 .data           = &ip_rt_redirect_silence,
3286                 .maxlen         = sizeof(int),
3287                 .mode           = 0644,
3288                 .proc_handler   = proc_dointvec,
3289         },
3290         {
3291                 .procname       = "error_cost",
3292                 .data           = &ip_rt_error_cost,
3293                 .maxlen         = sizeof(int),
3294                 .mode           = 0644,
3295                 .proc_handler   = proc_dointvec,
3296         },
3297         {
3298                 .procname       = "error_burst",
3299                 .data           = &ip_rt_error_burst,
3300                 .maxlen         = sizeof(int),
3301                 .mode           = 0644,
3302                 .proc_handler   = proc_dointvec,
3303         },
3304         {
3305                 .procname       = "gc_elasticity",
3306                 .data           = &ip_rt_gc_elasticity,
3307                 .maxlen         = sizeof(int),
3308                 .mode           = 0644,
3309                 .proc_handler   = proc_dointvec,
3310         },
3311         {
3312                 .procname       = "mtu_expires",
3313                 .data           = &ip_rt_mtu_expires,
3314                 .maxlen         = sizeof(int),
3315                 .mode           = 0644,
3316                 .proc_handler   = proc_dointvec_jiffies,
3317         },
3318         {
3319                 .procname       = "min_pmtu",
3320                 .data           = &ip_rt_min_pmtu,
3321                 .maxlen         = sizeof(int),
3322                 .mode           = 0644,
3323                 .proc_handler   = proc_dointvec,
3324         },
3325         {
3326                 .procname       = "min_adv_mss",
3327                 .data           = &ip_rt_min_advmss,
3328                 .maxlen         = sizeof(int),
3329                 .mode           = 0644,
3330                 .proc_handler   = proc_dointvec,
3331         },
3332         { }
3333 };
3334
3335 static struct ctl_table ipv4_route_flush_table[] = {
3336         {
3337                 .procname       = "flush",
3338                 .maxlen         = sizeof(int),
3339                 .mode           = 0200,
3340                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3341         },
3342         { },
3343 };
3344
3345 static __net_init int sysctl_route_net_init(struct net *net)
3346 {
3347         struct ctl_table *tbl;
3348
3349         tbl = ipv4_route_flush_table;
3350         if (!net_eq(net, &init_net)) {
3351                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3352                 if (tbl == NULL)
3353                         goto err_dup;
3354         }
3355         tbl[0].extra1 = net;
3356
3357         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3358         if (net->ipv4.route_hdr == NULL)
3359                 goto err_reg;
3360         return 0;
3361
3362 err_reg:
3363         if (tbl != ipv4_route_flush_table)
3364                 kfree(tbl);
3365 err_dup:
3366         return -ENOMEM;
3367 }
3368
3369 static __net_exit void sysctl_route_net_exit(struct net *net)
3370 {
3371         struct ctl_table *tbl;
3372
3373         tbl = net->ipv4.route_hdr->ctl_table_arg;
3374         unregister_net_sysctl_table(net->ipv4.route_hdr);
3375         BUG_ON(tbl == ipv4_route_flush_table);
3376         kfree(tbl);
3377 }
3378
3379 static __net_initdata struct pernet_operations sysctl_route_ops = {
3380         .init = sysctl_route_net_init,
3381         .exit = sysctl_route_net_exit,
3382 };
3383 #endif
3384
3385 static __net_init int rt_genid_init(struct net *net)
3386 {
3387         get_random_bytes(&net->ipv4.rt_genid,
3388                          sizeof(net->ipv4.rt_genid));
3389         get_random_bytes(&net->ipv4.dev_addr_genid,
3390                          sizeof(net->ipv4.dev_addr_genid));
3391         return 0;
3392 }
3393
3394 static __net_initdata struct pernet_operations rt_genid_ops = {
3395         .init = rt_genid_init,
3396 };
3397
3398 static int __net_init ipv4_inetpeer_init(struct net *net)
3399 {
3400         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3401
3402         if (!bp)
3403                 return -ENOMEM;
3404         inet_peer_base_init(bp);
3405         net->ipv4.peers = bp;
3406         return 0;
3407 }
3408
3409 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3410 {
3411         struct inet_peer_base *bp = net->ipv4.peers;
3412
3413         net->ipv4.peers = NULL;
3414         inetpeer_invalidate_tree(bp);
3415         kfree(bp);
3416 }
3417
3418 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3419         .init   =       ipv4_inetpeer_init,
3420         .exit   =       ipv4_inetpeer_exit,
3421 };
3422
3423 #ifdef CONFIG_IP_ROUTE_CLASSID
3424 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3425 #endif /* CONFIG_IP_ROUTE_CLASSID */
3426
3427 static __initdata unsigned long rhash_entries;
3428 static int __init set_rhash_entries(char *str)
3429 {
3430         ssize_t ret;
3431
3432         if (!str)
3433                 return 0;
3434
3435         ret = kstrtoul(str, 0, &rhash_entries);
3436         if (ret)
3437                 return 0;
3438
3439         return 1;
3440 }
3441 __setup("rhash_entries=", set_rhash_entries);
3442
3443 int __init ip_rt_init(void)
3444 {
3445         int rc = 0;
3446
3447 #ifdef CONFIG_IP_ROUTE_CLASSID
3448         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3449         if (!ip_rt_acct)
3450                 panic("IP: failed to allocate ip_rt_acct\n");
3451 #endif
3452
3453         ipv4_dst_ops.kmem_cachep =
3454                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3455                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3456
3457         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3458
3459         if (dst_entries_init(&ipv4_dst_ops) < 0)
3460                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3461
3462         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3463                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3464
3465         rt_hash_table = (struct rt_hash_bucket *)
3466                 alloc_large_system_hash("IP route cache",
3467                                         sizeof(struct rt_hash_bucket),
3468                                         rhash_entries,
3469                                         (totalram_pages >= 128 * 1024) ?
3470                                         15 : 17,
3471                                         0,
3472                                         &rt_hash_log,
3473                                         &rt_hash_mask,
3474                                         0,
3475                                         rhash_entries ? 0 : 512 * 1024);
3476         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3477         rt_hash_lock_init();
3478
3479         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3480         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3481
3482         devinet_init();
3483         ip_fib_init();
3484
3485         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3486         expires_ljiffies = jiffies;
3487         schedule_delayed_work(&expires_work,
3488                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3489
3490         if (ip_rt_proc_init())
3491                 pr_err("Unable to create route proc files\n");
3492 #ifdef CONFIG_XFRM
3493         xfrm_init();
3494         xfrm4_init(ip_rt_max_size);
3495 #endif
3496         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3497
3498 #ifdef CONFIG_SYSCTL
3499         register_pernet_subsys(&sysctl_route_ops);
3500 #endif
3501         register_pernet_subsys(&rt_genid_ops);
3502         register_pernet_subsys(&ipv4_inetpeer_ops);
3503         return rc;
3504 }
3505
3506 #ifdef CONFIG_SYSCTL
3507 /*
3508  * We really need to sanitize the damn ipv4 init order, then all
3509  * this nonsense will go away.
3510  */
3511 void __init ip_static_sysctl_init(void)
3512 {
3513         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3514 }
3515 #endif