net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/skbuff.h>
  83 #include <linux/inetdevice.h>
  84 #include <linux/igmp.h>
  85 #include <linux/pkt_sched.h>
  86 #include <linux/mroute.h>
  87 #include <linux/netfilter_ipv4.h>
  88 #include <linux/random.h>
  89 #include <linux/rcupdate.h>
  90 #include <linux/times.h>
  91 #include <linux/slab.h>
  92 #include <linux/jhash.h>
  93 #include <net/dst.h>
  94 #include <net/dst_metadata.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/lwtunnel.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #include <linux/kmemleak.h>
 112 #endif
 113 #include <net/secure_seq.h>
 114 #include <net/ip_tunnels.h>
 115 #include <net/l3mdev.h>
 116
 117 #include "fib_lookup.h"
 118
 119 #define RT_FL_TOS(oldflp4) \
 120         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 121
 122 #define RT_GC_TIMEOUT (300*HZ)
 123
 124 static int ip_rt_max_size;
 125 static int ip_rt_redirect_number __read_mostly  = 9;
 126 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 128 static int ip_rt_error_cost __read_mostly       = HZ;
 129 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 131 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 132 static int ip_rt_min_advmss __read_mostly       = 256;
 133
 134 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 135
 136 /*
 137  *      Interface to generic destination cache.
 138  */
 139
 140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 141 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 142 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 143 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 144 static void              ipv4_link_failure(struct sk_buff *skb);
 145 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 146                                            struct sk_buff *skb, u32 mtu);
 147 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 148                                         struct sk_buff *skb);
 149 static void             ipv4_dst_destroy(struct dst_entry *dst);
 150
 151 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 152 {
 153         WARN_ON(1);
 154         return NULL;
 155 }
 156
 157 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 158                                            struct sk_buff *skb,
 159                                            const void *daddr);
 160 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 161
 162 static struct dst_ops ipv4_dst_ops = {
 163         .family =               AF_INET,
 164         .check =                ipv4_dst_check,
 165         .default_advmss =       ipv4_default_advmss,
 166         .mtu =                  ipv4_mtu,
 167         .cow_metrics =          ipv4_cow_metrics,
 168         .destroy =              ipv4_dst_destroy,
 169         .negative_advice =      ipv4_negative_advice,
 170         .link_failure =         ipv4_link_failure,
 171         .update_pmtu =          ip_rt_update_pmtu,
 172         .redirect =             ip_do_redirect,
 173         .local_out =            __ip_local_out,
 174         .neigh_lookup =         ipv4_neigh_lookup,
 175         .confirm_neigh =        ipv4_confirm_neigh,
 176 };
 177
 178 #define ECN_OR_COST(class)      TC_PRIO_##class
 179
 180 const __u8 ip_tos2prio[16] = {
 181         TC_PRIO_BESTEFFORT,
 182         ECN_OR_COST(BESTEFFORT),
 183         TC_PRIO_BESTEFFORT,
 184         ECN_OR_COST(BESTEFFORT),
 185         TC_PRIO_BULK,
 186         ECN_OR_COST(BULK),
 187         TC_PRIO_BULK,
 188         ECN_OR_COST(BULK),
 189         TC_PRIO_INTERACTIVE,
 190         ECN_OR_COST(INTERACTIVE),
 191         TC_PRIO_INTERACTIVE,
 192         ECN_OR_COST(INTERACTIVE),
 193         TC_PRIO_INTERACTIVE_BULK,
 194         ECN_OR_COST(INTERACTIVE_BULK),
 195         TC_PRIO_INTERACTIVE_BULK,
 196         ECN_OR_COST(INTERACTIVE_BULK)
 197 };
 198 EXPORT_SYMBOL(ip_tos2prio);
 199
 200 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 201 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 202
 203 #ifdef CONFIG_PROC_FS
 204 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 205 {
 206         if (*pos)
 207                 return NULL;
 208         return SEQ_START_TOKEN;
 209 }
 210
 211 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 212 {
 213         ++*pos;
 214         return NULL;
 215 }
 216
 217 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 218 {
 219 }
 220
 221 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 222 {
 223         if (v == SEQ_START_TOKEN)
 224                 seq_printf(seq, "%-127s\n",
 225                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 226                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 227                            "HHUptod\tSpecDst");
 228         return 0;
 229 }
 230
 231 static const struct seq_operations rt_cache_seq_ops = {
 232         .start  = rt_cache_seq_start,
 233         .next   = rt_cache_seq_next,
 234         .stop   = rt_cache_seq_stop,
 235         .show   = rt_cache_seq_show,
 236 };
 237
 238 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 239 {
 240         return seq_open(file, &rt_cache_seq_ops);
 241 }
 242
 243 static const struct file_operations rt_cache_seq_fops = {
 244         .open    = rt_cache_seq_open,
 245         .read    = seq_read,
 246         .llseek  = seq_lseek,
 247         .release = seq_release,
 248 };
 249
 250
 251 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 252 {
 253         int cpu;
 254
 255         if (*pos == 0)
 256                 return SEQ_START_TOKEN;
 257
 258         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 259                 if (!cpu_possible(cpu))
 260                         continue;
 261                 *pos = cpu+1;
 262                 return &per_cpu(rt_cache_stat, cpu);
 263         }
 264         return NULL;
 265 }
 266
 267 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 268 {
 269         int cpu;
 270
 271         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 272                 if (!cpu_possible(cpu))
 273                         continue;
 274                 *pos = cpu+1;
 275                 return &per_cpu(rt_cache_stat, cpu);
 276         }
 277         return NULL;
 278
 279 }
 280
 281 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 282 {
 283
 284 }
 285
 286 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 287 {
 288         struct rt_cache_stat *st = v;
 289
 290         if (v == SEQ_START_TOKEN) {
 291                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 292                 return 0;
 293         }
 294
 295         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 296                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 297                    dst_entries_get_slow(&ipv4_dst_ops),
 298                    0, /* st->in_hit */
 299                    st->in_slow_tot,
 300                    st->in_slow_mc,
 301                    st->in_no_route,
 302                    st->in_brd,
 303                    st->in_martian_dst,
 304                    st->in_martian_src,
 305
 306                    0, /* st->out_hit */
 307                    st->out_slow_tot,
 308                    st->out_slow_mc,
 309
 310                    0, /* st->gc_total */
 311                    0, /* st->gc_ignored */
 312                    0, /* st->gc_goal_miss */
 313                    0, /* st->gc_dst_overflow */
 314                    0, /* st->in_hlist_search */
 315                    0  /* st->out_hlist_search */
 316                 );
 317         return 0;
 318 }
 319
 320 static const struct seq_operations rt_cpu_seq_ops = {
 321         .start  = rt_cpu_seq_start,
 322         .next   = rt_cpu_seq_next,
 323         .stop   = rt_cpu_seq_stop,
 324         .show   = rt_cpu_seq_show,
 325 };
 326
 327
 328 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 329 {
 330         return seq_open(file, &rt_cpu_seq_ops);
 331 }
 332
 333 static const struct file_operations rt_cpu_seq_fops = {
 334         .open    = rt_cpu_seq_open,
 335         .read    = seq_read,
 336         .llseek  = seq_lseek,
 337         .release = seq_release,
 338 };
 339
 340 #ifdef CONFIG_IP_ROUTE_CLASSID
 341 static int rt_acct_proc_show(struct seq_file *m, void *v)
 342 {
 343         struct ip_rt_acct *dst, *src;
 344         unsigned int i, j;
 345
 346         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 347         if (!dst)
 348                 return -ENOMEM;
 349
 350         for_each_possible_cpu(i) {
 351                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 352                 for (j = 0; j < 256; j++) {
 353                         dst[j].o_bytes   += src[j].o_bytes;
 354                         dst[j].o_packets += src[j].o_packets;
 355                         dst[j].i_bytes   += src[j].i_bytes;
 356                         dst[j].i_packets += src[j].i_packets;
 357                 }
 358         }
 359
 360         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 361         kfree(dst);
 362         return 0;
 363 }
 364
 365 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 366 {
 367         return single_open(file, rt_acct_proc_show, NULL);
 368 }
 369
 370 static const struct file_operations rt_acct_proc_fops = {
 371         .open           = rt_acct_proc_open,
 372         .read           = seq_read,
 373         .llseek         = seq_lseek,
 374         .release        = single_release,
 375 };
 376 #endif
 377
 378 static int __net_init ip_rt_do_proc_init(struct net *net)
 379 {
 380         struct proc_dir_entry *pde;
 381
 382         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 383                           &rt_cache_seq_fops);
 384         if (!pde)
 385                 goto err1;
 386
 387         pde = proc_create("rt_cache", S_IRUGO,
 388                           net->proc_net_stat, &rt_cpu_seq_fops);
 389         if (!pde)
 390                 goto err2;
 391
 392 #ifdef CONFIG_IP_ROUTE_CLASSID
 393         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 394         if (!pde)
 395                 goto err3;
 396 #endif
 397         return 0;
 398
 399 #ifdef CONFIG_IP_ROUTE_CLASSID
 400 err3:
 401         remove_proc_entry("rt_cache", net->proc_net_stat);
 402 #endif
 403 err2:
 404         remove_proc_entry("rt_cache", net->proc_net);
 405 err1:
 406         return -ENOMEM;
 407 }
 408
 409 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 410 {
 411         remove_proc_entry("rt_cache", net->proc_net_stat);
 412         remove_proc_entry("rt_cache", net->proc_net);
 413 #ifdef CONFIG_IP_ROUTE_CLASSID
 414         remove_proc_entry("rt_acct", net->proc_net);
 415 #endif
 416 }
 417
 418 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 419         .init = ip_rt_do_proc_init,
 420         .exit = ip_rt_do_proc_exit,
 421         .async = true,
 422 };
 423
 424 static int __init ip_rt_proc_init(void)
 425 {
 426         return register_pernet_subsys(&ip_rt_proc_ops);
 427 }
 428
 429 #else
 430 static inline int ip_rt_proc_init(void)
 431 {
 432         return 0;
 433 }
 434 #endif /* CONFIG_PROC_FS */
 435
 436 static inline bool rt_is_expired(const struct rtable *rth)
 437 {
 438         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 439 }
 440
 441 void rt_cache_flush(struct net *net)
 442 {
 443         rt_genid_bump_ipv4(net);
 444 }
 445
 446 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 447                                            struct sk_buff *skb,
 448                                            const void *daddr)
 449 {
 450         struct net_device *dev = dst->dev;
 451         const __be32 *pkey = daddr;
 452         const struct rtable *rt;
 453         struct neighbour *n;
 454
 455         rt = (const struct rtable *) dst;
 456         if (rt->rt_gateway)
 457                 pkey = (const __be32 *) &rt->rt_gateway;
 458         else if (skb)
 459                 pkey = &ip_hdr(skb)->daddr;
 460
 461         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 462         if (n)
 463                 return n;
 464         return neigh_create(&arp_tbl, pkey, dev);
 465 }
 466
 467 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 468 {
 469         struct net_device *dev = dst->dev;
 470         const __be32 *pkey = daddr;
 471         const struct rtable *rt;
 472
 473         rt = (const struct rtable *)dst;
 474         if (rt->rt_gateway)
 475                 pkey = (const __be32 *)&rt->rt_gateway;
 476         else if (!daddr ||
 477                  (rt->rt_flags &
 478                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
 479                 return;
 480
 481         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 482 }
 483
 484 #define IP_IDENTS_SZ 2048u
 485
 486 static atomic_t *ip_idents __read_mostly;
 487 static u32 *ip_tstamps __read_mostly;
 488
 489 /* In order to protect privacy, we add a perturbation to identifiers
 490  * if one generator is seldom used. This makes hard for an attacker
 491  * to infer how many packets were sent between two points in time.
 492  */
 493 u32 ip_idents_reserve(u32 hash, int segs)
 494 {
 495         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 496         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 497         u32 old = READ_ONCE(*p_tstamp);
 498         u32 now = (u32)jiffies;
 499         u32 new, delta = 0;
 500
 501         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 502                 delta = prandom_u32_max(now - old);
 503
 504         /* Do not use atomic_add_return() as it makes UBSAN unhappy */
 505         do {
 506                 old = (u32)atomic_read(p_id);
 507                 new = old + delta + segs;
 508         } while (atomic_cmpxchg(p_id, old, new) != old);
 509
 510         return new - segs;
 511 }
 512 EXPORT_SYMBOL(ip_idents_reserve);
 513
 514 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 515 {
 516         static u32 ip_idents_hashrnd __read_mostly;
 517         u32 hash, id;
 518
 519         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 520
 521         hash = jhash_3words((__force u32)iph->daddr,
 522                             (__force u32)iph->saddr,
 523                             iph->protocol ^ net_hash_mix(net),
 524                             ip_idents_hashrnd);
 525         id = ip_idents_reserve(hash, segs);
 526         iph->id = htons(id);
 527 }
 528 EXPORT_SYMBOL(__ip_select_ident);
 529
 530 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 531                              const struct sock *sk,
 532                              const struct iphdr *iph,
 533                              int oif, u8 tos,
 534                              u8 prot, u32 mark, int flow_flags)
 535 {
 536         if (sk) {
 537                 const struct inet_sock *inet = inet_sk(sk);
 538
 539                 oif = sk->sk_bound_dev_if;
 540                 mark = sk->sk_mark;
 541                 tos = RT_CONN_FLAGS(sk);
 542                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 543         }
 544         flowi4_init_output(fl4, oif, mark, tos,
 545                            RT_SCOPE_UNIVERSE, prot,
 546                            flow_flags,
 547                            iph->daddr, iph->saddr, 0, 0,
 548                            sock_net_uid(net, sk));
 549 }
 550
 551 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 552                                const struct sock *sk)
 553 {
 554         const struct net *net = dev_net(skb->dev);
 555         const struct iphdr *iph = ip_hdr(skb);
 556         int oif = skb->dev->ifindex;
 557         u8 tos = RT_TOS(iph->tos);
 558         u8 prot = iph->protocol;
 559         u32 mark = skb->mark;
 560
 561         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 562 }
 563
 564 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 565 {
 566         const struct inet_sock *inet = inet_sk(sk);
 567         const struct ip_options_rcu *inet_opt;
 568         __be32 daddr = inet->inet_daddr;
 569
 570         rcu_read_lock();
 571         inet_opt = rcu_dereference(inet->inet_opt);
 572         if (inet_opt && inet_opt->opt.srr)
 573                 daddr = inet_opt->opt.faddr;
 574         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 575                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 576                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 577                            inet_sk_flowi_flags(sk),
 578                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 579         rcu_read_unlock();
 580 }
 581
 582 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 583                                  const struct sk_buff *skb)
 584 {
 585         if (skb)
 586                 build_skb_flow_key(fl4, skb, sk);
 587         else
 588                 build_sk_flow_key(fl4, sk);
 589 }
 590
 591 static DEFINE_SPINLOCK(fnhe_lock);
 592
 593 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 594 {
 595         struct rtable *rt;
 596
 597         rt = rcu_dereference(fnhe->fnhe_rth_input);
 598         if (rt) {
 599                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 600                 dst_dev_put(&rt->dst);
 601                 dst_release(&rt->dst);
 602         }
 603         rt = rcu_dereference(fnhe->fnhe_rth_output);
 604         if (rt) {
 605                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 606                 dst_dev_put(&rt->dst);
 607                 dst_release(&rt->dst);
 608         }
 609 }
 610
 611 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 612 {
 613         struct fib_nh_exception *fnhe, *oldest;
 614
 615         oldest = rcu_dereference(hash->chain);
 616         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 617              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 618                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 619                         oldest = fnhe;
 620         }
 621         fnhe_flush_routes(oldest);
 622         return oldest;
 623 }
 624
 625 static inline u32 fnhe_hashfun(__be32 daddr)
 626 {
 627         static u32 fnhe_hashrnd __read_mostly;
 628         u32 hval;
 629
 630         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 631         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 632         return hash_32(hval, FNHE_HASH_SHIFT);
 633 }
 634
 635 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 636 {
 637         rt->rt_pmtu = fnhe->fnhe_pmtu;
 638         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 639         rt->dst.expires = fnhe->fnhe_expires;
 640
 641         if (fnhe->fnhe_gw) {
 642                 rt->rt_flags |= RTCF_REDIRECTED;
 643                 rt->rt_gateway = fnhe->fnhe_gw;
 644                 rt->rt_uses_gateway = 1;
 645         }
 646 }
 647
 648 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 649                                   u32 pmtu, bool lock, unsigned long expires)
 650 {
 651         struct fnhe_hash_bucket *hash;
 652         struct fib_nh_exception *fnhe;
 653         struct rtable *rt;
 654         u32 genid, hval;
 655         unsigned int i;
 656         int depth;
 657
 658         genid = fnhe_genid(dev_net(nh->nh_dev));
 659         hval = fnhe_hashfun(daddr);
 660
 661         spin_lock_bh(&fnhe_lock);
 662
 663         hash = rcu_dereference(nh->nh_exceptions);
 664         if (!hash) {
 665                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 666                 if (!hash)
 667                         goto out_unlock;
 668                 rcu_assign_pointer(nh->nh_exceptions, hash);
 669         }
 670
 671         hash += hval;
 672
 673         depth = 0;
 674         for (fnhe = rcu_dereference(hash->chain); fnhe;
 675              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 676                 if (fnhe->fnhe_daddr == daddr)
 677                         break;
 678                 depth++;
 679         }
 680
 681         if (fnhe) {
 682                 if (fnhe->fnhe_genid != genid)
 683                         fnhe->fnhe_genid = genid;
 684                 if (gw)
 685                         fnhe->fnhe_gw = gw;
 686                 if (pmtu) {
 687                         fnhe->fnhe_pmtu = pmtu;
 688                         fnhe->fnhe_mtu_locked = lock;
 689                 }
 690                 fnhe->fnhe_expires = max(1UL, expires);
 691                 /* Update all cached dsts too */
 692                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 693                 if (rt)
 694                         fill_route_from_fnhe(rt, fnhe);
 695                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 696                 if (rt)
 697                         fill_route_from_fnhe(rt, fnhe);
 698         } else {
 699                 if (depth > FNHE_RECLAIM_DEPTH)
 700                         fnhe = fnhe_oldest(hash);
 701                 else {
 702                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 703                         if (!fnhe)
 704                                 goto out_unlock;
 705
 706                         fnhe->fnhe_next = hash->chain;
 707                         rcu_assign_pointer(hash->chain, fnhe);
 708                 }
 709                 fnhe->fnhe_genid = genid;
 710                 fnhe->fnhe_daddr = daddr;
 711                 fnhe->fnhe_gw = gw;
 712                 fnhe->fnhe_pmtu = pmtu;
 713                 fnhe->fnhe_mtu_locked = lock;
 714                 fnhe->fnhe_expires = expires;
 715
 716                 /* Exception created; mark the cached routes for the nexthop
 717                  * stale, so anyone caching it rechecks if this exception
 718                  * applies to them.
 719                  */
 720                 rt = rcu_dereference(nh->nh_rth_input);
 721                 if (rt)
 722                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 723
 724                 for_each_possible_cpu(i) {
 725                         struct rtable __rcu **prt;
 726                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 727                         rt = rcu_dereference(*prt);
 728                         if (rt)
 729                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 730                 }
 731         }
 732
 733         fnhe->fnhe_stamp = jiffies;
 734
 735 out_unlock:
 736         spin_unlock_bh(&fnhe_lock);
 737 }
 738
 739 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 740                              bool kill_route)
 741 {
 742         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 743         __be32 old_gw = ip_hdr(skb)->saddr;
 744         struct net_device *dev = skb->dev;
 745         struct in_device *in_dev;
 746         struct fib_result res;
 747         struct neighbour *n;
 748         struct net *net;
 749
 750         switch (icmp_hdr(skb)->code & 7) {
 751         case ICMP_REDIR_NET:
 752         case ICMP_REDIR_NETTOS:
 753         case ICMP_REDIR_HOST:
 754         case ICMP_REDIR_HOSTTOS:
 755                 break;
 756
 757         default:
 758                 return;
 759         }
 760
 761         if (rt->rt_gateway != old_gw)
 762                 return;
 763
 764         in_dev = __in_dev_get_rcu(dev);
 765         if (!in_dev)
 766                 return;
 767
 768         net = dev_net(dev);
 769         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 770             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 771             ipv4_is_zeronet(new_gw))
 772                 goto reject_redirect;
 773
 774         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 775                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 776                         goto reject_redirect;
 777                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 778                         goto reject_redirect;
 779         } else {
 780                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 781                         goto reject_redirect;
 782         }
 783
 784         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 785         if (!n)
 786                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 787         if (!IS_ERR(n)) {
 788                 if (!(n->nud_state & NUD_VALID)) {
 789                         neigh_event_send(n, NULL);
 790                 } else {
 791                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 792                                 struct fib_nh *nh = &FIB_RES_NH(res);
 793
 794                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 795                                                 0, false,
 796                                                 jiffies + ip_rt_gc_timeout);
 797                         }
 798                         if (kill_route)
 799                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 800                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 801                 }
 802                 neigh_release(n);
 803         }
 804         return;
 805
 806 reject_redirect:
 807 #ifdef CONFIG_IP_ROUTE_VERBOSE
 808         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 809                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 810                 __be32 daddr = iph->daddr;
 811                 __be32 saddr = iph->saddr;
 812
 813                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 814                                      "  Advised path = %pI4 -> %pI4\n",
 815                                      &old_gw, dev->name, &new_gw,
 816                                      &saddr, &daddr);
 817         }
 818 #endif
 819         ;
 820 }
 821
 822 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 823 {
 824         struct rtable *rt;
 825         struct flowi4 fl4;
 826         const struct iphdr *iph = (const struct iphdr *) skb->data;
 827         struct net *net = dev_net(skb->dev);
 828         int oif = skb->dev->ifindex;
 829         u8 tos = RT_TOS(iph->tos);
 830         u8 prot = iph->protocol;
 831         u32 mark = skb->mark;
 832
 833         rt = (struct rtable *) dst;
 834
 835         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 836         __ip_do_redirect(rt, skb, &fl4, true);
 837 }
 838
 839 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 840 {
 841         struct rtable *rt = (struct rtable *)dst;
 842         struct dst_entry *ret = dst;
 843
 844         if (rt) {
 845                 if (dst->obsolete > 0) {
 846                         ip_rt_put(rt);
 847                         ret = NULL;
 848                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 849                            rt->dst.expires) {
 850                         ip_rt_put(rt);
 851                         ret = NULL;
 852                 }
 853         }
 854         return ret;
 855 }
 856
 857 /*
 858  * Algorithm:
 859  *      1. The first ip_rt_redirect_number redirects are sent
 860  *         with exponential backoff, then we stop sending them at all,
 861  *         assuming that the host ignores our redirects.
 862  *      2. If we did not see packets requiring redirects
 863  *         during ip_rt_redirect_silence, we assume that the host
 864  *         forgot redirected route and start to send redirects again.
 865  *
 866  * This algorithm is much cheaper and more intelligent than dumb load limiting
 867  * in icmp.c.
 868  *
 869  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 870  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 871  */
 872
 873 void ip_rt_send_redirect(struct sk_buff *skb)
 874 {
 875         struct rtable *rt = skb_rtable(skb);
 876         struct in_device *in_dev;
 877         struct inet_peer *peer;
 878         struct net *net;
 879         int log_martians;
 880         int vif;
 881
 882         rcu_read_lock();
 883         in_dev = __in_dev_get_rcu(rt->dst.dev);
 884         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 885                 rcu_read_unlock();
 886                 return;
 887         }
 888         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 889         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 890         rcu_read_unlock();
 891
 892         net = dev_net(rt->dst.dev);
 893         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 894         if (!peer) {
 895                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 896                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 897                 return;
 898         }
 899
 900         /* No redirected packets during ip_rt_redirect_silence;
 901          * reset the algorithm.
 902          */
 903         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 904                 peer->rate_tokens = 0;
 905
 906         /* Too many ignored redirects; do not send anything
 907          * set dst.rate_last to the last seen redirected packet.
 908          */
 909         if (peer->rate_tokens >= ip_rt_redirect_number) {
 910                 peer->rate_last = jiffies;
 911                 goto out_put_peer;
 912         }
 913
 914         /* Check for load limit; set rate_last to the latest sent
 915          * redirect.
 916          */
 917         if (peer->rate_tokens == 0 ||
 918             time_after(jiffies,
 919                        (peer->rate_last +
 920                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 921                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 922
 923                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 924                 peer->rate_last = jiffies;
 925                 ++peer->rate_tokens;
 926 #ifdef CONFIG_IP_ROUTE_VERBOSE
 927                 if (log_martians &&
 928                     peer->rate_tokens == ip_rt_redirect_number)
 929                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 930                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 931                                              &ip_hdr(skb)->daddr, &gw);
 932 #endif
 933         }
 934 out_put_peer:
 935         inet_putpeer(peer);
 936 }
 937
 938 static int ip_error(struct sk_buff *skb)
 939 {
 940         struct rtable *rt = skb_rtable(skb);
 941         struct net_device *dev = skb->dev;
 942         struct in_device *in_dev;
 943         struct inet_peer *peer;
 944         unsigned long now;
 945         struct net *net;
 946         bool send;
 947         int code;
 948
 949         if (netif_is_l3_master(skb->dev)) {
 950                 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 951                 if (!dev)
 952                         goto out;
 953         }
 954
 955         in_dev = __in_dev_get_rcu(dev);
 956
 957         /* IP on this device is disabled. */
 958         if (!in_dev)
 959                 goto out;
 960
 961         net = dev_net(rt->dst.dev);
 962         if (!IN_DEV_FORWARD(in_dev)) {
 963                 switch (rt->dst.error) {
 964                 case EHOSTUNREACH:
 965                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 966                         break;
 967
 968                 case ENETUNREACH:
 969                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 970                         break;
 971                 }
 972                 goto out;
 973         }
 974
 975         switch (rt->dst.error) {
 976         case EINVAL:
 977         default:
 978                 goto out;
 979         case EHOSTUNREACH:
 980                 code = ICMP_HOST_UNREACH;
 981                 break;
 982         case ENETUNREACH:
 983                 code = ICMP_NET_UNREACH;
 984                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 985                 break;
 986         case EACCES:
 987                 code = ICMP_PKT_FILTERED;
 988                 break;
 989         }
 990
 991         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 992                                l3mdev_master_ifindex(skb->dev), 1);
 993
 994         send = true;
 995         if (peer) {
 996                 now = jiffies;
 997                 peer->rate_tokens += now - peer->rate_last;
 998                 if (peer->rate_tokens > ip_rt_error_burst)
 999                         peer->rate_tokens = ip_rt_error_burst;
1000                 peer->rate_last = now;
1001                 if (peer->rate_tokens >= ip_rt_error_cost)
1002                         peer->rate_tokens -= ip_rt_error_cost;
1003                 else
1004                         send = false;
1005                 inet_putpeer(peer);
1006         }
1007         if (send)
1008                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1009
1010 out:    kfree_skb(skb);
1011         return 0;
1012 }
1013
1014 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1015 {
1016         struct dst_entry *dst = &rt->dst;
1017         struct fib_result res;
1018         bool lock = false;
1019
1020         if (ip_mtu_locked(dst))
1021                 return;
1022
1023         if (ipv4_mtu(dst) < mtu)
1024                 return;
1025
1026         if (mtu < ip_rt_min_pmtu) {
1027                 lock = true;
1028                 mtu = ip_rt_min_pmtu;
1029         }
1030
1031         if (rt->rt_pmtu == mtu &&
1032             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1033                 return;
1034
1035         rcu_read_lock();
1036         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1037                 struct fib_nh *nh = &FIB_RES_NH(res);
1038
1039                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1040                                       jiffies + ip_rt_mtu_expires);
1041         }
1042         rcu_read_unlock();
1043 }
1044
1045 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1046                               struct sk_buff *skb, u32 mtu)
1047 {
1048         struct rtable *rt = (struct rtable *) dst;
1049         struct flowi4 fl4;
1050
1051         ip_rt_build_flow_key(&fl4, sk, skb);
1052         __ip_rt_update_pmtu(rt, &fl4, mtu);
1053 }
1054
1055 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1056                       int oif, u32 mark, u8 protocol, int flow_flags)
1057 {
1058         const struct iphdr *iph = (const struct iphdr *) skb->data;
1059         struct flowi4 fl4;
1060         struct rtable *rt;
1061
1062         if (!mark)
1063                 mark = IP4_REPLY_MARK(net, skb->mark);
1064
1065         __build_flow_key(net, &fl4, NULL, iph, oif,
1066                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1067         rt = __ip_route_output_key(net, &fl4);
1068         if (!IS_ERR(rt)) {
1069                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1070                 ip_rt_put(rt);
1071         }
1072 }
1073 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1074
1075 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1076 {
1077         const struct iphdr *iph = (const struct iphdr *) skb->data;
1078         struct flowi4 fl4;
1079         struct rtable *rt;
1080
1081         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1082
1083         if (!fl4.flowi4_mark)
1084                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1085
1086         rt = __ip_route_output_key(sock_net(sk), &fl4);
1087         if (!IS_ERR(rt)) {
1088                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1089                 ip_rt_put(rt);
1090         }
1091 }
1092
1093 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1094 {
1095         const struct iphdr *iph = (const struct iphdr *) skb->data;
1096         struct flowi4 fl4;
1097         struct rtable *rt;
1098         struct dst_entry *odst = NULL;
1099         bool new = false;
1100         struct net *net = sock_net(sk);
1101
1102         bh_lock_sock(sk);
1103
1104         if (!ip_sk_accept_pmtu(sk))
1105                 goto out;
1106
1107         odst = sk_dst_get(sk);
1108
1109         if (sock_owned_by_user(sk) || !odst) {
1110                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1111                 goto out;
1112         }
1113
1114         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1115
1116         rt = (struct rtable *)odst;
1117         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1118                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1119                 if (IS_ERR(rt))
1120                         goto out;
1121
1122                 new = true;
1123         }
1124
1125         __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1126
1127         if (!dst_check(&rt->dst, 0)) {
1128                 if (new)
1129                         dst_release(&rt->dst);
1130
1131                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1132                 if (IS_ERR(rt))
1133                         goto out;
1134
1135                 new = true;
1136         }
1137
1138         if (new)
1139                 sk_dst_set(sk, &rt->dst);
1140
1141 out:
1142         bh_unlock_sock(sk);
1143         dst_release(odst);
1144 }
1145 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1146
1147 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1148                    int oif, u32 mark, u8 protocol, int flow_flags)
1149 {
1150         const struct iphdr *iph = (const struct iphdr *) skb->data;
1151         struct flowi4 fl4;
1152         struct rtable *rt;
1153
1154         __build_flow_key(net, &fl4, NULL, iph, oif,
1155                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1156         rt = __ip_route_output_key(net, &fl4);
1157         if (!IS_ERR(rt)) {
1158                 __ip_do_redirect(rt, skb, &fl4, false);
1159                 ip_rt_put(rt);
1160         }
1161 }
1162 EXPORT_SYMBOL_GPL(ipv4_redirect);
1163
1164 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1165 {
1166         const struct iphdr *iph = (const struct iphdr *) skb->data;
1167         struct flowi4 fl4;
1168         struct rtable *rt;
1169         struct net *net = sock_net(sk);
1170
1171         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1172         rt = __ip_route_output_key(net, &fl4);
1173         if (!IS_ERR(rt)) {
1174                 __ip_do_redirect(rt, skb, &fl4, false);
1175                 ip_rt_put(rt);
1176         }
1177 }
1178 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1179
1180 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1181 {
1182         struct rtable *rt = (struct rtable *) dst;
1183
1184         /* All IPV4 dsts are created with ->obsolete set to the value
1185          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1186          * into this function always.
1187          *
1188          * When a PMTU/redirect information update invalidates a route,
1189          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1190          * DST_OBSOLETE_DEAD by dst_free().
1191          */
1192         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1193                 return NULL;
1194         return dst;
1195 }
1196
1197 static void ipv4_link_failure(struct sk_buff *skb)
1198 {
1199         struct rtable *rt;
1200
1201         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1202
1203         rt = skb_rtable(skb);
1204         if (rt)
1205                 dst_set_expires(&rt->dst, 0);
1206 }
1207
1208 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1209 {
1210         pr_debug("%s: %pI4 -> %pI4, %s\n",
1211                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1212                  skb->dev ? skb->dev->name : "?");
1213         kfree_skb(skb);
1214         WARN_ON(1);
1215         return 0;
1216 }
1217
1218 /*
1219    We do not cache source address of outgoing interface,
1220    because it is used only by IP RR, TS and SRR options,
1221    so that it out of fast path.
1222
1223    BTW remember: "addr" is allowed to be not aligned
1224    in IP options!
1225  */
1226
1227 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1228 {
1229         __be32 src;
1230
1231         if (rt_is_output_route(rt))
1232                 src = ip_hdr(skb)->saddr;
1233         else {
1234                 struct fib_result res;
1235                 struct flowi4 fl4;
1236                 struct iphdr *iph;
1237
1238                 iph = ip_hdr(skb);
1239
1240                 memset(&fl4, 0, sizeof(fl4));
1241                 fl4.daddr = iph->daddr;
1242                 fl4.saddr = iph->saddr;
1243                 fl4.flowi4_tos = RT_TOS(iph->tos);
1244                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1245                 fl4.flowi4_iif = skb->dev->ifindex;
1246                 fl4.flowi4_mark = skb->mark;
1247
1248                 rcu_read_lock();
1249                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1250                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1251                 else
1252                         src = inet_select_addr(rt->dst.dev,
1253                                                rt_nexthop(rt, iph->daddr),
1254                                                RT_SCOPE_UNIVERSE);
1255                 rcu_read_unlock();
1256         }
1257         memcpy(addr, &src, 4);
1258 }
1259
1260 #ifdef CONFIG_IP_ROUTE_CLASSID
1261 static void set_class_tag(struct rtable *rt, u32 tag)
1262 {
1263         if (!(rt->dst.tclassid & 0xFFFF))
1264                 rt->dst.tclassid |= tag & 0xFFFF;
1265         if (!(rt->dst.tclassid & 0xFFFF0000))
1266                 rt->dst.tclassid |= tag & 0xFFFF0000;
1267 }
1268 #endif
1269
1270 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1271 {
1272         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1273         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1274                                     ip_rt_min_advmss);
1275
1276         return min(advmss, IPV4_MAX_PMTU - header_size);
1277 }
1278
1279 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1280 {
1281         const struct rtable *rt = (const struct rtable *) dst;
1282         unsigned int mtu = rt->rt_pmtu;
1283
1284         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1285                 mtu = dst_metric_raw(dst, RTAX_MTU);
1286
1287         if (mtu)
1288                 return mtu;
1289
1290         mtu = READ_ONCE(dst->dev->mtu);
1291
1292         if (unlikely(ip_mtu_locked(dst))) {
1293                 if (rt->rt_uses_gateway && mtu > 576)
1294                         mtu = 576;
1295         }
1296
1297         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1298
1299         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1300 }
1301
1302 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1303 {
1304         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1305         struct fib_nh_exception *fnhe;
1306         u32 hval;
1307
1308         if (!hash)
1309                 return NULL;
1310
1311         hval = fnhe_hashfun(daddr);
1312
1313         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1314              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1315                 if (fnhe->fnhe_daddr == daddr)
1316                         return fnhe;
1317         }
1318         return NULL;
1319 }
1320
1321 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1322                               __be32 daddr, const bool do_cache)
1323 {
1324         bool ret = false;
1325
1326         spin_lock_bh(&fnhe_lock);
1327
1328         if (daddr == fnhe->fnhe_daddr) {
1329                 struct rtable __rcu **porig;
1330                 struct rtable *orig;
1331                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1332
1333                 if (rt_is_input_route(rt))
1334                         porig = &fnhe->fnhe_rth_input;
1335                 else
1336                         porig = &fnhe->fnhe_rth_output;
1337                 orig = rcu_dereference(*porig);
1338
1339                 if (fnhe->fnhe_genid != genid) {
1340                         fnhe->fnhe_genid = genid;
1341                         fnhe->fnhe_gw = 0;
1342                         fnhe->fnhe_pmtu = 0;
1343                         fnhe->fnhe_expires = 0;
1344                         fnhe_flush_routes(fnhe);
1345                         orig = NULL;
1346                 }
1347                 fill_route_from_fnhe(rt, fnhe);
1348                 if (!rt->rt_gateway)
1349                         rt->rt_gateway = daddr;
1350
1351                 if (do_cache) {
1352                         dst_hold(&rt->dst);
1353                         rcu_assign_pointer(*porig, rt);
1354                         if (orig) {
1355                                 dst_dev_put(&orig->dst);
1356                                 dst_release(&orig->dst);
1357                         }
1358                         ret = true;
1359                 }
1360
1361                 fnhe->fnhe_stamp = jiffies;
1362         }
1363         spin_unlock_bh(&fnhe_lock);
1364
1365         return ret;
1366 }
1367
1368 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1369 {
1370         struct rtable *orig, *prev, **p;
1371         bool ret = true;
1372
1373         if (rt_is_input_route(rt)) {
1374                 p = (struct rtable **)&nh->nh_rth_input;
1375         } else {
1376                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1377         }
1378         orig = *p;
1379
1380         /* hold dst before doing cmpxchg() to avoid race condition
1381          * on this dst
1382          */
1383         dst_hold(&rt->dst);
1384         prev = cmpxchg(p, orig, rt);
1385         if (prev == orig) {
1386                 if (orig) {
1387                         dst_dev_put(&orig->dst);
1388                         dst_release(&orig->dst);
1389                 }
1390         } else {
1391                 dst_release(&rt->dst);
1392                 ret = false;
1393         }
1394
1395         return ret;
1396 }
1397
1398 struct uncached_list {
1399         spinlock_t              lock;
1400         struct list_head        head;
1401 };
1402
1403 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1404
1405 void rt_add_uncached_list(struct rtable *rt)
1406 {
1407         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1408
1409         rt->rt_uncached_list = ul;
1410
1411         spin_lock_bh(&ul->lock);
1412         list_add_tail(&rt->rt_uncached, &ul->head);
1413         spin_unlock_bh(&ul->lock);
1414 }
1415
1416 void rt_del_uncached_list(struct rtable *rt)
1417 {
1418         if (!list_empty(&rt->rt_uncached)) {
1419                 struct uncached_list *ul = rt->rt_uncached_list;
1420
1421                 spin_lock_bh(&ul->lock);
1422                 list_del(&rt->rt_uncached);
1423                 spin_unlock_bh(&ul->lock);
1424         }
1425 }
1426
1427 static void ipv4_dst_destroy(struct dst_entry *dst)
1428 {
1429         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1430         struct rtable *rt = (struct rtable *)dst;
1431
1432         if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1433                 kfree(p);
1434
1435         rt_del_uncached_list(rt);
1436 }
1437
1438 void rt_flush_dev(struct net_device *dev)
1439 {
1440         struct net *net = dev_net(dev);
1441         struct rtable *rt;
1442         int cpu;
1443
1444         for_each_possible_cpu(cpu) {
1445                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1446
1447                 spin_lock_bh(&ul->lock);
1448                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1449                         if (rt->dst.dev != dev)
1450                                 continue;
1451                         rt->dst.dev = net->loopback_dev;
1452                         dev_hold(rt->dst.dev);
1453                         dev_put(dev);
1454                 }
1455                 spin_unlock_bh(&ul->lock);
1456         }
1457 }
1458
1459 static bool rt_cache_valid(const struct rtable *rt)
1460 {
1461         return  rt &&
1462                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1463                 !rt_is_expired(rt);
1464 }
1465
1466 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1467                            const struct fib_result *res,
1468                            struct fib_nh_exception *fnhe,
1469                            struct fib_info *fi, u16 type, u32 itag,
1470                            const bool do_cache)
1471 {
1472         bool cached = false;
1473
1474         if (fi) {
1475                 struct fib_nh *nh = &FIB_RES_NH(*res);
1476
1477                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1478                         rt->rt_gateway = nh->nh_gw;
1479                         rt->rt_uses_gateway = 1;
1480                 }
1481                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1482                 if (fi->fib_metrics != &dst_default_metrics) {
1483                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1484                         refcount_inc(&fi->fib_metrics->refcnt);
1485                 }
1486 #ifdef CONFIG_IP_ROUTE_CLASSID
1487                 rt->dst.tclassid = nh->nh_tclassid;
1488 #endif
1489                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1490                 if (unlikely(fnhe))
1491                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1492                 else if (do_cache)
1493                         cached = rt_cache_route(nh, rt);
1494                 if (unlikely(!cached)) {
1495                         /* Routes we intend to cache in nexthop exception or
1496                          * FIB nexthop have the DST_NOCACHE bit clear.
1497                          * However, if we are unsuccessful at storing this
1498                          * route into the cache we really need to set it.
1499                          */
1500                         if (!rt->rt_gateway)
1501                                 rt->rt_gateway = daddr;
1502                         rt_add_uncached_list(rt);
1503                 }
1504         } else
1505                 rt_add_uncached_list(rt);
1506
1507 #ifdef CONFIG_IP_ROUTE_CLASSID
1508 #ifdef CONFIG_IP_MULTIPLE_TABLES
1509         set_class_tag(rt, res->tclassid);
1510 #endif
1511         set_class_tag(rt, itag);
1512 #endif
1513 }
1514
1515 struct rtable *rt_dst_alloc(struct net_device *dev,
1516                             unsigned int flags, u16 type,
1517                             bool nopolicy, bool noxfrm, bool will_cache)
1518 {
1519         struct rtable *rt;
1520
1521         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1522                        (will_cache ? 0 : DST_HOST) |
1523                        (nopolicy ? DST_NOPOLICY : 0) |
1524                        (noxfrm ? DST_NOXFRM : 0));
1525
1526         if (rt) {
1527                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1528                 rt->rt_flags = flags;
1529                 rt->rt_type = type;
1530                 rt->rt_is_input = 0;
1531                 rt->rt_iif = 0;
1532                 rt->rt_pmtu = 0;
1533                 rt->rt_mtu_locked = 0;
1534                 rt->rt_gateway = 0;
1535                 rt->rt_uses_gateway = 0;
1536                 INIT_LIST_HEAD(&rt->rt_uncached);
1537
1538                 rt->dst.output = ip_output;
1539                 if (flags & RTCF_LOCAL)
1540                         rt->dst.input = ip_local_deliver;
1541         }
1542
1543         return rt;
1544 }
1545 EXPORT_SYMBOL(rt_dst_alloc);
1546
1547 /* called in rcu_read_lock() section */
1548 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1549                           u8 tos, struct net_device *dev,
1550                           struct in_device *in_dev, u32 *itag)
1551 {
1552         int err;
1553
1554         /* Primary sanity checks. */
1555         if (!in_dev)
1556                 return -EINVAL;
1557
1558         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1559             skb->protocol != htons(ETH_P_IP))
1560                 return -EINVAL;
1561
1562         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1563                 return -EINVAL;
1564
1565         if (ipv4_is_zeronet(saddr)) {
1566                 if (!ipv4_is_local_multicast(daddr))
1567                         return -EINVAL;
1568         } else {
1569                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1570                                           in_dev, itag);
1571                 if (err < 0)
1572                         return err;
1573         }
1574         return 0;
1575 }
1576
1577 /* called in rcu_read_lock() section */
1578 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1579                              u8 tos, struct net_device *dev, int our)
1580 {
1581         struct in_device *in_dev = __in_dev_get_rcu(dev);
1582         unsigned int flags = RTCF_MULTICAST;
1583         struct rtable *rth;
1584         u32 itag = 0;
1585         int err;
1586
1587         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1588         if (err)
1589                 return err;
1590
1591         if (our)
1592                 flags |= RTCF_LOCAL;
1593
1594         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1595                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1596         if (!rth)
1597                 return -ENOBUFS;
1598
1599 #ifdef CONFIG_IP_ROUTE_CLASSID
1600         rth->dst.tclassid = itag;
1601 #endif
1602         rth->dst.output = ip_rt_bug;
1603         rth->rt_is_input= 1;
1604
1605 #ifdef CONFIG_IP_MROUTE
1606         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1607                 rth->dst.input = ip_mr_input;
1608 #endif
1609         RT_CACHE_STAT_INC(in_slow_mc);
1610
1611         skb_dst_set(skb, &rth->dst);
1612         return 0;
1613 }
1614
1615
1616 static void ip_handle_martian_source(struct net_device *dev,
1617                                      struct in_device *in_dev,
1618                                      struct sk_buff *skb,
1619                                      __be32 daddr,
1620                                      __be32 saddr)
1621 {
1622         RT_CACHE_STAT_INC(in_martian_src);
1623 #ifdef CONFIG_IP_ROUTE_VERBOSE
1624         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1625                 /*
1626                  *      RFC1812 recommendation, if source is martian,
1627                  *      the only hint is MAC header.
1628                  */
1629                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1630                         &daddr, &saddr, dev->name);
1631                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1632                         print_hex_dump(KERN_WARNING, "ll header: ",
1633                                        DUMP_PREFIX_OFFSET, 16, 1,
1634                                        skb_mac_header(skb),
1635                                        dev->hard_header_len, true);
1636                 }
1637         }
1638 #endif
1639 }
1640
1641 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1642 {
1643         struct fnhe_hash_bucket *hash;
1644         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1645         u32 hval = fnhe_hashfun(daddr);
1646
1647         spin_lock_bh(&fnhe_lock);
1648
1649         hash = rcu_dereference_protected(nh->nh_exceptions,
1650                                          lockdep_is_held(&fnhe_lock));
1651         hash += hval;
1652
1653         fnhe_p = &hash->chain;
1654         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1655         while (fnhe) {
1656                 if (fnhe->fnhe_daddr == daddr) {
1657                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1658                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1659                         fnhe_flush_routes(fnhe);
1660                         kfree_rcu(fnhe, rcu);
1661                         break;
1662                 }
1663                 fnhe_p = &fnhe->fnhe_next;
1664                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1665                                                  lockdep_is_held(&fnhe_lock));
1666         }
1667
1668         spin_unlock_bh(&fnhe_lock);
1669 }
1670
1671 /* called in rcu_read_lock() section */
1672 static int __mkroute_input(struct sk_buff *skb,
1673                            const struct fib_result *res,
1674                            struct in_device *in_dev,
1675                            __be32 daddr, __be32 saddr, u32 tos)
1676 {
1677         struct fib_nh_exception *fnhe;
1678         struct rtable *rth;
1679         int err;
1680         struct in_device *out_dev;
1681         bool do_cache;
1682         u32 itag = 0;
1683
1684         /* get a working reference to the output device */
1685         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1686         if (!out_dev) {
1687                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1688                 return -EINVAL;
1689         }
1690
1691         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1692                                   in_dev->dev, in_dev, &itag);
1693         if (err < 0) {
1694                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1695                                          saddr);
1696
1697                 goto cleanup;
1698         }
1699
1700         do_cache = res->fi && !itag;
1701         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1702             skb->protocol == htons(ETH_P_IP) &&
1703             (IN_DEV_SHARED_MEDIA(out_dev) ||
1704              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1705                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1706
1707         if (skb->protocol != htons(ETH_P_IP)) {
1708                 /* Not IP (i.e. ARP). Do not create route, if it is
1709                  * invalid for proxy arp. DNAT routes are always valid.
1710                  *
1711                  * Proxy arp feature have been extended to allow, ARP
1712                  * replies back to the same interface, to support
1713                  * Private VLAN switch technologies. See arp.c.
1714                  */
1715                 if (out_dev == in_dev &&
1716                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1717                         err = -EINVAL;
1718                         goto cleanup;
1719                 }
1720         }
1721
1722         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1723         if (do_cache) {
1724                 if (fnhe) {
1725                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1726                         if (rth && rth->dst.expires &&
1727                             time_after(jiffies, rth->dst.expires)) {
1728                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1729                                 fnhe = NULL;
1730                         } else {
1731                                 goto rt_cache;
1732                         }
1733                 }
1734
1735                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1736
1737 rt_cache:
1738                 if (rt_cache_valid(rth)) {
1739                         skb_dst_set_noref(skb, &rth->dst);
1740                         goto out;
1741                 }
1742         }
1743
1744         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1745                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1746                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1747         if (!rth) {
1748                 err = -ENOBUFS;
1749                 goto cleanup;
1750         }
1751
1752         rth->rt_is_input = 1;
1753         RT_CACHE_STAT_INC(in_slow_tot);
1754
1755         rth->dst.input = ip_forward;
1756
1757         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1758                        do_cache);
1759         lwtunnel_set_redirect(&rth->dst);
1760         skb_dst_set(skb, &rth->dst);
1761 out:
1762         err = 0;
1763  cleanup:
1764         return err;
1765 }
1766
1767 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1768 /* To make ICMP packets follow the right flow, the multipath hash is
1769  * calculated from the inner IP addresses.
1770  */
1771 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1772                                  struct flow_keys *hash_keys)
1773 {
1774         const struct iphdr *outer_iph = ip_hdr(skb);
1775         const struct iphdr *key_iph = outer_iph;
1776         const struct iphdr *inner_iph;
1777         const struct icmphdr *icmph;
1778         struct iphdr _inner_iph;
1779         struct icmphdr _icmph;
1780
1781         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1782                 goto out;
1783
1784         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1785                 goto out;
1786
1787         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1788                                    &_icmph);
1789         if (!icmph)
1790                 goto out;
1791
1792         if (icmph->type != ICMP_DEST_UNREACH &&
1793             icmph->type != ICMP_REDIRECT &&
1794             icmph->type != ICMP_TIME_EXCEEDED &&
1795             icmph->type != ICMP_PARAMETERPROB)
1796                 goto out;
1797
1798         inner_iph = skb_header_pointer(skb,
1799                                        outer_iph->ihl * 4 + sizeof(_icmph),
1800                                        sizeof(_inner_iph), &_inner_iph);
1801         if (!inner_iph)
1802                 goto out;
1803
1804         key_iph = inner_iph;
1805 out:
1806         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1807         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1808 }
1809
1810 /* if skb is set it will be used and fl4 can be NULL */
1811 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1812                        const struct sk_buff *skb, struct flow_keys *flkeys)
1813 {
1814         struct flow_keys hash_keys;
1815         u32 mhash;
1816
1817         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1818         case 0:
1819                 memset(&hash_keys, 0, sizeof(hash_keys));
1820                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1821                 if (skb) {
1822                         ip_multipath_l3_keys(skb, &hash_keys);
1823                 } else {
1824                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1825                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1826                 }
1827                 break;
1828         case 1:
1829                 /* skb is currently provided only when forwarding */
1830                 if (skb) {
1831                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1832                         struct flow_keys keys;
1833
1834                         /* short-circuit if we already have L4 hash present */
1835                         if (skb->l4_hash)
1836                                 return skb_get_hash_raw(skb) >> 1;
1837
1838                         memset(&hash_keys, 0, sizeof(hash_keys));
1839
1840                         if (!flkeys) {
1841                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1842                                 flkeys = &keys;
1843                         }
1844
1845                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1846                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1847                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1848                         hash_keys.ports.src = flkeys->ports.src;
1849                         hash_keys.ports.dst = flkeys->ports.dst;
1850                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1851                 } else {
1852                         memset(&hash_keys, 0, sizeof(hash_keys));
1853                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1854                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1855                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1856                         hash_keys.ports.src = fl4->fl4_sport;
1857                         hash_keys.ports.dst = fl4->fl4_dport;
1858                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1859                 }
1860                 break;
1861         }
1862         mhash = flow_hash_from_keys(&hash_keys);
1863
1864         return mhash >> 1;
1865 }
1866 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1867
1868 static int ip_mkroute_input(struct sk_buff *skb,
1869                             struct fib_result *res,
1870                             struct in_device *in_dev,
1871                             __be32 daddr, __be32 saddr, u32 tos,
1872                             struct flow_keys *hkeys)
1873 {
1874 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1875         if (res->fi && res->fi->fib_nhs > 1) {
1876                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1877
1878                 fib_select_multipath(res, h);
1879         }
1880 #endif
1881
1882         /* create a routing cache entry */
1883         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1884 }
1885
1886 /*
1887  *      NOTE. We drop all the packets that has local source
1888  *      addresses, because every properly looped back packet
1889  *      must have correct destination already attached by output routine.
1890  *
1891  *      Such approach solves two big problems:
1892  *      1. Not simplex devices are handled properly.
1893  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1894  *      called with rcu_read_lock()
1895  */
1896
1897 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1898                                u8 tos, struct net_device *dev,
1899                                struct fib_result *res)
1900 {
1901         struct in_device *in_dev = __in_dev_get_rcu(dev);
1902         struct flow_keys *flkeys = NULL, _flkeys;
1903         struct net    *net = dev_net(dev);
1904         struct ip_tunnel_info *tun_info;
1905         int             err = -EINVAL;
1906         unsigned int    flags = 0;
1907         u32             itag = 0;
1908         struct rtable   *rth;
1909         struct flowi4   fl4;
1910         bool do_cache;
1911
1912         /* IP on this device is disabled. */
1913
1914         if (!in_dev)
1915                 goto out;
1916
1917         /* Check for the most weird martians, which can be not detected
1918            by fib_lookup.
1919          */
1920
1921         tun_info = skb_tunnel_info(skb);
1922         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1923                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1924         else
1925                 fl4.flowi4_tun_key.tun_id = 0;
1926         skb_dst_drop(skb);
1927
1928         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1929                 goto martian_source;
1930
1931         res->fi = NULL;
1932         res->table = NULL;
1933         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1934                 goto brd_input;
1935
1936         /* Accept zero addresses only to limited broadcast;
1937          * I even do not know to fix it or not. Waiting for complains :-)
1938          */
1939         if (ipv4_is_zeronet(saddr))
1940                 goto martian_source;
1941
1942         if (ipv4_is_zeronet(daddr))
1943                 goto martian_destination;
1944
1945         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1946          * and call it once if daddr or/and saddr are loopback addresses
1947          */
1948         if (ipv4_is_loopback(daddr)) {
1949                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1950                         goto martian_destination;
1951         } else if (ipv4_is_loopback(saddr)) {
1952                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1953                         goto martian_source;
1954         }
1955
1956         /*
1957          *      Now we are ready to route packet.
1958          */
1959         fl4.flowi4_oif = 0;
1960         fl4.flowi4_iif = dev->ifindex;
1961         fl4.flowi4_mark = skb->mark;
1962         fl4.flowi4_tos = tos;
1963         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1964         fl4.flowi4_flags = 0;
1965         fl4.daddr = daddr;
1966         fl4.saddr = saddr;
1967         fl4.flowi4_uid = sock_net_uid(net, NULL);
1968
1969         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys))
1970                 flkeys = &_flkeys;
1971
1972         err = fib_lookup(net, &fl4, res, 0);
1973         if (err != 0) {
1974                 if (!IN_DEV_FORWARD(in_dev))
1975                         err = -EHOSTUNREACH;
1976                 goto no_route;
1977         }
1978
1979         if (res->type == RTN_BROADCAST)
1980                 goto brd_input;
1981
1982         if (res->type == RTN_LOCAL) {
1983                 err = fib_validate_source(skb, saddr, daddr, tos,
1984                                           0, dev, in_dev, &itag);
1985                 if (err < 0)
1986                         goto martian_source;
1987                 goto local_input;
1988         }
1989
1990         if (!IN_DEV_FORWARD(in_dev)) {
1991                 err = -EHOSTUNREACH;
1992                 goto no_route;
1993         }
1994         if (res->type != RTN_UNICAST)
1995                 goto martian_destination;
1996
1997         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
1998 out:    return err;
1999
2000 brd_input:
2001         if (skb->protocol != htons(ETH_P_IP))
2002                 goto e_inval;
2003
2004         if (!ipv4_is_zeronet(saddr)) {
2005                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2006                                           in_dev, &itag);
2007                 if (err < 0)
2008                         goto martian_source;
2009         }
2010         flags |= RTCF_BROADCAST;
2011         res->type = RTN_BROADCAST;
2012         RT_CACHE_STAT_INC(in_brd);
2013
2014 local_input:
2015         do_cache = false;
2016         if (res->fi) {
2017                 if (!itag) {
2018                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2019                         if (rt_cache_valid(rth)) {
2020                                 skb_dst_set_noref(skb, &rth->dst);
2021                                 err = 0;
2022                                 goto out;
2023                         }
2024                         do_cache = true;
2025                 }
2026         }
2027
2028         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2029                            flags | RTCF_LOCAL, res->type,
2030                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2031         if (!rth)
2032                 goto e_nobufs;
2033
2034         rth->dst.output= ip_rt_bug;
2035 #ifdef CONFIG_IP_ROUTE_CLASSID
2036         rth->dst.tclassid = itag;
2037 #endif
2038         rth->rt_is_input = 1;
2039
2040         RT_CACHE_STAT_INC(in_slow_tot);
2041         if (res->type == RTN_UNREACHABLE) {
2042                 rth->dst.input= ip_error;
2043                 rth->dst.error= -err;
2044                 rth->rt_flags   &= ~RTCF_LOCAL;
2045         }
2046
2047         if (do_cache) {
2048                 struct fib_nh *nh = &FIB_RES_NH(*res);
2049
2050                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2051                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2052                         WARN_ON(rth->dst.input == lwtunnel_input);
2053                         rth->dst.lwtstate->orig_input = rth->dst.input;
2054                         rth->dst.input = lwtunnel_input;
2055                 }
2056
2057                 if (unlikely(!rt_cache_route(nh, rth)))
2058                         rt_add_uncached_list(rth);
2059         }
2060         skb_dst_set(skb, &rth->dst);
2061         err = 0;
2062         goto out;
2063
2064 no_route:
2065         RT_CACHE_STAT_INC(in_no_route);
2066         res->type = RTN_UNREACHABLE;
2067         res->fi = NULL;
2068         res->table = NULL;
2069         goto local_input;
2070
2071         /*
2072          *      Do not cache martian addresses: they should be logged (RFC1812)
2073          */
2074 martian_destination:
2075         RT_CACHE_STAT_INC(in_martian_dst);
2076 #ifdef CONFIG_IP_ROUTE_VERBOSE
2077         if (IN_DEV_LOG_MARTIANS(in_dev))
2078                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2079                                      &daddr, &saddr, dev->name);
2080 #endif
2081
2082 e_inval:
2083         err = -EINVAL;
2084         goto out;
2085
2086 e_nobufs:
2087         err = -ENOBUFS;
2088         goto out;
2089
2090 martian_source:
2091         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2092         goto out;
2093 }
2094
2095 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2096                          u8 tos, struct net_device *dev)
2097 {
2098         struct fib_result res;
2099         int err;
2100
2101         tos &= IPTOS_RT_MASK;
2102         rcu_read_lock();
2103         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2104         rcu_read_unlock();
2105
2106         return err;
2107 }
2108 EXPORT_SYMBOL(ip_route_input_noref);
2109
2110 /* called with rcu_read_lock held */
2111 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2112                        u8 tos, struct net_device *dev, struct fib_result *res)
2113 {
2114         /* Multicast recognition logic is moved from route cache to here.
2115            The problem was that too many Ethernet cards have broken/missing
2116            hardware multicast filters :-( As result the host on multicasting
2117            network acquires a lot of useless route cache entries, sort of
2118            SDR messages from all the world. Now we try to get rid of them.
2119            Really, provided software IP multicast filter is organized
2120            reasonably (at least, hashed), it does not result in a slowdown
2121            comparing with route cache reject entries.
2122            Note, that multicast routers are not affected, because
2123            route cache entry is created eventually.
2124          */
2125         if (ipv4_is_multicast(daddr)) {
2126                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2127                 int our = 0;
2128                 int err = -EINVAL;
2129
2130                 if (in_dev)
2131                         our = ip_check_mc_rcu(in_dev, daddr, saddr,
2132                                               ip_hdr(skb)->protocol);
2133
2134                 /* check l3 master if no match yet */
2135                 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2136                         struct in_device *l3_in_dev;
2137
2138                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2139                         if (l3_in_dev)
2140                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2141                                                       ip_hdr(skb)->protocol);
2142                 }
2143
2144                 if (our
2145 #ifdef CONFIG_IP_MROUTE
2146                         ||
2147                     (!ipv4_is_local_multicast(daddr) &&
2148                      IN_DEV_MFORWARD(in_dev))
2149 #endif
2150                    ) {
2151                         err = ip_route_input_mc(skb, daddr, saddr,
2152                                                 tos, dev, our);
2153                 }
2154                 return err;
2155         }
2156
2157         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2158 }
2159
2160 /* called with rcu_read_lock() */
2161 static struct rtable *__mkroute_output(const struct fib_result *res,
2162                                        const struct flowi4 *fl4, int orig_oif,
2163                                        struct net_device *dev_out,
2164                                        unsigned int flags)
2165 {
2166         struct fib_info *fi = res->fi;
2167         struct fib_nh_exception *fnhe;
2168         struct in_device *in_dev;
2169         u16 type = res->type;
2170         struct rtable *rth;
2171         bool do_cache;
2172
2173         in_dev = __in_dev_get_rcu(dev_out);
2174         if (!in_dev)
2175                 return ERR_PTR(-EINVAL);
2176
2177         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2178                 if (ipv4_is_loopback(fl4->saddr) &&
2179                     !(dev_out->flags & IFF_LOOPBACK) &&
2180                     !netif_is_l3_master(dev_out))
2181                         return ERR_PTR(-EINVAL);
2182
2183         if (ipv4_is_lbcast(fl4->daddr))
2184                 type = RTN_BROADCAST;
2185         else if (ipv4_is_multicast(fl4->daddr))
2186                 type = RTN_MULTICAST;
2187         else if (ipv4_is_zeronet(fl4->daddr))
2188                 return ERR_PTR(-EINVAL);
2189
2190         if (dev_out->flags & IFF_LOOPBACK)
2191                 flags |= RTCF_LOCAL;
2192
2193         do_cache = true;
2194         if (type == RTN_BROADCAST) {
2195                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2196                 fi = NULL;
2197         } else if (type == RTN_MULTICAST) {
2198                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2199                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2200                                      fl4->flowi4_proto))
2201                         flags &= ~RTCF_LOCAL;
2202                 else
2203                         do_cache = false;
2204                 /* If multicast route do not exist use
2205                  * default one, but do not gateway in this case.
2206                  * Yes, it is hack.
2207                  */
2208                 if (fi && res->prefixlen < 4)
2209                         fi = NULL;
2210         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2211                    (orig_oif != dev_out->ifindex)) {
2212                 /* For local routes that require a particular output interface
2213                  * we do not want to cache the result.  Caching the result
2214                  * causes incorrect behaviour when there are multiple source
2215                  * addresses on the interface, the end result being that if the
2216                  * intended recipient is waiting on that interface for the
2217                  * packet he won't receive it because it will be delivered on
2218                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2219                  * be set to the loopback interface as well.
2220                  */
2221                 fi = NULL;
2222         }
2223
2224         fnhe = NULL;
2225         do_cache &= fi != NULL;
2226         if (do_cache) {
2227                 struct rtable __rcu **prth;
2228                 struct fib_nh *nh = &FIB_RES_NH(*res);
2229
2230                 fnhe = find_exception(nh, fl4->daddr);
2231                 if (fnhe) {
2232                         prth = &fnhe->fnhe_rth_output;
2233                         rth = rcu_dereference(*prth);
2234                         if (rth && rth->dst.expires &&
2235                             time_after(jiffies, rth->dst.expires)) {
2236                                 ip_del_fnhe(nh, fl4->daddr);
2237                                 fnhe = NULL;
2238                         } else {
2239                                 goto rt_cache;
2240                         }
2241                 }
2242
2243                 if (unlikely(fl4->flowi4_flags &
2244                              FLOWI_FLAG_KNOWN_NH &&
2245                              !(nh->nh_gw &&
2246                                nh->nh_scope == RT_SCOPE_LINK))) {
2247                         do_cache = false;
2248                         goto add;
2249                 }
2250                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2251                 rth = rcu_dereference(*prth);
2252
2253 rt_cache:
2254                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2255                         return rth;
2256         }
2257
2258 add:
2259         rth = rt_dst_alloc(dev_out, flags, type,
2260                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2261                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2262                            do_cache);
2263         if (!rth)
2264                 return ERR_PTR(-ENOBUFS);
2265
2266         rth->rt_iif = orig_oif;
2267
2268         RT_CACHE_STAT_INC(out_slow_tot);
2269
2270         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2271                 if (flags & RTCF_LOCAL &&
2272                     !(dev_out->flags & IFF_LOOPBACK)) {
2273                         rth->dst.output = ip_mc_output;
2274                         RT_CACHE_STAT_INC(out_slow_mc);
2275                 }
2276 #ifdef CONFIG_IP_MROUTE
2277                 if (type == RTN_MULTICAST) {
2278                         if (IN_DEV_MFORWARD(in_dev) &&
2279                             !ipv4_is_local_multicast(fl4->daddr)) {
2280                                 rth->dst.input = ip_mr_input;
2281                                 rth->dst.output = ip_mc_output;
2282                         }
2283                 }
2284 #endif
2285         }
2286
2287         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2288         lwtunnel_set_redirect(&rth->dst);
2289
2290         return rth;
2291 }
2292
2293 /*
2294  * Major route resolver routine.
2295  */
2296
2297 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2298                                         const struct sk_buff *skb)
2299 {
2300         __u8 tos = RT_FL_TOS(fl4);
2301         struct fib_result res;
2302         struct rtable *rth;
2303
2304         res.tclassid    = 0;
2305         res.fi          = NULL;
2306         res.table       = NULL;
2307
2308         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2309         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2310         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2311                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2312
2313         rcu_read_lock();
2314         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2315         rcu_read_unlock();
2316
2317         return rth;
2318 }
2319 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2320
2321 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2322                                             struct fib_result *res,
2323                                             const struct sk_buff *skb)
2324 {
2325         struct net_device *dev_out = NULL;
2326         int orig_oif = fl4->flowi4_oif;
2327         unsigned int flags = 0;
2328         struct rtable *rth;
2329         int err = -ENETUNREACH;
2330
2331         if (fl4->saddr) {
2332                 rth = ERR_PTR(-EINVAL);
2333                 if (ipv4_is_multicast(fl4->saddr) ||
2334                     ipv4_is_lbcast(fl4->saddr) ||
2335                     ipv4_is_zeronet(fl4->saddr))
2336                         goto out;
2337
2338                 /* I removed check for oif == dev_out->oif here.
2339                    It was wrong for two reasons:
2340                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2341                       is assigned to multiple interfaces.
2342                    2. Moreover, we are allowed to send packets with saddr
2343                       of another iface. --ANK
2344                  */
2345
2346                 if (fl4->flowi4_oif == 0 &&
2347                     (ipv4_is_multicast(fl4->daddr) ||
2348                      ipv4_is_lbcast(fl4->daddr))) {
2349                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2350                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2351                         if (!dev_out)
2352                                 goto out;
2353
2354                         /* Special hack: user can direct multicasts
2355                            and limited broadcast via necessary interface
2356                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2357                            This hack is not just for fun, it allows
2358                            vic,vat and friends to work.
2359                            They bind socket to loopback, set ttl to zero
2360                            and expect that it will work.
2361                            From the viewpoint of routing cache they are broken,
2362                            because we are not allowed to build multicast path
2363                            with loopback source addr (look, routing cache
2364                            cannot know, that ttl is zero, so that packet
2365                            will not leave this host and route is valid).
2366                            Luckily, this hack is good workaround.
2367                          */
2368
2369                         fl4->flowi4_oif = dev_out->ifindex;
2370                         goto make_route;
2371                 }
2372
2373                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2374                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2375                         if (!__ip_dev_find(net, fl4->saddr, false))
2376                                 goto out;
2377                 }
2378         }
2379
2380
2381         if (fl4->flowi4_oif) {
2382                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2383                 rth = ERR_PTR(-ENODEV);
2384                 if (!dev_out)
2385                         goto out;
2386
2387                 /* RACE: Check return value of inet_select_addr instead. */
2388                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2389                         rth = ERR_PTR(-ENETUNREACH);
2390                         goto out;
2391                 }
2392                 if (ipv4_is_local_multicast(fl4->daddr) ||
2393                     ipv4_is_lbcast(fl4->daddr) ||
2394                     fl4->flowi4_proto == IPPROTO_IGMP) {
2395                         if (!fl4->saddr)
2396                                 fl4->saddr = inet_select_addr(dev_out, 0,
2397                                                               RT_SCOPE_LINK);
2398                         goto make_route;
2399                 }
2400                 if (!fl4->saddr) {
2401                         if (ipv4_is_multicast(fl4->daddr))
2402                                 fl4->saddr = inet_select_addr(dev_out, 0,
2403                                                               fl4->flowi4_scope);
2404                         else if (!fl4->daddr)
2405                                 fl4->saddr = inet_select_addr(dev_out, 0,
2406                                                               RT_SCOPE_HOST);
2407                 }
2408         }
2409
2410         if (!fl4->daddr) {
2411                 fl4->daddr = fl4->saddr;
2412                 if (!fl4->daddr)
2413                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2414                 dev_out = net->loopback_dev;
2415                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2416                 res->type = RTN_LOCAL;
2417                 flags |= RTCF_LOCAL;
2418                 goto make_route;
2419         }
2420
2421         err = fib_lookup(net, fl4, res, 0);
2422         if (err) {
2423                 res->fi = NULL;
2424                 res->table = NULL;
2425                 if (fl4->flowi4_oif &&
2426                     (ipv4_is_multicast(fl4->daddr) ||
2427                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2428                         /* Apparently, routing tables are wrong. Assume,
2429                            that the destination is on link.
2430
2431                            WHY? DW.
2432                            Because we are allowed to send to iface
2433                            even if it has NO routes and NO assigned
2434                            addresses. When oif is specified, routing
2435                            tables are looked up with only one purpose:
2436                            to catch if destination is gatewayed, rather than
2437                            direct. Moreover, if MSG_DONTROUTE is set,
2438                            we send packet, ignoring both routing tables
2439                            and ifaddr state. --ANK
2440
2441
2442                            We could make it even if oif is unknown,
2443                            likely IPv6, but we do not.
2444                          */
2445
2446                         if (fl4->saddr == 0)
2447                                 fl4->saddr = inet_select_addr(dev_out, 0,
2448                                                               RT_SCOPE_LINK);
2449                         res->type = RTN_UNICAST;
2450                         goto make_route;
2451                 }
2452                 rth = ERR_PTR(err);
2453                 goto out;
2454         }
2455
2456         if (res->type == RTN_LOCAL) {
2457                 if (!fl4->saddr) {
2458                         if (res->fi->fib_prefsrc)
2459                                 fl4->saddr = res->fi->fib_prefsrc;
2460                         else
2461                                 fl4->saddr = fl4->daddr;
2462                 }
2463
2464                 /* L3 master device is the loopback for that domain */
2465                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2466                         net->loopback_dev;
2467
2468                 /* make sure orig_oif points to fib result device even
2469                  * though packet rx/tx happens over loopback or l3mdev
2470                  */
2471                 orig_oif = FIB_RES_OIF(*res);
2472
2473                 fl4->flowi4_oif = dev_out->ifindex;
2474                 flags |= RTCF_LOCAL;
2475                 goto make_route;
2476         }
2477
2478         fib_select_path(net, res, fl4, skb);
2479
2480         dev_out = FIB_RES_DEV(*res);
2481         fl4->flowi4_oif = dev_out->ifindex;
2482
2483
2484 make_route:
2485         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2486
2487 out:
2488         return rth;
2489 }
2490
2491 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2492 {
2493         return NULL;
2494 }
2495
2496 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2497 {
2498         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2499
2500         return mtu ? : dst->dev->mtu;
2501 }
2502
2503 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2504                                           struct sk_buff *skb, u32 mtu)
2505 {
2506 }
2507
2508 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2509                                        struct sk_buff *skb)
2510 {
2511 }
2512
2513 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2514                                           unsigned long old)
2515 {
2516         return NULL;
2517 }
2518
2519 static struct dst_ops ipv4_dst_blackhole_ops = {
2520         .family                 =       AF_INET,
2521         .check                  =       ipv4_blackhole_dst_check,
2522         .mtu                    =       ipv4_blackhole_mtu,
2523         .default_advmss         =       ipv4_default_advmss,
2524         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2525         .redirect               =       ipv4_rt_blackhole_redirect,
2526         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2527         .neigh_lookup           =       ipv4_neigh_lookup,
2528 };
2529
2530 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2531 {
2532         struct rtable *ort = (struct rtable *) dst_orig;
2533         struct rtable *rt;
2534
2535         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2536         if (rt) {
2537                 struct dst_entry *new = &rt->dst;
2538
2539                 new->__use = 1;
2540                 new->input = dst_discard;
2541                 new->output = dst_discard_out;
2542
2543                 new->dev = net->loopback_dev;
2544                 if (new->dev)
2545                         dev_hold(new->dev);
2546
2547                 rt->rt_is_input = ort->rt_is_input;
2548                 rt->rt_iif = ort->rt_iif;
2549                 rt->rt_pmtu = ort->rt_pmtu;
2550                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2551
2552                 rt->rt_genid = rt_genid_ipv4(net);
2553                 rt->rt_flags = ort->rt_flags;
2554                 rt->rt_type = ort->rt_type;
2555                 rt->rt_gateway = ort->rt_gateway;
2556                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2557
2558                 INIT_LIST_HEAD(&rt->rt_uncached);
2559         }
2560
2561         dst_release(dst_orig);
2562
2563         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2564 }
2565
2566 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2567                                     const struct sock *sk)
2568 {
2569         struct rtable *rt = __ip_route_output_key(net, flp4);
2570
2571         if (IS_ERR(rt))
2572                 return rt;
2573
2574         if (flp4->flowi4_proto)
2575                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2576                                                         flowi4_to_flowi(flp4),
2577                                                         sk, 0);
2578
2579         return rt;
2580 }
2581 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2582
2583 /* called with rcu_read_lock held */
2584 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2585                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2586                         u32 seq)
2587 {
2588         struct rtable *rt = skb_rtable(skb);
2589         struct rtmsg *r;
2590         struct nlmsghdr *nlh;
2591         unsigned long expires = 0;
2592         u32 error;
2593         u32 metrics[RTAX_MAX];
2594
2595         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2596         if (!nlh)
2597                 return -EMSGSIZE;
2598
2599         r = nlmsg_data(nlh);
2600         r->rtm_family    = AF_INET;
2601         r->rtm_dst_len  = 32;
2602         r->rtm_src_len  = 0;
2603         r->rtm_tos      = fl4->flowi4_tos;
2604         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2605         if (nla_put_u32(skb, RTA_TABLE, table_id))
2606                 goto nla_put_failure;
2607         r->rtm_type     = rt->rt_type;
2608         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2609         r->rtm_protocol = RTPROT_UNSPEC;
2610         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2611         if (rt->rt_flags & RTCF_NOTIFY)
2612                 r->rtm_flags |= RTM_F_NOTIFY;
2613         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2614                 r->rtm_flags |= RTCF_DOREDIRECT;
2615
2616         if (nla_put_in_addr(skb, RTA_DST, dst))
2617                 goto nla_put_failure;
2618         if (src) {
2619                 r->rtm_src_len = 32;
2620                 if (nla_put_in_addr(skb, RTA_SRC, src))
2621                         goto nla_put_failure;
2622         }
2623         if (rt->dst.dev &&
2624             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2625                 goto nla_put_failure;
2626 #ifdef CONFIG_IP_ROUTE_CLASSID
2627         if (rt->dst.tclassid &&
2628             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2629                 goto nla_put_failure;
2630 #endif
2631         if (!rt_is_input_route(rt) &&
2632             fl4->saddr != src) {
2633                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2634                         goto nla_put_failure;
2635         }
2636         if (rt->rt_uses_gateway &&
2637             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2638                 goto nla_put_failure;
2639
2640         expires = rt->dst.expires;
2641         if (expires) {
2642                 unsigned long now = jiffies;
2643
2644                 if (time_before(now, expires))
2645                         expires -= now;
2646                 else
2647                         expires = 0;
2648         }
2649
2650         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2651         if (rt->rt_pmtu && expires)
2652                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2653         if (rt->rt_mtu_locked && expires)
2654                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2655         if (rtnetlink_put_metrics(skb, metrics) < 0)
2656                 goto nla_put_failure;
2657
2658         if (fl4->flowi4_mark &&
2659             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2660                 goto nla_put_failure;
2661
2662         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2663             nla_put_u32(skb, RTA_UID,
2664                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2665                 goto nla_put_failure;
2666
2667         error = rt->dst.error;
2668
2669         if (rt_is_input_route(rt)) {
2670 #ifdef CONFIG_IP_MROUTE
2671                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2672                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2673                         int err = ipmr_get_route(net, skb,
2674                                                  fl4->saddr, fl4->daddr,
2675                                                  r, portid);
2676
2677                         if (err <= 0) {
2678                                 if (err == 0)
2679                                         return 0;
2680                                 goto nla_put_failure;
2681                         }
2682                 } else
2683 #endif
2684                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2685                                 goto nla_put_failure;
2686         }
2687
2688         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2689                 goto nla_put_failure;
2690
2691         nlmsg_end(skb, nlh);
2692         return 0;
2693
2694 nla_put_failure:
2695         nlmsg_cancel(skb, nlh);
2696         return -EMSGSIZE;
2697 }
2698
2699 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2700                              struct netlink_ext_ack *extack)
2701 {
2702         struct net *net = sock_net(in_skb->sk);
2703         struct rtmsg *rtm;
2704         struct nlattr *tb[RTA_MAX+1];
2705         struct fib_result res = {};
2706         struct rtable *rt = NULL;
2707         struct flowi4 fl4;
2708         __be32 dst = 0;
2709         __be32 src = 0;
2710         u32 iif;
2711         int err;
2712         int mark;
2713         struct sk_buff *skb;
2714         u32 table_id = RT_TABLE_MAIN;
2715         kuid_t uid;
2716
2717         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2718                           extack);
2719         if (err < 0)
2720                 goto errout;
2721
2722         rtm = nlmsg_data(nlh);
2723
2724         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2725         if (!skb) {
2726                 err = -ENOBUFS;
2727                 goto errout;
2728         }
2729
2730         /* Reserve room for dummy headers, this skb can pass
2731            through good chunk of routing engine.
2732          */
2733         skb_reset_mac_header(skb);
2734         skb_reset_network_header(skb);
2735
2736         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2737         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2738         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2739         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2740         if (tb[RTA_UID])
2741                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2742         else
2743                 uid = (iif ? INVALID_UID : current_uid());
2744
2745         /* Bugfix: need to give ip_route_input enough of an IP header to
2746          * not gag.
2747          */
2748         ip_hdr(skb)->protocol = IPPROTO_UDP;
2749         ip_hdr(skb)->saddr = src;
2750         ip_hdr(skb)->daddr = dst;
2751
2752         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2753
2754         memset(&fl4, 0, sizeof(fl4));
2755         fl4.daddr = dst;
2756         fl4.saddr = src;
2757         fl4.flowi4_tos = rtm->rtm_tos;
2758         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2759         fl4.flowi4_mark = mark;
2760         fl4.flowi4_uid = uid;
2761
2762         rcu_read_lock();
2763
2764         if (iif) {
2765                 struct net_device *dev;
2766
2767                 dev = dev_get_by_index_rcu(net, iif);
2768                 if (!dev) {
2769                         err = -ENODEV;
2770                         goto errout_free;
2771                 }
2772
2773                 skb->protocol   = htons(ETH_P_IP);
2774                 skb->dev        = dev;
2775                 skb->mark       = mark;
2776                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2777                                          dev, &res);
2778
2779                 rt = skb_rtable(skb);
2780                 if (err == 0 && rt->dst.error)
2781                         err = -rt->dst.error;
2782         } else {
2783                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2784                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2785                 err = 0;
2786                 if (IS_ERR(rt))
2787                         err = PTR_ERR(rt);
2788                 else
2789                         skb_dst_set(skb, &rt->dst);
2790         }
2791
2792         if (err)
2793                 goto errout_free;
2794
2795         if (rtm->rtm_flags & RTM_F_NOTIFY)
2796                 rt->rt_flags |= RTCF_NOTIFY;
2797
2798         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2799                 table_id = res.table ? res.table->tb_id : 0;
2800
2801         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2802                 if (!res.fi) {
2803                         err = fib_props[res.type].error;
2804                         if (!err)
2805                                 err = -EHOSTUNREACH;
2806                         goto errout_free;
2807                 }
2808                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2809                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2810                                     rt->rt_type, res.prefix, res.prefixlen,
2811                                     fl4.flowi4_tos, res.fi, 0);
2812         } else {
2813                 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2814                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2815         }
2816         if (err < 0)
2817                 goto errout_free;
2818
2819         rcu_read_unlock();
2820
2821         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2822 errout:
2823         return err;
2824
2825 errout_free:
2826         rcu_read_unlock();
2827         kfree_skb(skb);
2828         goto errout;
2829 }
2830
2831 void ip_rt_multicast_event(struct in_device *in_dev)
2832 {
2833         rt_cache_flush(dev_net(in_dev->dev));
2834 }
2835
2836 #ifdef CONFIG_SYSCTL
2837 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2838 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2839 static int ip_rt_gc_elasticity __read_mostly    = 8;
2840 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
2841
2842 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2843                                         void __user *buffer,
2844                                         size_t *lenp, loff_t *ppos)
2845 {
2846         struct net *net = (struct net *)__ctl->extra1;
2847
2848         if (write) {
2849                 rt_cache_flush(net);
2850                 fnhe_genid_bump(net);
2851                 return 0;
2852         }
2853
2854         return -EINVAL;
2855 }
2856
2857 static struct ctl_table ipv4_route_table[] = {
2858         {
2859                 .procname       = "gc_thresh",
2860                 .data           = &ipv4_dst_ops.gc_thresh,
2861                 .maxlen         = sizeof(int),
2862                 .mode           = 0644,
2863                 .proc_handler   = proc_dointvec,
2864         },
2865         {
2866                 .procname       = "max_size",
2867                 .data           = &ip_rt_max_size,
2868                 .maxlen         = sizeof(int),
2869                 .mode           = 0644,
2870                 .proc_handler   = proc_dointvec,
2871         },
2872         {
2873                 /*  Deprecated. Use gc_min_interval_ms */
2874
2875                 .procname       = "gc_min_interval",
2876                 .data           = &ip_rt_gc_min_interval,
2877                 .maxlen         = sizeof(int),
2878                 .mode           = 0644,
2879                 .proc_handler   = proc_dointvec_jiffies,
2880         },
2881         {
2882                 .procname       = "gc_min_interval_ms",
2883                 .data           = &ip_rt_gc_min_interval,
2884                 .maxlen         = sizeof(int),
2885                 .mode           = 0644,
2886                 .proc_handler   = proc_dointvec_ms_jiffies,
2887         },
2888         {
2889                 .procname       = "gc_timeout",
2890                 .data           = &ip_rt_gc_timeout,
2891                 .maxlen         = sizeof(int),
2892                 .mode           = 0644,
2893                 .proc_handler   = proc_dointvec_jiffies,
2894         },
2895         {
2896                 .procname       = "gc_interval",
2897                 .data           = &ip_rt_gc_interval,
2898                 .maxlen         = sizeof(int),
2899                 .mode           = 0644,
2900                 .proc_handler   = proc_dointvec_jiffies,
2901         },
2902         {
2903                 .procname       = "redirect_load",
2904                 .data           = &ip_rt_redirect_load,
2905                 .maxlen         = sizeof(int),
2906                 .mode           = 0644,
2907                 .proc_handler   = proc_dointvec,
2908         },
2909         {
2910                 .procname       = "redirect_number",
2911                 .data           = &ip_rt_redirect_number,
2912                 .maxlen         = sizeof(int),
2913                 .mode           = 0644,
2914                 .proc_handler   = proc_dointvec,
2915         },
2916         {
2917                 .procname       = "redirect_silence",
2918                 .data           = &ip_rt_redirect_silence,
2919                 .maxlen         = sizeof(int),
2920                 .mode           = 0644,
2921                 .proc_handler   = proc_dointvec,
2922         },
2923         {
2924                 .procname       = "error_cost",
2925                 .data           = &ip_rt_error_cost,
2926                 .maxlen         = sizeof(int),
2927                 .mode           = 0644,
2928                 .proc_handler   = proc_dointvec,
2929         },
2930         {
2931                 .procname       = "error_burst",
2932                 .data           = &ip_rt_error_burst,
2933                 .maxlen         = sizeof(int),
2934                 .mode           = 0644,
2935                 .proc_handler   = proc_dointvec,
2936         },
2937         {
2938                 .procname       = "gc_elasticity",
2939                 .data           = &ip_rt_gc_elasticity,
2940                 .maxlen         = sizeof(int),
2941                 .mode           = 0644,
2942                 .proc_handler   = proc_dointvec,
2943         },
2944         {
2945                 .procname       = "mtu_expires",
2946                 .data           = &ip_rt_mtu_expires,
2947                 .maxlen         = sizeof(int),
2948                 .mode           = 0644,
2949                 .proc_handler   = proc_dointvec_jiffies,
2950         },
2951         {
2952                 .procname       = "min_pmtu",
2953                 .data           = &ip_rt_min_pmtu,
2954                 .maxlen         = sizeof(int),
2955                 .mode           = 0644,
2956                 .proc_handler   = proc_dointvec_minmax,
2957                 .extra1         = &ip_min_valid_pmtu,
2958         },
2959         {
2960                 .procname       = "min_adv_mss",
2961                 .data           = &ip_rt_min_advmss,
2962                 .maxlen         = sizeof(int),
2963                 .mode           = 0644,
2964                 .proc_handler   = proc_dointvec,
2965         },
2966         { }
2967 };
2968
2969 static struct ctl_table ipv4_route_flush_table[] = {
2970         {
2971                 .procname       = "flush",
2972                 .maxlen         = sizeof(int),
2973                 .mode           = 0200,
2974                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2975         },
2976         { },
2977 };
2978
2979 static __net_init int sysctl_route_net_init(struct net *net)
2980 {
2981         struct ctl_table *tbl;
2982
2983         tbl = ipv4_route_flush_table;
2984         if (!net_eq(net, &init_net)) {
2985                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2986                 if (!tbl)
2987                         goto err_dup;
2988
2989                 /* Don't export sysctls to unprivileged users */
2990                 if (net->user_ns != &init_user_ns)
2991                         tbl[0].procname = NULL;
2992         }
2993         tbl[0].extra1 = net;
2994
2995         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2996         if (!net->ipv4.route_hdr)
2997                 goto err_reg;
2998         return 0;
2999
3000 err_reg:
3001         if (tbl != ipv4_route_flush_table)
3002                 kfree(tbl);
3003 err_dup:
3004         return -ENOMEM;
3005 }
3006
3007 static __net_exit void sysctl_route_net_exit(struct net *net)
3008 {
3009         struct ctl_table *tbl;
3010
3011         tbl = net->ipv4.route_hdr->ctl_table_arg;
3012         unregister_net_sysctl_table(net->ipv4.route_hdr);
3013         BUG_ON(tbl == ipv4_route_flush_table);
3014         kfree(tbl);
3015 }
3016
3017 static __net_initdata struct pernet_operations sysctl_route_ops = {
3018         .init = sysctl_route_net_init,
3019         .exit = sysctl_route_net_exit,
3020         .async = true,
3021 };
3022 #endif
3023
3024 static __net_init int rt_genid_init(struct net *net)
3025 {
3026         atomic_set(&net->ipv4.rt_genid, 0);
3027         atomic_set(&net->fnhe_genid, 0);
3028         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3029         return 0;
3030 }
3031
3032 static __net_initdata struct pernet_operations rt_genid_ops = {
3033         .init = rt_genid_init,
3034         .async = true,
3035 };
3036
3037 static int __net_init ipv4_inetpeer_init(struct net *net)
3038 {
3039         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3040
3041         if (!bp)
3042                 return -ENOMEM;
3043         inet_peer_base_init(bp);
3044         net->ipv4.peers = bp;
3045         return 0;
3046 }
3047
3048 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3049 {
3050         struct inet_peer_base *bp = net->ipv4.peers;
3051
3052         net->ipv4.peers = NULL;
3053         inetpeer_invalidate_tree(bp);
3054         kfree(bp);
3055 }
3056
3057 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3058         .init   =       ipv4_inetpeer_init,
3059         .exit   =       ipv4_inetpeer_exit,
3060         .async  =       true,
3061 };
3062
3063 #ifdef CONFIG_IP_ROUTE_CLASSID
3064 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3065 #endif /* CONFIG_IP_ROUTE_CLASSID */
3066
3067 int __init ip_rt_init(void)
3068 {
3069         int cpu;
3070
3071         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3072         if (!ip_idents)
3073                 panic("IP: failed to allocate ip_idents\n");
3074
3075         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3076
3077         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3078         if (!ip_tstamps)
3079                 panic("IP: failed to allocate ip_tstamps\n");
3080
3081         for_each_possible_cpu(cpu) {
3082                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3083
3084                 INIT_LIST_HEAD(&ul->head);
3085                 spin_lock_init(&ul->lock);
3086         }
3087 #ifdef CONFIG_IP_ROUTE_CLASSID
3088         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3089         if (!ip_rt_acct)
3090                 panic("IP: failed to allocate ip_rt_acct\n");
3091 #endif
3092
3093         ipv4_dst_ops.kmem_cachep =
3094                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3095                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3096
3097         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3098
3099         if (dst_entries_init(&ipv4_dst_ops) < 0)
3100                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3101
3102         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3103                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3104
3105         ipv4_dst_ops.gc_thresh = ~0;
3106         ip_rt_max_size = INT_MAX;
3107
3108         devinet_init();
3109         ip_fib_init();
3110
3111         if (ip_rt_proc_init())
3112                 pr_err("Unable to create route proc files\n");
3113 #ifdef CONFIG_XFRM
3114         xfrm_init();
3115         xfrm4_init();
3116 #endif
3117         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3118                       RTNL_FLAG_DOIT_UNLOCKED);
3119
3120 #ifdef CONFIG_SYSCTL
3121         register_pernet_subsys(&sysctl_route_ops);
3122 #endif
3123         register_pernet_subsys(&rt_genid_ops);
3124         register_pernet_subsys(&ipv4_inetpeer_ops);
3125         return 0;
3126 }
3127
3128 #ifdef CONFIG_SYSCTL
3129 /*
3130  * We really need to sanitize the damn ipv4 init order, then all
3131  * this nonsense will go away.
3132  */
3133 void __init ip_static_sysctl_init(void)
3134 {
3135         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3136 }
3137 #endif