net/ipv4/route.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              ROUTE - implementation of the IP router.
   8  *
   9  * Authors:     Ross Biro
  10  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  11  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  12  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  13  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Verify area fixes.
  17  *              Alan Cox        :       cli() protects routing changes
  18  *              Rui Oliveira    :       ICMP routing table updates
  19  *              (rco@di.uminho.pt)      Routing table insertion and update
  20  *              Linus Torvalds  :       Rewrote bits to be sensible
  21  *              Alan Cox        :       Added BSD route gw semantics
  22  *              Alan Cox        :       Super /proc >4K
  23  *              Alan Cox        :       MTU in route table
  24  *              Alan Cox        :       MSS actually. Also added the window
  25  *                                      clamper.
  26  *              Sam Lantinga    :       Fixed route matching in rt_del()
  27  *              Alan Cox        :       Routing cache support.
  28  *              Alan Cox        :       Removed compatibility cruft.
  29  *              Alan Cox        :       RTF_REJECT support.
  30  *              Alan Cox        :       TCP irtt support.
  31  *              Jonathan Naylor :       Added Metric support.
  32  *      Miquel van Smoorenburg  :       BSD API fixes.
  33  *      Miquel van Smoorenburg  :       Metrics.
  34  *              Alan Cox        :       Use __u32 properly
  35  *              Alan Cox        :       Aligned routing errors more closely with BSD
  36  *                                      our system is still very different.
  37  *              Alan Cox        :       Faster /proc handling
  38  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  39  *                                      routing caches and better behaviour.
  40  *
  41  *              Olaf Erb        :       irtt wasn't being copied right.
  42  *              Bjorn Ekwall    :       Kerneld route support.
  43  *              Alan Cox        :       Multicast fixed (I hope)
  44  *              Pavel Krauz     :       Limited broadcast fixed
  45  *              Mike McLagan    :       Routing by source
  46  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  47  *                                      route.c and rewritten from scratch.
  48  *              Andi Kleen      :       Load-limit warning messages.
  49  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  50  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  51  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  52  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  53  *              Marc Boucher    :       routing by fwmark
  54  *      Robert Olsson           :       Added rt_cache statistics
  55  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  56  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  57  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  58  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  59  */
  60
  61 #define pr_fmt(fmt) "IPv4: " fmt
  62
  63 #include <linux/module.h>
  64 #include <linux/uaccess.h>
  65 #include <linux/bitops.h>
  66 #include <linux/types.h>
  67 #include <linux/kernel.h>
  68 #include <linux/mm.h>
  69 #include <linux/string.h>
  70 #include <linux/socket.h>
  71 #include <linux/sockios.h>
  72 #include <linux/errno.h>
  73 #include <linux/in.h>
  74 #include <linux/inet.h>
  75 #include <linux/netdevice.h>
  76 #include <linux/proc_fs.h>
  77 #include <linux/init.h>
  78 #include <linux/skbuff.h>
  79 #include <linux/inetdevice.h>
  80 #include <linux/igmp.h>
  81 #include <linux/pkt_sched.h>
  82 #include <linux/mroute.h>
  83 #include <linux/netfilter_ipv4.h>
  84 #include <linux/random.h>
  85 #include <linux/rcupdate.h>
  86 #include <linux/times.h>
  87 #include <linux/slab.h>
  88 #include <linux/jhash.h>
  89 #include <net/dst.h>
  90 #include <net/dst_metadata.h>
  91 #include <net/net_namespace.h>
  92 #include <net/protocol.h>
  93 #include <net/ip.h>
  94 #include <net/route.h>
  95 #include <net/inetpeer.h>
  96 #include <net/sock.h>
  97 #include <net/ip_fib.h>
  98 #include <net/nexthop.h>
  99 #include <net/arp.h>
 100 #include <net/tcp.h>
 101 #include <net/icmp.h>
 102 #include <net/xfrm.h>
 103 #include <net/lwtunnel.h>
 104 #include <net/netevent.h>
 105 #include <net/rtnetlink.h>
 106 #ifdef CONFIG_SYSCTL
 107 #include <linux/sysctl.h>
 108 #endif
 109 #include <net/secure_seq.h>
 110 #include <net/ip_tunnels.h>
 111 #include <net/l3mdev.h>
 112
 113 #include "fib_lookup.h"
 114
 115 #define RT_FL_TOS(oldflp4) \
 116         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 117
 118 #define RT_GC_TIMEOUT (300*HZ)
 119
 120 static int ip_rt_max_size;
 121 static int ip_rt_redirect_number __read_mostly  = 9;
 122 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 123 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 124 static int ip_rt_error_cost __read_mostly       = HZ;
 125 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 126 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 127 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 128 static int ip_rt_min_advmss __read_mostly       = 256;
 129
 130 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 131
 132 /*
 133  *      Interface to generic destination cache.
 134  */
 135
 136 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 137 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 138 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 139 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 140 static void              ipv4_link_failure(struct sk_buff *skb);
 141 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 142                                            struct sk_buff *skb, u32 mtu,
 143                                            bool confirm_neigh);
 144 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 145                                         struct sk_buff *skb);
 146 static void             ipv4_dst_destroy(struct dst_entry *dst);
 147
 148 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 149 {
 150         WARN_ON(1);
 151         return NULL;
 152 }
 153
 154 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 155                                            struct sk_buff *skb,
 156                                            const void *daddr);
 157 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 158
 159 static struct dst_ops ipv4_dst_ops = {
 160         .family =               AF_INET,
 161         .check =                ipv4_dst_check,
 162         .default_advmss =       ipv4_default_advmss,
 163         .mtu =                  ipv4_mtu,
 164         .cow_metrics =          ipv4_cow_metrics,
 165         .destroy =              ipv4_dst_destroy,
 166         .negative_advice =      ipv4_negative_advice,
 167         .link_failure =         ipv4_link_failure,
 168         .update_pmtu =          ip_rt_update_pmtu,
 169         .redirect =             ip_do_redirect,
 170         .local_out =            __ip_local_out,
 171         .neigh_lookup =         ipv4_neigh_lookup,
 172         .confirm_neigh =        ipv4_confirm_neigh,
 173 };
 174
 175 #define ECN_OR_COST(class)      TC_PRIO_##class
 176
 177 const __u8 ip_tos2prio[16] = {
 178         TC_PRIO_BESTEFFORT,
 179         ECN_OR_COST(BESTEFFORT),
 180         TC_PRIO_BESTEFFORT,
 181         ECN_OR_COST(BESTEFFORT),
 182         TC_PRIO_BULK,
 183         ECN_OR_COST(BULK),
 184         TC_PRIO_BULK,
 185         ECN_OR_COST(BULK),
 186         TC_PRIO_INTERACTIVE,
 187         ECN_OR_COST(INTERACTIVE),
 188         TC_PRIO_INTERACTIVE,
 189         ECN_OR_COST(INTERACTIVE),
 190         TC_PRIO_INTERACTIVE_BULK,
 191         ECN_OR_COST(INTERACTIVE_BULK),
 192         TC_PRIO_INTERACTIVE_BULK,
 193         ECN_OR_COST(INTERACTIVE_BULK)
 194 };
 195 EXPORT_SYMBOL(ip_tos2prio);
 196
 197 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 198 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 199
 200 #ifdef CONFIG_PROC_FS
 201 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 202 {
 203         if (*pos)
 204                 return NULL;
 205         return SEQ_START_TOKEN;
 206 }
 207
 208 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 209 {
 210         ++*pos;
 211         return NULL;
 212 }
 213
 214 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 215 {
 216 }
 217
 218 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 219 {
 220         if (v == SEQ_START_TOKEN)
 221                 seq_printf(seq, "%-127s\n",
 222                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 223                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 224                            "HHUptod\tSpecDst");
 225         return 0;
 226 }
 227
 228 static const struct seq_operations rt_cache_seq_ops = {
 229         .start  = rt_cache_seq_start,
 230         .next   = rt_cache_seq_next,
 231         .stop   = rt_cache_seq_stop,
 232         .show   = rt_cache_seq_show,
 233 };
 234
 235 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 236 {
 237         return seq_open(file, &rt_cache_seq_ops);
 238 }
 239
 240 static const struct file_operations rt_cache_seq_fops = {
 241         .open    = rt_cache_seq_open,
 242         .read    = seq_read,
 243         .llseek  = seq_lseek,
 244         .release = seq_release,
 245 };
 246
 247
 248 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 249 {
 250         int cpu;
 251
 252         if (*pos == 0)
 253                 return SEQ_START_TOKEN;
 254
 255         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 256                 if (!cpu_possible(cpu))
 257                         continue;
 258                 *pos = cpu+1;
 259                 return &per_cpu(rt_cache_stat, cpu);
 260         }
 261         return NULL;
 262 }
 263
 264 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 265 {
 266         int cpu;
 267
 268         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 269                 if (!cpu_possible(cpu))
 270                         continue;
 271                 *pos = cpu+1;
 272                 return &per_cpu(rt_cache_stat, cpu);
 273         }
 274         return NULL;
 275
 276 }
 277
 278 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 279 {
 280
 281 }
 282
 283 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 284 {
 285         struct rt_cache_stat *st = v;
 286
 287         if (v == SEQ_START_TOKEN) {
 288                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 289                 return 0;
 290         }
 291
 292         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 293                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 294                    dst_entries_get_slow(&ipv4_dst_ops),
 295                    0, /* st->in_hit */
 296                    st->in_slow_tot,
 297                    st->in_slow_mc,
 298                    st->in_no_route,
 299                    st->in_brd,
 300                    st->in_martian_dst,
 301                    st->in_martian_src,
 302
 303                    0, /* st->out_hit */
 304                    st->out_slow_tot,
 305                    st->out_slow_mc,
 306
 307                    0, /* st->gc_total */
 308                    0, /* st->gc_ignored */
 309                    0, /* st->gc_goal_miss */
 310                    0, /* st->gc_dst_overflow */
 311                    0, /* st->in_hlist_search */
 312                    0  /* st->out_hlist_search */
 313                 );
 314         return 0;
 315 }
 316
 317 static const struct seq_operations rt_cpu_seq_ops = {
 318         .start  = rt_cpu_seq_start,
 319         .next   = rt_cpu_seq_next,
 320         .stop   = rt_cpu_seq_stop,
 321         .show   = rt_cpu_seq_show,
 322 };
 323
 324
 325 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 326 {
 327         return seq_open(file, &rt_cpu_seq_ops);
 328 }
 329
 330 static const struct file_operations rt_cpu_seq_fops = {
 331         .open    = rt_cpu_seq_open,
 332         .read    = seq_read,
 333         .llseek  = seq_lseek,
 334         .release = seq_release,
 335 };
 336
 337 #ifdef CONFIG_IP_ROUTE_CLASSID
 338 static int rt_acct_proc_show(struct seq_file *m, void *v)
 339 {
 340         struct ip_rt_acct *dst, *src;
 341         unsigned int i, j;
 342
 343         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 344         if (!dst)
 345                 return -ENOMEM;
 346
 347         for_each_possible_cpu(i) {
 348                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 349                 for (j = 0; j < 256; j++) {
 350                         dst[j].o_bytes   += src[j].o_bytes;
 351                         dst[j].o_packets += src[j].o_packets;
 352                         dst[j].i_bytes   += src[j].i_bytes;
 353                         dst[j].i_packets += src[j].i_packets;
 354                 }
 355         }
 356
 357         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 358         kfree(dst);
 359         return 0;
 360 }
 361 #endif
 362
 363 static int __net_init ip_rt_do_proc_init(struct net *net)
 364 {
 365         struct proc_dir_entry *pde;
 366
 367         pde = proc_create("rt_cache", 0444, net->proc_net,
 368                           &rt_cache_seq_fops);
 369         if (!pde)
 370                 goto err1;
 371
 372         pde = proc_create("rt_cache", 0444,
 373                           net->proc_net_stat, &rt_cpu_seq_fops);
 374         if (!pde)
 375                 goto err2;
 376
 377 #ifdef CONFIG_IP_ROUTE_CLASSID
 378         pde = proc_create_single("rt_acct", 0, net->proc_net,
 379                         rt_acct_proc_show);
 380         if (!pde)
 381                 goto err3;
 382 #endif
 383         return 0;
 384
 385 #ifdef CONFIG_IP_ROUTE_CLASSID
 386 err3:
 387         remove_proc_entry("rt_cache", net->proc_net_stat);
 388 #endif
 389 err2:
 390         remove_proc_entry("rt_cache", net->proc_net);
 391 err1:
 392         return -ENOMEM;
 393 }
 394
 395 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 396 {
 397         remove_proc_entry("rt_cache", net->proc_net_stat);
 398         remove_proc_entry("rt_cache", net->proc_net);
 399 #ifdef CONFIG_IP_ROUTE_CLASSID
 400         remove_proc_entry("rt_acct", net->proc_net);
 401 #endif
 402 }
 403
 404 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 405         .init = ip_rt_do_proc_init,
 406         .exit = ip_rt_do_proc_exit,
 407 };
 408
 409 static int __init ip_rt_proc_init(void)
 410 {
 411         return register_pernet_subsys(&ip_rt_proc_ops);
 412 }
 413
 414 #else
 415 static inline int ip_rt_proc_init(void)
 416 {
 417         return 0;
 418 }
 419 #endif /* CONFIG_PROC_FS */
 420
 421 static inline bool rt_is_expired(const struct rtable *rth)
 422 {
 423         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 424 }
 425
 426 void rt_cache_flush(struct net *net)
 427 {
 428         rt_genid_bump_ipv4(net);
 429 }
 430
 431 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 432                                            struct sk_buff *skb,
 433                                            const void *daddr)
 434 {
 435         const struct rtable *rt = container_of(dst, struct rtable, dst);
 436         struct net_device *dev = dst->dev;
 437         struct neighbour *n;
 438
 439         rcu_read_lock_bh();
 440
 441         if (likely(rt->rt_gw_family == AF_INET)) {
 442                 n = ip_neigh_gw4(dev, rt->rt_gw4);
 443         } else if (rt->rt_gw_family == AF_INET6) {
 444                 n = ip_neigh_gw6(dev, &rt->rt_gw6);
 445         } else {
 446                 __be32 pkey;
 447
 448                 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
 449                 n = ip_neigh_gw4(dev, pkey);
 450         }
 451
 452         if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
 453                 n = NULL;
 454
 455         rcu_read_unlock_bh();
 456
 457         return n;
 458 }
 459
 460 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 461 {
 462         const struct rtable *rt = container_of(dst, struct rtable, dst);
 463         struct net_device *dev = dst->dev;
 464         const __be32 *pkey = daddr;
 465
 466         if (rt->rt_gw_family == AF_INET) {
 467                 pkey = (const __be32 *)&rt->rt_gw4;
 468         } else if (rt->rt_gw_family == AF_INET6) {
 469                 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
 470         } else if (!daddr ||
 471                  (rt->rt_flags &
 472                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
 473                 return;
 474         }
 475         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 476 }
 477
 478 #define IP_IDENTS_SZ 2048u
 479
 480 static atomic_t *ip_idents __read_mostly;
 481 static u32 *ip_tstamps __read_mostly;
 482
 483 /* In order to protect privacy, we add a perturbation to identifiers
 484  * if one generator is seldom used. This makes hard for an attacker
 485  * to infer how many packets were sent between two points in time.
 486  */
 487 u32 ip_idents_reserve(u32 hash, int segs)
 488 {
 489         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 490         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 491         u32 old = READ_ONCE(*p_tstamp);
 492         u32 now = (u32)jiffies;
 493         u32 new, delta = 0;
 494
 495         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 496                 delta = prandom_u32_max(now - old);
 497
 498         /* Do not use atomic_add_return() as it makes UBSAN unhappy */
 499         do {
 500                 old = (u32)atomic_read(p_id);
 501                 new = old + delta + segs;
 502         } while (atomic_cmpxchg(p_id, old, new) != old);
 503
 504         return new - segs;
 505 }
 506 EXPORT_SYMBOL(ip_idents_reserve);
 507
 508 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 509 {
 510         u32 hash, id;
 511
 512         /* Note the following code is not safe, but this is okay. */
 513         if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 514                 get_random_bytes(&net->ipv4.ip_id_key,
 515                                  sizeof(net->ipv4.ip_id_key));
 516
 517         hash = siphash_3u32((__force u32)iph->daddr,
 518                             (__force u32)iph->saddr,
 519                             iph->protocol,
 520                             &net->ipv4.ip_id_key);
 521         id = ip_idents_reserve(hash, segs);
 522         iph->id = htons(id);
 523 }
 524 EXPORT_SYMBOL(__ip_select_ident);
 525
 526 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 527                              const struct sock *sk,
 528                              const struct iphdr *iph,
 529                              int oif, u8 tos,
 530                              u8 prot, u32 mark, int flow_flags)
 531 {
 532         if (sk) {
 533                 const struct inet_sock *inet = inet_sk(sk);
 534
 535                 oif = sk->sk_bound_dev_if;
 536                 mark = sk->sk_mark;
 537                 tos = RT_CONN_FLAGS(sk);
 538                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 539         }
 540         flowi4_init_output(fl4, oif, mark, tos,
 541                            RT_SCOPE_UNIVERSE, prot,
 542                            flow_flags,
 543                            iph->daddr, iph->saddr, 0, 0,
 544                            sock_net_uid(net, sk));
 545 }
 546
 547 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 548                                const struct sock *sk)
 549 {
 550         const struct net *net = dev_net(skb->dev);
 551         const struct iphdr *iph = ip_hdr(skb);
 552         int oif = skb->dev->ifindex;
 553         u8 tos = RT_TOS(iph->tos);
 554         u8 prot = iph->protocol;
 555         u32 mark = skb->mark;
 556
 557         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 558 }
 559
 560 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 561 {
 562         const struct inet_sock *inet = inet_sk(sk);
 563         const struct ip_options_rcu *inet_opt;
 564         __be32 daddr = inet->inet_daddr;
 565
 566         rcu_read_lock();
 567         inet_opt = rcu_dereference(inet->inet_opt);
 568         if (inet_opt && inet_opt->opt.srr)
 569                 daddr = inet_opt->opt.faddr;
 570         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 571                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 572                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 573                            inet_sk_flowi_flags(sk),
 574                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 575         rcu_read_unlock();
 576 }
 577
 578 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 579                                  const struct sk_buff *skb)
 580 {
 581         if (skb)
 582                 build_skb_flow_key(fl4, skb, sk);
 583         else
 584                 build_sk_flow_key(fl4, sk);
 585 }
 586
 587 static DEFINE_SPINLOCK(fnhe_lock);
 588
 589 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 590 {
 591         struct rtable *rt;
 592
 593         rt = rcu_dereference(fnhe->fnhe_rth_input);
 594         if (rt) {
 595                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 596                 dst_dev_put(&rt->dst);
 597                 dst_release(&rt->dst);
 598         }
 599         rt = rcu_dereference(fnhe->fnhe_rth_output);
 600         if (rt) {
 601                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 602                 dst_dev_put(&rt->dst);
 603                 dst_release(&rt->dst);
 604         }
 605 }
 606
 607 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 608 {
 609         struct fib_nh_exception *fnhe, *oldest;
 610
 611         oldest = rcu_dereference(hash->chain);
 612         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 613              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 614                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 615                         oldest = fnhe;
 616         }
 617         fnhe_flush_routes(oldest);
 618         return oldest;
 619 }
 620
 621 static inline u32 fnhe_hashfun(__be32 daddr)
 622 {
 623         static u32 fnhe_hashrnd __read_mostly;
 624         u32 hval;
 625
 626         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 627         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 628         return hash_32(hval, FNHE_HASH_SHIFT);
 629 }
 630
 631 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 632 {
 633         rt->rt_pmtu = fnhe->fnhe_pmtu;
 634         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 635         rt->dst.expires = fnhe->fnhe_expires;
 636
 637         if (fnhe->fnhe_gw) {
 638                 rt->rt_flags |= RTCF_REDIRECTED;
 639                 rt->rt_uses_gateway = 1;
 640                 rt->rt_gw_family = AF_INET;
 641                 rt->rt_gw4 = fnhe->fnhe_gw;
 642         }
 643 }
 644
 645 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
 646                                   __be32 gw, u32 pmtu, bool lock,
 647                                   unsigned long expires)
 648 {
 649         struct fnhe_hash_bucket *hash;
 650         struct fib_nh_exception *fnhe;
 651         struct rtable *rt;
 652         u32 genid, hval;
 653         unsigned int i;
 654         int depth;
 655
 656         genid = fnhe_genid(dev_net(nhc->nhc_dev));
 657         hval = fnhe_hashfun(daddr);
 658
 659         spin_lock_bh(&fnhe_lock);
 660
 661         hash = rcu_dereference(nhc->nhc_exceptions);
 662         if (!hash) {
 663                 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
 664                 if (!hash)
 665                         goto out_unlock;
 666                 rcu_assign_pointer(nhc->nhc_exceptions, hash);
 667         }
 668
 669         hash += hval;
 670
 671         depth = 0;
 672         for (fnhe = rcu_dereference(hash->chain); fnhe;
 673              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 674                 if (fnhe->fnhe_daddr == daddr)
 675                         break;
 676                 depth++;
 677         }
 678
 679         if (fnhe) {
 680                 if (fnhe->fnhe_genid != genid)
 681                         fnhe->fnhe_genid = genid;
 682                 if (gw)
 683                         fnhe->fnhe_gw = gw;
 684                 if (pmtu) {
 685                         fnhe->fnhe_pmtu = pmtu;
 686                         fnhe->fnhe_mtu_locked = lock;
 687                 }
 688                 fnhe->fnhe_expires = max(1UL, expires);
 689                 /* Update all cached dsts too */
 690                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 691                 if (rt)
 692                         fill_route_from_fnhe(rt, fnhe);
 693                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 694                 if (rt)
 695                         fill_route_from_fnhe(rt, fnhe);
 696         } else {
 697                 if (depth > FNHE_RECLAIM_DEPTH)
 698                         fnhe = fnhe_oldest(hash);
 699                 else {
 700                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 701                         if (!fnhe)
 702                                 goto out_unlock;
 703
 704                         fnhe->fnhe_next = hash->chain;
 705                         rcu_assign_pointer(hash->chain, fnhe);
 706                 }
 707                 fnhe->fnhe_genid = genid;
 708                 fnhe->fnhe_daddr = daddr;
 709                 fnhe->fnhe_gw = gw;
 710                 fnhe->fnhe_pmtu = pmtu;
 711                 fnhe->fnhe_mtu_locked = lock;
 712                 fnhe->fnhe_expires = max(1UL, expires);
 713
 714                 /* Exception created; mark the cached routes for the nexthop
 715                  * stale, so anyone caching it rechecks if this exception
 716                  * applies to them.
 717                  */
 718                 rt = rcu_dereference(nhc->nhc_rth_input);
 719                 if (rt)
 720                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 721
 722                 for_each_possible_cpu(i) {
 723                         struct rtable __rcu **prt;
 724                         prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
 725                         rt = rcu_dereference(*prt);
 726                         if (rt)
 727                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 728                 }
 729         }
 730
 731         fnhe->fnhe_stamp = jiffies;
 732
 733 out_unlock:
 734         spin_unlock_bh(&fnhe_lock);
 735 }
 736
 737 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 738                              bool kill_route)
 739 {
 740         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 741         __be32 old_gw = ip_hdr(skb)->saddr;
 742         struct net_device *dev = skb->dev;
 743         struct in_device *in_dev;
 744         struct fib_result res;
 745         struct neighbour *n;
 746         struct net *net;
 747
 748         switch (icmp_hdr(skb)->code & 7) {
 749         case ICMP_REDIR_NET:
 750         case ICMP_REDIR_NETTOS:
 751         case ICMP_REDIR_HOST:
 752         case ICMP_REDIR_HOSTTOS:
 753                 break;
 754
 755         default:
 756                 return;
 757         }
 758
 759         if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
 760                 return;
 761
 762         in_dev = __in_dev_get_rcu(dev);
 763         if (!in_dev)
 764                 return;
 765
 766         net = dev_net(dev);
 767         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 768             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 769             ipv4_is_zeronet(new_gw))
 770                 goto reject_redirect;
 771
 772         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 773                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 774                         goto reject_redirect;
 775                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 776                         goto reject_redirect;
 777         } else {
 778                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 779                         goto reject_redirect;
 780         }
 781
 782         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 783         if (!n)
 784                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 785         if (!IS_ERR(n)) {
 786                 if (!(n->nud_state & NUD_VALID)) {
 787                         neigh_event_send(n, NULL);
 788                 } else {
 789                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 790                                 struct fib_nh_common *nhc = FIB_RES_NHC(res);
 791
 792                                 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
 793                                                 0, false,
 794                                                 jiffies + ip_rt_gc_timeout);
 795                         }
 796                         if (kill_route)
 797                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 798                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 799                 }
 800                 neigh_release(n);
 801         }
 802         return;
 803
 804 reject_redirect:
 805 #ifdef CONFIG_IP_ROUTE_VERBOSE
 806         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 807                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 808                 __be32 daddr = iph->daddr;
 809                 __be32 saddr = iph->saddr;
 810
 811                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 812                                      "  Advised path = %pI4 -> %pI4\n",
 813                                      &old_gw, dev->name, &new_gw,
 814                                      &saddr, &daddr);
 815         }
 816 #endif
 817         ;
 818 }
 819
 820 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 821 {
 822         struct rtable *rt;
 823         struct flowi4 fl4;
 824         const struct iphdr *iph = (const struct iphdr *) skb->data;
 825         struct net *net = dev_net(skb->dev);
 826         int oif = skb->dev->ifindex;
 827         u8 tos = RT_TOS(iph->tos);
 828         u8 prot = iph->protocol;
 829         u32 mark = skb->mark;
 830
 831         rt = (struct rtable *) dst;
 832
 833         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 834         __ip_do_redirect(rt, skb, &fl4, true);
 835 }
 836
 837 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 838 {
 839         struct rtable *rt = (struct rtable *)dst;
 840         struct dst_entry *ret = dst;
 841
 842         if (rt) {
 843                 if (dst->obsolete > 0) {
 844                         ip_rt_put(rt);
 845                         ret = NULL;
 846                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 847                            rt->dst.expires) {
 848                         ip_rt_put(rt);
 849                         ret = NULL;
 850                 }
 851         }
 852         return ret;
 853 }
 854
 855 /*
 856  * Algorithm:
 857  *      1. The first ip_rt_redirect_number redirects are sent
 858  *         with exponential backoff, then we stop sending them at all,
 859  *         assuming that the host ignores our redirects.
 860  *      2. If we did not see packets requiring redirects
 861  *         during ip_rt_redirect_silence, we assume that the host
 862  *         forgot redirected route and start to send redirects again.
 863  *
 864  * This algorithm is much cheaper and more intelligent than dumb load limiting
 865  * in icmp.c.
 866  *
 867  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 868  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 869  */
 870
 871 void ip_rt_send_redirect(struct sk_buff *skb)
 872 {
 873         struct rtable *rt = skb_rtable(skb);
 874         struct in_device *in_dev;
 875         struct inet_peer *peer;
 876         struct net *net;
 877         int log_martians;
 878         int vif;
 879
 880         rcu_read_lock();
 881         in_dev = __in_dev_get_rcu(rt->dst.dev);
 882         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 883                 rcu_read_unlock();
 884                 return;
 885         }
 886         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 887         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 888         rcu_read_unlock();
 889
 890         net = dev_net(rt->dst.dev);
 891         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 892         if (!peer) {
 893                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 894                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 895                 return;
 896         }
 897
 898         /* No redirected packets during ip_rt_redirect_silence;
 899          * reset the algorithm.
 900          */
 901         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 902                 peer->rate_tokens = 0;
 903                 peer->n_redirects = 0;
 904         }
 905
 906         /* Too many ignored redirects; do not send anything
 907          * set dst.rate_last to the last seen redirected packet.
 908          */
 909         if (peer->n_redirects >= ip_rt_redirect_number) {
 910                 peer->rate_last = jiffies;
 911                 goto out_put_peer;
 912         }
 913
 914         /* Check for load limit; set rate_last to the latest sent
 915          * redirect.
 916          */
 917         if (peer->rate_tokens == 0 ||
 918             time_after(jiffies,
 919                        (peer->rate_last +
 920                         (ip_rt_redirect_load << peer->n_redirects)))) {
 921                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 922
 923                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 924                 peer->rate_last = jiffies;
 925                 ++peer->n_redirects;
 926 #ifdef CONFIG_IP_ROUTE_VERBOSE
 927                 if (log_martians &&
 928                     peer->n_redirects == ip_rt_redirect_number)
 929                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 930                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 931                                              &ip_hdr(skb)->daddr, &gw);
 932 #endif
 933         }
 934 out_put_peer:
 935         inet_putpeer(peer);
 936 }
 937
 938 static int ip_error(struct sk_buff *skb)
 939 {
 940         struct rtable *rt = skb_rtable(skb);
 941         struct net_device *dev = skb->dev;
 942         struct in_device *in_dev;
 943         struct inet_peer *peer;
 944         unsigned long now;
 945         struct net *net;
 946         bool send;
 947         int code;
 948
 949         if (netif_is_l3_master(skb->dev)) {
 950                 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 951                 if (!dev)
 952                         goto out;
 953         }
 954
 955         in_dev = __in_dev_get_rcu(dev);
 956
 957         /* IP on this device is disabled. */
 958         if (!in_dev)
 959                 goto out;
 960
 961         net = dev_net(rt->dst.dev);
 962         if (!IN_DEV_FORWARD(in_dev)) {
 963                 switch (rt->dst.error) {
 964                 case EHOSTUNREACH:
 965                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 966                         break;
 967
 968                 case ENETUNREACH:
 969                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 970                         break;
 971                 }
 972                 goto out;
 973         }
 974
 975         switch (rt->dst.error) {
 976         case EINVAL:
 977         default:
 978                 goto out;
 979         case EHOSTUNREACH:
 980                 code = ICMP_HOST_UNREACH;
 981                 break;
 982         case ENETUNREACH:
 983                 code = ICMP_NET_UNREACH;
 984                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 985                 break;
 986         case EACCES:
 987                 code = ICMP_PKT_FILTERED;
 988                 break;
 989         }
 990
 991         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 992                                l3mdev_master_ifindex(skb->dev), 1);
 993
 994         send = true;
 995         if (peer) {
 996                 now = jiffies;
 997                 peer->rate_tokens += now - peer->rate_last;
 998                 if (peer->rate_tokens > ip_rt_error_burst)
 999                         peer->rate_tokens = ip_rt_error_burst;
1000                 peer->rate_last = now;
1001                 if (peer->rate_tokens >= ip_rt_error_cost)
1002                         peer->rate_tokens -= ip_rt_error_cost;
1003                 else
1004                         send = false;
1005                 inet_putpeer(peer);
1006         }
1007         if (send)
1008                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1009
1010 out:    kfree_skb(skb);
1011         return 0;
1012 }
1013
1014 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1015 {
1016         struct dst_entry *dst = &rt->dst;
1017         u32 old_mtu = ipv4_mtu(dst);
1018         struct fib_result res;
1019         bool lock = false;
1020
1021         if (ip_mtu_locked(dst))
1022                 return;
1023
1024         if (old_mtu < mtu)
1025                 return;
1026
1027         if (mtu < ip_rt_min_pmtu) {
1028                 lock = true;
1029                 mtu = min(old_mtu, ip_rt_min_pmtu);
1030         }
1031
1032         if (rt->rt_pmtu == mtu && !lock &&
1033             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1034                 return;
1035
1036         rcu_read_lock();
1037         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1038                 struct fib_nh_common *nhc = FIB_RES_NHC(res);
1039
1040                 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1041                                       jiffies + ip_rt_mtu_expires);
1042         }
1043         rcu_read_unlock();
1044 }
1045
1046 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1047                               struct sk_buff *skb, u32 mtu,
1048                               bool confirm_neigh)
1049 {
1050         struct rtable *rt = (struct rtable *) dst;
1051         struct flowi4 fl4;
1052
1053         ip_rt_build_flow_key(&fl4, sk, skb);
1054         __ip_rt_update_pmtu(rt, &fl4, mtu);
1055 }
1056
1057 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1058                       int oif, u8 protocol)
1059 {
1060         const struct iphdr *iph = (const struct iphdr *) skb->data;
1061         struct flowi4 fl4;
1062         struct rtable *rt;
1063         u32 mark = IP4_REPLY_MARK(net, skb->mark);
1064
1065         __build_flow_key(net, &fl4, NULL, iph, oif,
1066                          RT_TOS(iph->tos), protocol, mark, 0);
1067         rt = __ip_route_output_key(net, &fl4);
1068         if (!IS_ERR(rt)) {
1069                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1070                 ip_rt_put(rt);
1071         }
1072 }
1073 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1074
1075 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1076 {
1077         const struct iphdr *iph = (const struct iphdr *) skb->data;
1078         struct flowi4 fl4;
1079         struct rtable *rt;
1080
1081         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1082
1083         if (!fl4.flowi4_mark)
1084                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1085
1086         rt = __ip_route_output_key(sock_net(sk), &fl4);
1087         if (!IS_ERR(rt)) {
1088                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1089                 ip_rt_put(rt);
1090         }
1091 }
1092
1093 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1094 {
1095         const struct iphdr *iph = (const struct iphdr *) skb->data;
1096         struct flowi4 fl4;
1097         struct rtable *rt;
1098         struct dst_entry *odst = NULL;
1099         bool new = false;
1100         struct net *net = sock_net(sk);
1101
1102         bh_lock_sock(sk);
1103
1104         if (!ip_sk_accept_pmtu(sk))
1105                 goto out;
1106
1107         odst = sk_dst_get(sk);
1108
1109         if (sock_owned_by_user(sk) || !odst) {
1110                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1111                 goto out;
1112         }
1113
1114         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1115
1116         rt = (struct rtable *)odst;
1117         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1118                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1119                 if (IS_ERR(rt))
1120                         goto out;
1121
1122                 new = true;
1123         }
1124
1125         __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1126
1127         if (!dst_check(&rt->dst, 0)) {
1128                 if (new)
1129                         dst_release(&rt->dst);
1130
1131                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1132                 if (IS_ERR(rt))
1133                         goto out;
1134
1135                 new = true;
1136         }
1137
1138         if (new)
1139                 sk_dst_set(sk, &rt->dst);
1140
1141 out:
1142         bh_unlock_sock(sk);
1143         dst_release(odst);
1144 }
1145 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1146
1147 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1148                    int oif, u8 protocol)
1149 {
1150         const struct iphdr *iph = (const struct iphdr *) skb->data;
1151         struct flowi4 fl4;
1152         struct rtable *rt;
1153
1154         __build_flow_key(net, &fl4, NULL, iph, oif,
1155                          RT_TOS(iph->tos), protocol, 0, 0);
1156         rt = __ip_route_output_key(net, &fl4);
1157         if (!IS_ERR(rt)) {
1158                 __ip_do_redirect(rt, skb, &fl4, false);
1159                 ip_rt_put(rt);
1160         }
1161 }
1162 EXPORT_SYMBOL_GPL(ipv4_redirect);
1163
1164 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1165 {
1166         const struct iphdr *iph = (const struct iphdr *) skb->data;
1167         struct flowi4 fl4;
1168         struct rtable *rt;
1169         struct net *net = sock_net(sk);
1170
1171         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1172         rt = __ip_route_output_key(net, &fl4);
1173         if (!IS_ERR(rt)) {
1174                 __ip_do_redirect(rt, skb, &fl4, false);
1175                 ip_rt_put(rt);
1176         }
1177 }
1178 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1179
1180 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1181 {
1182         struct rtable *rt = (struct rtable *) dst;
1183
1184         /* All IPV4 dsts are created with ->obsolete set to the value
1185          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1186          * into this function always.
1187          *
1188          * When a PMTU/redirect information update invalidates a route,
1189          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1190          * DST_OBSOLETE_DEAD.
1191          */
1192         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1193                 return NULL;
1194         return dst;
1195 }
1196
1197 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1198 {
1199         struct ip_options opt;
1200         int res;
1201
1202         /* Recompile ip options since IPCB may not be valid anymore.
1203          * Also check we have a reasonable ipv4 header.
1204          */
1205         if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1206             ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1207                 return;
1208
1209         memset(&opt, 0, sizeof(opt));
1210         if (ip_hdr(skb)->ihl > 5) {
1211                 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1212                         return;
1213                 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1214
1215                 rcu_read_lock();
1216                 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1217                 rcu_read_unlock();
1218
1219                 if (res)
1220                         return;
1221         }
1222         __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1223 }
1224
1225 static void ipv4_link_failure(struct sk_buff *skb)
1226 {
1227         struct rtable *rt;
1228
1229         ipv4_send_dest_unreach(skb);
1230
1231         rt = skb_rtable(skb);
1232         if (rt)
1233                 dst_set_expires(&rt->dst, 0);
1234 }
1235
1236 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1237 {
1238         pr_debug("%s: %pI4 -> %pI4, %s\n",
1239                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1240                  skb->dev ? skb->dev->name : "?");
1241         kfree_skb(skb);
1242         WARN_ON(1);
1243         return 0;
1244 }
1245
/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it is off the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */
1254
1255 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1256 {
1257         __be32 src;
1258
1259         if (rt_is_output_route(rt))
1260                 src = ip_hdr(skb)->saddr;
1261         else {
1262                 struct fib_result res;
1263                 struct iphdr *iph = ip_hdr(skb);
1264                 struct flowi4 fl4 = {
1265                         .daddr = iph->daddr,
1266                         .saddr = iph->saddr,
1267                         .flowi4_tos = RT_TOS(iph->tos),
1268                         .flowi4_oif = rt->dst.dev->ifindex,
1269                         .flowi4_iif = skb->dev->ifindex,
1270                         .flowi4_mark = skb->mark,
1271                 };
1272
1273                 rcu_read_lock();
1274                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1275                         src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1276                 else
1277                         src = inet_select_addr(rt->dst.dev,
1278                                                rt_nexthop(rt, iph->daddr),
1279                                                RT_SCOPE_UNIVERSE);
1280                 rcu_read_unlock();
1281         }
1282         memcpy(addr, &src, 4);
1283 }
1284
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Fill in the halves of rt->dst.tclassid that are still zero from the
 * corresponding halves of @tag.  A half that is already set is left alone.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->dst.tclassid & low_half) == 0)
                rt->dst.tclassid |= tag & low_half;

        if ((rt->dst.tclassid & high_half) == 0)
                rt->dst.tclassid |= tag & high_half;
}
#endif
1294
1295 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1296 {
1297         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1298         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1299                                     ip_rt_min_advmss);
1300
1301         return min(advmss, IPV4_MAX_PMTU - header_size);
1302 }
1303
1304 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1305 {
1306         const struct rtable *rt = (const struct rtable *) dst;
1307         unsigned int mtu = rt->rt_pmtu;
1308
1309         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1310                 mtu = dst_metric_raw(dst, RTAX_MTU);
1311
1312         if (mtu)
1313                 return mtu;
1314
1315         mtu = READ_ONCE(dst->dev->mtu);
1316
1317         if (unlikely(ip_mtu_locked(dst))) {
1318                 if (rt->rt_uses_gateway && mtu > 576)
1319                         mtu = 576;
1320         }
1321
1322         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1323
1324         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1325 }
1326
1327 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1328 {
1329         struct fnhe_hash_bucket *hash;
1330         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1331         u32 hval = fnhe_hashfun(daddr);
1332
1333         spin_lock_bh(&fnhe_lock);
1334
1335         hash = rcu_dereference_protected(nhc->nhc_exceptions,
1336                                          lockdep_is_held(&fnhe_lock));
1337         hash += hval;
1338
1339         fnhe_p = &hash->chain;
1340         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1341         while (fnhe) {
1342                 if (fnhe->fnhe_daddr == daddr) {
1343                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1344                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1345                         /* set fnhe_daddr to 0 to ensure it won't bind with
1346                          * new dsts in rt_bind_exception().
1347                          */
1348                         fnhe->fnhe_daddr = 0;
1349                         fnhe_flush_routes(fnhe);
1350                         kfree_rcu(fnhe, rcu);
1351                         break;
1352                 }
1353                 fnhe_p = &fnhe->fnhe_next;
1354                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1355                                                  lockdep_is_held(&fnhe_lock));
1356         }
1357
1358         spin_unlock_bh(&fnhe_lock);
1359 }
1360
1361 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1362                                                __be32 daddr)
1363 {
1364         struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1365         struct fib_nh_exception *fnhe;
1366         u32 hval;
1367
1368         if (!hash)
1369                 return NULL;
1370
1371         hval = fnhe_hashfun(daddr);
1372
1373         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1374              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1375                 if (fnhe->fnhe_daddr == daddr) {
1376                         if (fnhe->fnhe_expires &&
1377                             time_after(jiffies, fnhe->fnhe_expires)) {
1378                                 ip_del_fnhe(nhc, daddr);
1379                                 break;
1380                         }
1381                         return fnhe;
1382                 }
1383         }
1384         return NULL;
1385 }
1386
1387 /* MTU selection:
1388  * 1. mtu on route is locked - use it
1389  * 2. mtu from nexthop exception
1390  * 3. mtu from egress device
1391  */
1392
1393 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1394 {
1395         struct fib_nh_common *nhc = res->nhc;
1396         struct net_device *dev = nhc->nhc_dev;
1397         struct fib_info *fi = res->fi;
1398         u32 mtu = 0;
1399
1400         if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1401             fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1402                 mtu = fi->fib_mtu;
1403
1404         if (likely(!mtu)) {
1405                 struct fib_nh_exception *fnhe;
1406
1407                 fnhe = find_exception(nhc, daddr);
1408                 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1409                         mtu = fnhe->fnhe_pmtu;
1410         }
1411
1412         if (likely(!mtu))
1413                 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1414
1415         return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1416 }
1417
1418 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1419                               __be32 daddr, const bool do_cache)
1420 {
1421         bool ret = false;
1422
1423         spin_lock_bh(&fnhe_lock);
1424
1425         if (daddr == fnhe->fnhe_daddr) {
1426                 struct rtable __rcu **porig;
1427                 struct rtable *orig;
1428                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1429
1430                 if (rt_is_input_route(rt))
1431                         porig = &fnhe->fnhe_rth_input;
1432                 else
1433                         porig = &fnhe->fnhe_rth_output;
1434                 orig = rcu_dereference(*porig);
1435
1436                 if (fnhe->fnhe_genid != genid) {
1437                         fnhe->fnhe_genid = genid;
1438                         fnhe->fnhe_gw = 0;
1439                         fnhe->fnhe_pmtu = 0;
1440                         fnhe->fnhe_expires = 0;
1441                         fnhe->fnhe_mtu_locked = false;
1442                         fnhe_flush_routes(fnhe);
1443                         orig = NULL;
1444                 }
1445                 fill_route_from_fnhe(rt, fnhe);
1446                 if (!rt->rt_gw4) {
1447                         rt->rt_gw4 = daddr;
1448                         rt->rt_gw_family = AF_INET;
1449                 }
1450
1451                 if (do_cache) {
1452                         dst_hold(&rt->dst);
1453                         rcu_assign_pointer(*porig, rt);
1454                         if (orig) {
1455                                 dst_dev_put(&orig->dst);
1456                                 dst_release(&orig->dst);
1457                         }
1458                         ret = true;
1459                 }
1460
1461                 fnhe->fnhe_stamp = jiffies;
1462         }
1463         spin_unlock_bh(&fnhe_lock);
1464
1465         return ret;
1466 }
1467
1468 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1469 {
1470         struct rtable *orig, *prev, **p;
1471         bool ret = true;
1472
1473         if (rt_is_input_route(rt)) {
1474                 p = (struct rtable **)&nhc->nhc_rth_input;
1475         } else {
1476                 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1477         }
1478         orig = *p;
1479
1480         /* hold dst before doing cmpxchg() to avoid race condition
1481          * on this dst
1482          */
1483         dst_hold(&rt->dst);
1484         prev = cmpxchg(p, orig, rt);
1485         if (prev == orig) {
1486                 if (orig) {
1487                         rt_add_uncached_list(orig);
1488                         dst_release(&orig->dst);
1489                 }
1490         } else {
1491                 dst_release(&rt->dst);
1492                 ret = false;
1493         }
1494
1495         return ret;
1496 }
1497
/* A list of routes tracked outside the nexthop/exception cache slots,
 * together with the lock guarding it.
 */
struct uncached_list {
        spinlock_t              lock;   /* taken with spin_lock_bh() around list changes and walks */
        struct list_head        head;   /* rtables, linked through rt->rt_uncached */
};

/* One list per CPU; rt_flush_dev() walks the lists of all possible CPUs. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1504
1505 void rt_add_uncached_list(struct rtable *rt)
1506 {
1507         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1508
1509         rt->rt_uncached_list = ul;
1510
1511         spin_lock_bh(&ul->lock);
1512         list_add_tail(&rt->rt_uncached, &ul->head);
1513         spin_unlock_bh(&ul->lock);
1514 }
1515
1516 void rt_del_uncached_list(struct rtable *rt)
1517 {
1518         if (!list_empty(&rt->rt_uncached)) {
1519                 struct uncached_list *ul = rt->rt_uncached_list;
1520
1521                 spin_lock_bh(&ul->lock);
1522                 list_del(&rt->rt_uncached);
1523                 spin_unlock_bh(&ul->lock);
1524         }
1525 }
1526
/* Destructor for IPv4 dsts: drop the metrics reference, then take the route
 * off its uncached list if it is on one.
 */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
        ip_dst_metrics_put(dst);
        rt_del_uncached_list((struct rtable *)dst);
}
1534
1535 void rt_flush_dev(struct net_device *dev)
1536 {
1537         struct rtable *rt;
1538         int cpu;
1539
1540         for_each_possible_cpu(cpu) {
1541                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1542
1543                 spin_lock_bh(&ul->lock);
1544                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1545                         if (rt->dst.dev != dev)
1546                                 continue;
1547                         rt->dst.dev = blackhole_netdev;
1548                         dev_hold(rt->dst.dev);
1549                         dev_put(dev);
1550                 }
1551                 spin_unlock_bh(&ul->lock);
1552         }
1553 }
1554
1555 static bool rt_cache_valid(const struct rtable *rt)
1556 {
1557         return  rt &&
1558                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1559                 !rt_is_expired(rt);
1560 }
1561
1562 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1563                            const struct fib_result *res,
1564                            struct fib_nh_exception *fnhe,
1565                            struct fib_info *fi, u16 type, u32 itag,
1566                            const bool do_cache)
1567 {
1568         bool cached = false;
1569
1570         if (fi) {
1571                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1572
1573                 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1574                         rt->rt_uses_gateway = 1;
1575                         rt->rt_gw_family = nhc->nhc_gw_family;
1576                         /* only INET and INET6 are supported */
1577                         if (likely(nhc->nhc_gw_family == AF_INET))
1578                                 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1579                         else
1580                                 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1581                 }
1582
1583                 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1584
1585 #ifdef CONFIG_IP_ROUTE_CLASSID
1586                 if (nhc->nhc_family == AF_INET) {
1587                         struct fib_nh *nh;
1588
1589                         nh = container_of(nhc, struct fib_nh, nh_common);
1590                         rt->dst.tclassid = nh->nh_tclassid;
1591                 }
1592 #endif
1593                 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1594                 if (unlikely(fnhe))
1595                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1596                 else if (do_cache)
1597                         cached = rt_cache_route(nhc, rt);
1598                 if (unlikely(!cached)) {
1599                         /* Routes we intend to cache in nexthop exception or
1600                          * FIB nexthop have the DST_NOCACHE bit clear.
1601                          * However, if we are unsuccessful at storing this
1602                          * route into the cache we really need to set it.
1603                          */
1604                         if (!rt->rt_gw4) {
1605                                 rt->rt_gw_family = AF_INET;
1606                                 rt->rt_gw4 = daddr;
1607                         }
1608                         rt_add_uncached_list(rt);
1609                 }
1610         } else
1611                 rt_add_uncached_list(rt);
1612
1613 #ifdef CONFIG_IP_ROUTE_CLASSID
1614 #ifdef CONFIG_IP_MULTIPLE_TABLES
1615         set_class_tag(rt, res->tclassid);
1616 #endif
1617         set_class_tag(rt, itag);
1618 #endif
1619 }
1620
1621 struct rtable *rt_dst_alloc(struct net_device *dev,
1622                             unsigned int flags, u16 type,
1623                             bool nopolicy, bool noxfrm, bool will_cache)
1624 {
1625         struct rtable *rt;
1626
1627         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1628                        (will_cache ? 0 : DST_HOST) |
1629                        (nopolicy ? DST_NOPOLICY : 0) |
1630                        (noxfrm ? DST_NOXFRM : 0));
1631
1632         if (rt) {
1633                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1634                 rt->rt_flags = flags;
1635                 rt->rt_type = type;
1636                 rt->rt_is_input = 0;
1637                 rt->rt_iif = 0;
1638                 rt->rt_pmtu = 0;
1639                 rt->rt_mtu_locked = 0;
1640                 rt->rt_uses_gateway = 0;
1641                 rt->rt_gw_family = 0;
1642                 rt->rt_gw4 = 0;
1643                 INIT_LIST_HEAD(&rt->rt_uncached);
1644
1645                 rt->dst.output = ip_output;
1646                 if (flags & RTCF_LOCAL)
1647                         rt->dst.input = ip_local_deliver;
1648         }
1649
1650         return rt;
1651 }
1652 EXPORT_SYMBOL(rt_dst_alloc);
1653
1654 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1655 {
1656         struct rtable *new_rt;
1657
1658         new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1659                            rt->dst.flags);
1660
1661         if (new_rt) {
1662                 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1663                 new_rt->rt_flags = rt->rt_flags;
1664                 new_rt->rt_type = rt->rt_type;
1665                 new_rt->rt_is_input = rt->rt_is_input;
1666                 new_rt->rt_iif = rt->rt_iif;
1667                 new_rt->rt_pmtu = rt->rt_pmtu;
1668                 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1669                 new_rt->rt_gw_family = rt->rt_gw_family;
1670                 if (rt->rt_gw_family == AF_INET)
1671                         new_rt->rt_gw4 = rt->rt_gw4;
1672                 else if (rt->rt_gw_family == AF_INET6)
1673                         new_rt->rt_gw6 = rt->rt_gw6;
1674                 INIT_LIST_HEAD(&new_rt->rt_uncached);
1675
1676                 new_rt->dst.flags |= DST_HOST;
1677                 new_rt->dst.input = rt->dst.input;
1678                 new_rt->dst.output = rt->dst.output;
1679                 new_rt->dst.error = rt->dst.error;
1680                 new_rt->dst.lastuse = jiffies;
1681                 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1682         }
1683         return new_rt;
1684 }
1685 EXPORT_SYMBOL(rt_dst_clone);
1686
1687 /* called in rcu_read_lock() section */
1688 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1689                           u8 tos, struct net_device *dev,
1690                           struct in_device *in_dev, u32 *itag)
1691 {
1692         int err;
1693
1694         /* Primary sanity checks. */
1695         if (!in_dev)
1696                 return -EINVAL;
1697
1698         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1699             skb->protocol != htons(ETH_P_IP))
1700                 return -EINVAL;
1701
1702         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1703                 return -EINVAL;
1704
1705         if (ipv4_is_zeronet(saddr)) {
1706                 if (!ipv4_is_local_multicast(daddr) &&
1707                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1708                         return -EINVAL;
1709         } else {
1710                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1711                                           in_dev, itag);
1712                 if (err < 0)
1713                         return err;
1714         }
1715         return 0;
1716 }
1717
1718 /* called in rcu_read_lock() section */
1719 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1720                              u8 tos, struct net_device *dev, int our)
1721 {
1722         struct in_device *in_dev = __in_dev_get_rcu(dev);
1723         unsigned int flags = RTCF_MULTICAST;
1724         struct rtable *rth;
1725         u32 itag = 0;
1726         int err;
1727
1728         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1729         if (err)
1730                 return err;
1731
1732         if (our)
1733                 flags |= RTCF_LOCAL;
1734
1735         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1736                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1737         if (!rth)
1738                 return -ENOBUFS;
1739
1740 #ifdef CONFIG_IP_ROUTE_CLASSID
1741         rth->dst.tclassid = itag;
1742 #endif
1743         rth->dst.output = ip_rt_bug;
1744         rth->rt_is_input= 1;
1745
1746 #ifdef CONFIG_IP_MROUTE
1747         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1748                 rth->dst.input = ip_mr_input;
1749 #endif
1750         RT_CACHE_STAT_INC(in_slow_mc);
1751
1752         skb_dst_set(skb, &rth->dst);
1753         return 0;
1754 }
1755
1756
1757 static void ip_handle_martian_source(struct net_device *dev,
1758                                      struct in_device *in_dev,
1759                                      struct sk_buff *skb,
1760                                      __be32 daddr,
1761                                      __be32 saddr)
1762 {
1763         RT_CACHE_STAT_INC(in_martian_src);
1764 #ifdef CONFIG_IP_ROUTE_VERBOSE
1765         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1766                 /*
1767                  *      RFC1812 recommendation, if source is martian,
1768                  *      the only hint is MAC header.
1769                  */
1770                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1771                         &daddr, &saddr, dev->name);
1772                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1773                         print_hex_dump(KERN_WARNING, "ll header: ",
1774                                        DUMP_PREFIX_OFFSET, 16, 1,
1775                                        skb_mac_header(skb),
1776                                        dev->hard_header_len, false);
1777                 }
1778         }
1779 #endif
1780 }
1781
1782 /* called in rcu_read_lock() section */
1783 static int __mkroute_input(struct sk_buff *skb,
1784                            const struct fib_result *res,
1785                            struct in_device *in_dev,
1786                            __be32 daddr, __be32 saddr, u32 tos)
1787 {
1788         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1789         struct net_device *dev = nhc->nhc_dev;
1790         struct fib_nh_exception *fnhe;
1791         struct rtable *rth;
1792         int err;
1793         struct in_device *out_dev;
1794         bool do_cache;
1795         u32 itag = 0;
1796
1797         /* get a working reference to the output device */
1798         out_dev = __in_dev_get_rcu(dev);
1799         if (!out_dev) {
1800                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1801                 return -EINVAL;
1802         }
1803
1804         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1805                                   in_dev->dev, in_dev, &itag);
1806         if (err < 0) {
1807                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1808                                          saddr);
1809
1810                 goto cleanup;
1811         }
1812
1813         do_cache = res->fi && !itag;
1814         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1815             skb->protocol == htons(ETH_P_IP)) {
1816                 __be32 gw;
1817
1818                 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1819                 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1820                     inet_addr_onlink(out_dev, saddr, gw))
1821                         IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1822         }
1823
1824         if (skb->protocol != htons(ETH_P_IP)) {
1825                 /* Not IP (i.e. ARP). Do not create route, if it is
1826                  * invalid for proxy arp. DNAT routes are always valid.
1827                  *
1828                  * Proxy arp feature have been extended to allow, ARP
1829                  * replies back to the same interface, to support
1830                  * Private VLAN switch technologies. See arp.c.
1831                  */
1832                 if (out_dev == in_dev &&
1833                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1834                         err = -EINVAL;
1835                         goto cleanup;
1836                 }
1837         }
1838
1839         fnhe = find_exception(nhc, daddr);
1840         if (do_cache) {
1841                 if (fnhe)
1842                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1843                 else
1844                         rth = rcu_dereference(nhc->nhc_rth_input);
1845                 if (rt_cache_valid(rth)) {
1846                         skb_dst_set_noref(skb, &rth->dst);
1847                         goto out;
1848                 }
1849         }
1850
1851         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1852                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1853                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1854         if (!rth) {
1855                 err = -ENOBUFS;
1856                 goto cleanup;
1857         }
1858
1859         rth->rt_is_input = 1;
1860         RT_CACHE_STAT_INC(in_slow_tot);
1861
1862         rth->dst.input = ip_forward;
1863
1864         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1865                        do_cache);
1866         lwtunnel_set_redirect(&rth->dst);
1867         skb_dst_set(skb, &rth->dst);
1868 out:
1869         err = 0;
1870  cleanup:
1871         return err;
1872 }
1873
1874 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1875 /* To make ICMP packets follow the right flow, the multipath hash is
1876  * calculated from the inner IP addresses.
1877  */
1878 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1879                                  struct flow_keys *hash_keys)
1880 {
1881         const struct iphdr *outer_iph = ip_hdr(skb);
1882         const struct iphdr *key_iph = outer_iph;
1883         const struct iphdr *inner_iph;
1884         const struct icmphdr *icmph;
1885         struct iphdr _inner_iph;
1886         struct icmphdr _icmph;
1887
1888         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1889                 goto out;
1890
1891         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1892                 goto out;
1893
1894         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1895                                    &_icmph);
1896         if (!icmph)
1897                 goto out;
1898
1899         if (!icmp_is_err(icmph->type))
1900                 goto out;
1901
1902         inner_iph = skb_header_pointer(skb,
1903                                        outer_iph->ihl * 4 + sizeof(_icmph),
1904                                        sizeof(_inner_iph), &_inner_iph);
1905         if (!inner_iph)
1906                 goto out;
1907
1908         key_iph = inner_iph;
1909 out:
1910         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1911         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1912 }
1913
1914 /* if skb is set it will be used and fl4 can be NULL */
1915 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1916                        const struct sk_buff *skb, struct flow_keys *flkeys)
1917 {
1918         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1919         struct flow_keys hash_keys;
1920         u32 mhash;
1921
1922         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1923         case 0:
1924                 memset(&hash_keys, 0, sizeof(hash_keys));
1925                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1926                 if (skb) {
1927                         ip_multipath_l3_keys(skb, &hash_keys);
1928                 } else {
1929                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1930                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1931                 }
1932                 break;
1933         case 1:
1934                 /* skb is currently provided only when forwarding */
1935                 if (skb) {
1936                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1937                         struct flow_keys keys;
1938
1939                         /* short-circuit if we already have L4 hash present */
1940                         if (skb->l4_hash)
1941                                 return skb_get_hash_raw(skb) >> 1;
1942
1943                         memset(&hash_keys, 0, sizeof(hash_keys));
1944
1945                         if (!flkeys) {
1946                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1947                                 flkeys = &keys;
1948                         }
1949
1950                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1951                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1952                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1953                         hash_keys.ports.src = flkeys->ports.src;
1954                         hash_keys.ports.dst = flkeys->ports.dst;
1955                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1956                 } else {
1957                         memset(&hash_keys, 0, sizeof(hash_keys));
1958                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1959                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1960                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1961                         hash_keys.ports.src = fl4->fl4_sport;
1962                         hash_keys.ports.dst = fl4->fl4_dport;
1963                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1964                 }
1965                 break;
1966         case 2:
1967                 memset(&hash_keys, 0, sizeof(hash_keys));
1968                 /* skb is currently provided only when forwarding */
1969                 if (skb) {
1970                         struct flow_keys keys;
1971
1972                         skb_flow_dissect_flow_keys(skb, &keys, 0);
1973                         /* Inner can be v4 or v6 */
1974                         if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1975                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1976                                 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1977                                 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1978                         } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1979                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1980                                 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1981                                 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1982                                 hash_keys.tags.flow_label = keys.tags.flow_label;
1983                                 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1984                         } else {
1985                                 /* Same as case 0 */
1986                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1987                                 ip_multipath_l3_keys(skb, &hash_keys);
1988                         }
1989                 } else {
1990                         /* Same as case 0 */
1991                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1992                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1993                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1994                 }
1995                 break;
1996         }
1997         mhash = flow_hash_from_keys(&hash_keys);
1998
1999         if (multipath_hash)
2000                 mhash = jhash_2words(mhash, multipath_hash, 0);
2001
2002         return mhash >> 1;
2003 }
2004 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2005
2006 static int ip_mkroute_input(struct sk_buff *skb,
2007                             struct fib_result *res,
2008                             struct in_device *in_dev,
2009                             __be32 daddr, __be32 saddr, u32 tos,
2010                             struct flow_keys *hkeys)
2011 {
2012 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2013         if (res->fi && fib_info_num_path(res->fi) > 1) {
2014                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2015
2016                 fib_select_multipath(res, h);
2017         }
2018 #endif
2019
2020         /* create a routing cache entry */
2021         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2022 }
2023
2024 /* Implements all the saddr-related checks as ip_route_input_slow(),
2025  * assuming daddr is valid and the destination is not a local broadcast one.
2026  * Uses the provided hint instead of performing a route lookup.
2027  */
2028 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2029                       u8 tos, struct net_device *dev,
2030                       const struct sk_buff *hint)
2031 {
2032         struct in_device *in_dev = __in_dev_get_rcu(dev);
2033         struct rtable *rt = (struct rtable *)hint;
2034         struct net *net = dev_net(dev);
2035         int err = -EINVAL;
2036         u32 tag = 0;
2037
2038         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2039                 goto martian_source;
2040
2041         if (ipv4_is_zeronet(saddr))
2042                 goto martian_source;
2043
2044         if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2045                 goto martian_source;
2046
2047         if (rt->rt_type != RTN_LOCAL)
2048                 goto skip_validate_source;
2049
2050         tos &= IPTOS_RT_MASK;
2051         err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2052         if (err < 0)
2053                 goto martian_source;
2054
2055 skip_validate_source:
2056         skb_dst_copy(skb, hint);
2057         return 0;
2058
2059 martian_source:
2060         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2061         return err;
2062 }
2063
2064 /*
2065  *      NOTE. We drop all the packets that has local source
2066  *      addresses, because every properly looped back packet
2067  *      must have correct destination already attached by output routine.
2068  *      Changes in the enforced policies must be applied also to
2069  *      ip_route_use_hint().
2070  *
2071  *      Such approach solves two big problems:
2072  *      1. Not simplex devices are handled properly.
2073  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2074  *      called with rcu_read_lock()
2075  */
2076
2077 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2078                                u8 tos, struct net_device *dev,
2079                                struct fib_result *res)
2080 {
2081         struct in_device *in_dev = __in_dev_get_rcu(dev);
2082         struct flow_keys *flkeys = NULL, _flkeys;
2083         struct net    *net = dev_net(dev);
2084         struct ip_tunnel_info *tun_info;
2085         int             err = -EINVAL;
2086         unsigned int    flags = 0;
2087         u32             itag = 0;
2088         struct rtable   *rth;
2089         struct flowi4   fl4;
2090         bool do_cache = true;
2091
2092         /* IP on this device is disabled. */
2093
2094         if (!in_dev)
2095                 goto out;
2096
2097         /* Check for the most weird martians, which can be not detected
2098            by fib_lookup.
2099          */
2100
2101         tun_info = skb_tunnel_info(skb);
2102         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2103                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2104         else
2105                 fl4.flowi4_tun_key.tun_id = 0;
2106         skb_dst_drop(skb);
2107
2108         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2109                 goto martian_source;
2110
2111         res->fi = NULL;
2112         res->table = NULL;
2113         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2114                 goto brd_input;
2115
2116         /* Accept zero addresses only to limited broadcast;
2117          * I even do not know to fix it or not. Waiting for complains :-)
2118          */
2119         if (ipv4_is_zeronet(saddr))
2120                 goto martian_source;
2121
2122         if (ipv4_is_zeronet(daddr))
2123                 goto martian_destination;
2124
2125         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2126          * and call it once if daddr or/and saddr are loopback addresses
2127          */
2128         if (ipv4_is_loopback(daddr)) {
2129                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2130                         goto martian_destination;
2131         } else if (ipv4_is_loopback(saddr)) {
2132                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2133                         goto martian_source;
2134         }
2135
2136         /*
2137          *      Now we are ready to route packet.
2138          */
2139         fl4.flowi4_oif = 0;
2140         fl4.flowi4_iif = dev->ifindex;
2141         fl4.flowi4_mark = skb->mark;
2142         fl4.flowi4_tos = tos;
2143         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2144         fl4.flowi4_flags = 0;
2145         fl4.daddr = daddr;
2146         fl4.saddr = saddr;
2147         fl4.flowi4_uid = sock_net_uid(net, NULL);
2148
2149         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2150                 flkeys = &_flkeys;
2151         } else {
2152                 fl4.flowi4_proto = 0;
2153                 fl4.fl4_sport = 0;
2154                 fl4.fl4_dport = 0;
2155         }
2156
2157         err = fib_lookup(net, &fl4, res, 0);
2158         if (err != 0) {
2159                 if (!IN_DEV_FORWARD(in_dev))
2160                         err = -EHOSTUNREACH;
2161                 goto no_route;
2162         }
2163
2164         if (res->type == RTN_BROADCAST) {
2165                 if (IN_DEV_BFORWARD(in_dev))
2166                         goto make_route;
2167                 /* not do cache if bc_forwarding is enabled */
2168                 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2169                         do_cache = false;
2170                 goto brd_input;
2171         }
2172
2173         if (res->type == RTN_LOCAL) {
2174                 err = fib_validate_source(skb, saddr, daddr, tos,
2175                                           0, dev, in_dev, &itag);
2176                 if (err < 0)
2177                         goto martian_source;
2178                 goto local_input;
2179         }
2180
2181         if (!IN_DEV_FORWARD(in_dev)) {
2182                 err = -EHOSTUNREACH;
2183                 goto no_route;
2184         }
2185         if (res->type != RTN_UNICAST)
2186                 goto martian_destination;
2187
2188 make_route:
2189         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2190 out:    return err;
2191
2192 brd_input:
2193         if (skb->protocol != htons(ETH_P_IP))
2194                 goto e_inval;
2195
2196         if (!ipv4_is_zeronet(saddr)) {
2197                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2198                                           in_dev, &itag);
2199                 if (err < 0)
2200                         goto martian_source;
2201         }
2202         flags |= RTCF_BROADCAST;
2203         res->type = RTN_BROADCAST;
2204         RT_CACHE_STAT_INC(in_brd);
2205
2206 local_input:
2207         do_cache &= res->fi && !itag;
2208         if (do_cache) {
2209                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2210
2211                 rth = rcu_dereference(nhc->nhc_rth_input);
2212                 if (rt_cache_valid(rth)) {
2213                         skb_dst_set_noref(skb, &rth->dst);
2214                         err = 0;
2215                         goto out;
2216                 }
2217         }
2218
2219         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2220                            flags | RTCF_LOCAL, res->type,
2221                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2222         if (!rth)
2223                 goto e_nobufs;
2224
2225         rth->dst.output= ip_rt_bug;
2226 #ifdef CONFIG_IP_ROUTE_CLASSID
2227         rth->dst.tclassid = itag;
2228 #endif
2229         rth->rt_is_input = 1;
2230
2231         RT_CACHE_STAT_INC(in_slow_tot);
2232         if (res->type == RTN_UNREACHABLE) {
2233                 rth->dst.input= ip_error;
2234                 rth->dst.error= -err;
2235                 rth->rt_flags   &= ~RTCF_LOCAL;
2236         }
2237
2238         if (do_cache) {
2239                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2240
2241                 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2242                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2243                         WARN_ON(rth->dst.input == lwtunnel_input);
2244                         rth->dst.lwtstate->orig_input = rth->dst.input;
2245                         rth->dst.input = lwtunnel_input;
2246                 }
2247
2248                 if (unlikely(!rt_cache_route(nhc, rth)))
2249                         rt_add_uncached_list(rth);
2250         }
2251         skb_dst_set(skb, &rth->dst);
2252         err = 0;
2253         goto out;
2254
2255 no_route:
2256         RT_CACHE_STAT_INC(in_no_route);
2257         res->type = RTN_UNREACHABLE;
2258         res->fi = NULL;
2259         res->table = NULL;
2260         goto local_input;
2261
2262         /*
2263          *      Do not cache martian addresses: they should be logged (RFC1812)
2264          */
2265 martian_destination:
2266         RT_CACHE_STAT_INC(in_martian_dst);
2267 #ifdef CONFIG_IP_ROUTE_VERBOSE
2268         if (IN_DEV_LOG_MARTIANS(in_dev))
2269                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2270                                      &daddr, &saddr, dev->name);
2271 #endif
2272
2273 e_inval:
2274         err = -EINVAL;
2275         goto out;
2276
2277 e_nobufs:
2278         err = -ENOBUFS;
2279         goto out;
2280
2281 martian_source:
2282         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2283         goto out;
2284 }
2285
2286 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2287                          u8 tos, struct net_device *dev)
2288 {
2289         struct fib_result res;
2290         int err;
2291
2292         tos &= IPTOS_RT_MASK;
2293         rcu_read_lock();
2294         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2295         rcu_read_unlock();
2296
2297         return err;
2298 }
2299 EXPORT_SYMBOL(ip_route_input_noref);
2300
2301 /* called with rcu_read_lock held */
2302 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2303                        u8 tos, struct net_device *dev, struct fib_result *res)
2304 {
2305         /* Multicast recognition logic is moved from route cache to here.
2306            The problem was that too many Ethernet cards have broken/missing
2307            hardware multicast filters :-( As result the host on multicasting
2308            network acquires a lot of useless route cache entries, sort of
2309            SDR messages from all the world. Now we try to get rid of them.
2310            Really, provided software IP multicast filter is organized
2311            reasonably (at least, hashed), it does not result in a slowdown
2312            comparing with route cache reject entries.
2313            Note, that multicast routers are not affected, because
2314            route cache entry is created eventually.
2315          */
2316         if (ipv4_is_multicast(daddr)) {
2317                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2318                 int our = 0;
2319                 int err = -EINVAL;
2320
2321                 if (!in_dev)
2322                         return err;
2323                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2324                                       ip_hdr(skb)->protocol);
2325
2326                 /* check l3 master if no match yet */
2327                 if (!our && netif_is_l3_slave(dev)) {
2328                         struct in_device *l3_in_dev;
2329
2330                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2331                         if (l3_in_dev)
2332                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2333                                                       ip_hdr(skb)->protocol);
2334                 }
2335
2336                 if (our
2337 #ifdef CONFIG_IP_MROUTE
2338                         ||
2339                     (!ipv4_is_local_multicast(daddr) &&
2340                      IN_DEV_MFORWARD(in_dev))
2341 #endif
2342                    ) {
2343                         err = ip_route_input_mc(skb, daddr, saddr,
2344                                                 tos, dev, our);
2345                 }
2346                 return err;
2347         }
2348
2349         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2350 }
2351
2352 /* called with rcu_read_lock() */
2353 static struct rtable *__mkroute_output(const struct fib_result *res,
2354                                        const struct flowi4 *fl4, int orig_oif,
2355                                        struct net_device *dev_out,
2356                                        unsigned int flags)
2357 {
2358         struct fib_info *fi = res->fi;
2359         struct fib_nh_exception *fnhe;
2360         struct in_device *in_dev;
2361         u16 type = res->type;
2362         struct rtable *rth;
2363         bool do_cache;
2364
2365         in_dev = __in_dev_get_rcu(dev_out);
2366         if (!in_dev)
2367                 return ERR_PTR(-EINVAL);
2368
2369         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2370                 if (ipv4_is_loopback(fl4->saddr) &&
2371                     !(dev_out->flags & IFF_LOOPBACK) &&
2372                     !netif_is_l3_master(dev_out))
2373                         return ERR_PTR(-EINVAL);
2374
2375         if (ipv4_is_lbcast(fl4->daddr))
2376                 type = RTN_BROADCAST;
2377         else if (ipv4_is_multicast(fl4->daddr))
2378                 type = RTN_MULTICAST;
2379         else if (ipv4_is_zeronet(fl4->daddr))
2380                 return ERR_PTR(-EINVAL);
2381
2382         if (dev_out->flags & IFF_LOOPBACK)
2383                 flags |= RTCF_LOCAL;
2384
2385         do_cache = true;
2386         if (type == RTN_BROADCAST) {
2387                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2388                 fi = NULL;
2389         } else if (type == RTN_MULTICAST) {
2390                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2391                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2392                                      fl4->flowi4_proto))
2393                         flags &= ~RTCF_LOCAL;
2394                 else
2395                         do_cache = false;
2396                 /* If multicast route do not exist use
2397                  * default one, but do not gateway in this case.
2398                  * Yes, it is hack.
2399                  */
2400                 if (fi && res->prefixlen < 4)
2401                         fi = NULL;
2402         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2403                    (orig_oif != dev_out->ifindex)) {
2404                 /* For local routes that require a particular output interface
2405                  * we do not want to cache the result.  Caching the result
2406                  * causes incorrect behaviour when there are multiple source
2407                  * addresses on the interface, the end result being that if the
2408                  * intended recipient is waiting on that interface for the
2409                  * packet he won't receive it because it will be delivered on
2410                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2411                  * be set to the loopback interface as well.
2412                  */
2413                 do_cache = false;
2414         }
2415
2416         fnhe = NULL;
2417         do_cache &= fi != NULL;
2418         if (fi) {
2419                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2420                 struct rtable __rcu **prth;
2421
2422                 fnhe = find_exception(nhc, fl4->daddr);
2423                 if (!do_cache)
2424                         goto add;
2425                 if (fnhe) {
2426                         prth = &fnhe->fnhe_rth_output;
2427                 } else {
2428                         if (unlikely(fl4->flowi4_flags &
2429                                      FLOWI_FLAG_KNOWN_NH &&
2430                                      !(nhc->nhc_gw_family &&
2431                                        nhc->nhc_scope == RT_SCOPE_LINK))) {
2432                                 do_cache = false;
2433                                 goto add;
2434                         }
2435                         prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2436                 }
2437                 rth = rcu_dereference(*prth);
2438                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2439                         return rth;
2440         }
2441
2442 add:
2443         rth = rt_dst_alloc(dev_out, flags, type,
2444                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2445                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2446                            do_cache);
2447         if (!rth)
2448                 return ERR_PTR(-ENOBUFS);
2449
2450         rth->rt_iif = orig_oif;
2451
2452         RT_CACHE_STAT_INC(out_slow_tot);
2453
2454         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2455                 if (flags & RTCF_LOCAL &&
2456                     !(dev_out->flags & IFF_LOOPBACK)) {
2457                         rth->dst.output = ip_mc_output;
2458                         RT_CACHE_STAT_INC(out_slow_mc);
2459                 }
2460 #ifdef CONFIG_IP_MROUTE
2461                 if (type == RTN_MULTICAST) {
2462                         if (IN_DEV_MFORWARD(in_dev) &&
2463                             !ipv4_is_local_multicast(fl4->daddr)) {
2464                                 rth->dst.input = ip_mr_input;
2465                                 rth->dst.output = ip_mc_output;
2466                         }
2467                 }
2468 #endif
2469         }
2470
2471         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2472         lwtunnel_set_redirect(&rth->dst);
2473
2474         return rth;
2475 }
2476
2477 /*
2478  * Major route resolver routine.
2479  *
      * Rewrites the lookup key in @fl4 (input interface forced to loopback,
      * TOS masked with IPTOS_RT_MASK, scope chosen from the RTO_ONLINK bit)
      * and then runs ip_route_output_key_hash_rcu() inside its own RCU
      * read-side critical section.
      */
2480
2481 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2482                                         const struct sk_buff *skb)
2483 {
2484         struct fib_result res = {
2485                 .type           = RTN_UNSPEC,
2486                 .fi             = NULL,
2487                 .table          = NULL,
2488                 .tclassid       = 0,
2489         };
2490         __u8 tos = RT_FL_TOS(fl4);
2491         struct rtable *route;
2492
2493         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2494         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2495         if (tos & RTO_ONLINK)
2496                 fl4->flowi4_scope = RT_SCOPE_LINK;
             else
                     fl4->flowi4_scope = RT_SCOPE_UNIVERSE;
2497
2498         rcu_read_lock();
2499         route = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2500         rcu_read_unlock();
2501
2502         return route;
2503 }
2504 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2505
2506 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2507                                             struct fib_result *res,
2508                                             const struct sk_buff *skb)
2509 {
             /*
              * Resolve an output route for @fl4.  Returns whatever
              * __mkroute_output() yields, or an ERR_PTR() for failures detected
              * here.  @fl4 is updated in place (saddr, daddr and flowi4_oif may
              * be filled in) and @res is filled by fib_lookup() or by the
              * shortcuts below.
              *
              * The callers visible in this file invoke it between
              * rcu_read_lock() and rcu_read_unlock().
              */
2510         struct net_device *dev_out = NULL;
2511         int orig_oif = fl4->flowi4_oif;
2512         unsigned int flags = 0;
2513         struct rtable *rth;
2514         int err;
2515
             /* A source address was supplied: validate it first. */
2516         if (fl4->saddr) {
                     /* Multicast, limited-broadcast and zeronet sources are invalid. */
2517                 if (ipv4_is_multicast(fl4->saddr) ||
2518                     ipv4_is_lbcast(fl4->saddr) ||
2519                     ipv4_is_zeronet(fl4->saddr)) {
2520                         rth = ERR_PTR(-EINVAL);
2521                         goto out;
2522                 }
2523
                     /* Default error for the "goto out" exits in this branch. */
2524                 rth = ERR_PTR(-ENETUNREACH);
2525
2526                 /* I removed check for oif == dev_out->oif here.
2527                    It was wrong for two reasons:
2528                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2529                       is assigned to multiple interfaces.
2530                    2. Moreover, we are allowed to send packets with saddr
2531                       of another iface. --ANK
2532                  */
2533
2534                 if (fl4->flowi4_oif == 0 &&
2535                     (ipv4_is_multicast(fl4->daddr) ||
2536                      ipv4_is_lbcast(fl4->daddr))) {
2537                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2538                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2539                         if (!dev_out)
2540                                 goto out;
2541
2542                         /* Special hack: user can direct multicasts
2543                            and limited broadcast via necessary interface
2544                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2545                            This hack is not just for fun, it allows
2546                            vic,vat and friends to work.
2547                            They bind socket to loopback, set ttl to zero
2548                            and expect that it will work.
2549                            From the viewpoint of routing cache they are broken,
2550                            because we are not allowed to build multicast path
2551                            with loopback source addr (look, routing cache
2552                            cannot know, that ttl is zero, so that packet
2553                            will not leave this host and route is valid).
2554                            Luckily, this hack is good workaround.
2555                          */
2556
2557                         fl4->flowi4_oif = dev_out->ifindex;
2558                         goto make_route;
2559                 }
2560
                     /* Unless FLOWI_FLAG_ANYSRC is set, the source must be found by __ip_dev_find(). */
2561                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2562                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2563                         if (!__ip_dev_find(net, fl4->saddr, false))
2564                                 goto out;
2565                 }
2566         }
2567
2568
             /*
              * An output interface was requested: it must exist (-ENODEV
              * otherwise), be IFF_UP and have an in_device (-ENETUNREACH
              * otherwise).
              */
2569         if (fl4->flowi4_oif) {
2570                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2571                 rth = ERR_PTR(-ENODEV);
2572                 if (!dev_out)
2573                         goto out;
2574
2575                 /* RACE: Check return value of inet_select_addr instead. */
2576                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2577                         rth = ERR_PTR(-ENETUNREACH);
2578                         goto out;
2579                 }
                     /*
                      * Local multicast, limited broadcast and IGMP skip the FIB
                      * lookup entirely and go out on the requested device.
                      */
2580                 if (ipv4_is_local_multicast(fl4->daddr) ||
2581                     ipv4_is_lbcast(fl4->daddr) ||
2582                     fl4->flowi4_proto == IPPROTO_IGMP) {
2583                         if (!fl4->saddr)
2584                                 fl4->saddr = inet_select_addr(dev_out, 0,
2585                                                               RT_SCOPE_LINK);
2586                         goto make_route;
2587                 }
                     /* Pick a source from the device before the FIB lookup. */
2588                 if (!fl4->saddr) {
2589                         if (ipv4_is_multicast(fl4->daddr))
2590                                 fl4->saddr = inet_select_addr(dev_out, 0,
2591                                                               fl4->flowi4_scope);
2592                         else if (!fl4->daddr)
2593                                 fl4->saddr = inet_select_addr(dev_out, 0,
2594                                                               RT_SCOPE_HOST);
2595                 }
2596         }
2597
             /*
              * No destination: use the source address as destination (or
              * INADDR_LOOPBACK for both when neither is set) and build an
              * RTN_LOCAL route on the loopback device.
              */
2598         if (!fl4->daddr) {
2599                 fl4->daddr = fl4->saddr;
2600                 if (!fl4->daddr)
2601                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2602                 dev_out = net->loopback_dev;
2603                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2604                 res->type = RTN_LOCAL;
2605                 flags |= RTCF_LOCAL;
2606                 goto make_route;
2607         }
2608
2609         err = fib_lookup(net, fl4, res, 0);
2610         if (err) {
                     /* Lookup failed: make sure @res carries no stale pointers. */
2611                 res->fi = NULL;
2612                 res->table = NULL;
2613                 if (fl4->flowi4_oif &&
2614                     (ipv4_is_multicast(fl4->daddr) ||
2615                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2616                         /* Apparently, routing tables are wrong. Assume,
2617                            that the destination is on link.
2618
2619                            WHY? DW.
2620                            Because we are allowed to send to iface
2621                            even if it has NO routes and NO assigned
2622                            addresses. When oif is specified, routing
2623                            tables are looked up with only one purpose:
2624                            to catch if destination is gatewayed, rather than
2625                            direct. Moreover, if MSG_DONTROUTE is set,
2626                            we send packet, ignoring both routing tables
2627                            and ifaddr state. --ANK
2628
2629
2630                            We could make it even if oif is unknown,
2631                            likely IPv6, but we do not.
2632                          */
2633
2634                         if (fl4->saddr == 0)
2635                                 fl4->saddr = inet_select_addr(dev_out, 0,
2636                                                               RT_SCOPE_LINK);
2637                         res->type = RTN_UNICAST;
2638                         goto make_route;
2639                 }
2640                 rth = ERR_PTR(err);
2641                 goto out;
2642         }
2643
             /* Destination is a local address. */
2644         if (res->type == RTN_LOCAL) {
2645                 if (!fl4->saddr) {
2646                         if (res->fi->fib_prefsrc)
2647                                 fl4->saddr = res->fi->fib_prefsrc;
2648                         else
2649                                 fl4->saddr = fl4->daddr;
2650                 }
2651
2652                 /* L3 master device is the loopback for that domain */
2653                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2654                         net->loopback_dev;
2655
2656                 /* make sure orig_oif points to fib result device even
2657                  * though packet rx/tx happens over loopback or l3mdev
2658                  */
2659                 orig_oif = FIB_RES_OIF(*res);
2660
2661                 fl4->flowi4_oif = dev_out->ifindex;
2662                 flags |= RTCF_LOCAL;
2663                 goto make_route;
2664         }
2665
             /* Regular case: select a path and send via its device. */
2666         fib_select_path(net, res, fl4, skb);
2667
2668         dev_out = FIB_RES_DEV(*res);
2669         fl4->flowi4_oif = dev_out->ifindex;
2670
2671
     /* Every successful path converges here with dev_out, flags and @res set. */
2672 make_route:
2673         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2674
2675 out:
2676         return rth;
2677 }
2678
2679 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2680 {
             /*
              * .check hook of ipv4_dst_blackhole_ops: ignores both arguments and
              * always returns NULL.
              * NOTE(review): presumably this makes callers treat a blackhole dst
              * as never valid - confirm against the dst_ops.check contract.
              */
2681         return NULL;
2682 }
2683
2684 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2685 {
2686         unsigned int metric_mtu = dst_metric_raw(dst, RTAX_MTU);
2687
             /* An explicit RTAX_MTU metric wins; otherwise use the device MTU. */
             if (metric_mtu)
                     return metric_mtu;
2688         return dst->dev->mtu;
2689 }
2690
2691 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2692                                           struct sk_buff *skb, u32 mtu,
2693                                           bool confirm_neigh)
2694 {
             /* .update_pmtu hook of ipv4_dst_blackhole_ops: deliberately a no-op. */
2695 }
2696
2697 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2698                                        struct sk_buff *skb)
2699 {
             /* .redirect hook of ipv4_dst_blackhole_ops: deliberately a no-op. */
2700 }
2701
2702 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2703                                           unsigned long old)
2704 {
             /*
              * .cow_metrics hook of ipv4_dst_blackhole_ops: ignores its arguments
              * and always returns NULL.
              * NOTE(review): presumably NULL means "no writable metrics copy" -
              * confirm against the dst_ops.cow_metrics contract.
              */
2705         return NULL;
2706 }
2707
2708 static struct dst_ops ipv4_dst_blackhole_ops = {
             /*
              * Operations table passed to dst_alloc() by ipv4_blackhole_route().
              * The check, mtu, update_pmtu, redirect and cow_metrics hooks are
              * the blackhole stubs defined just above; default_advmss and
              * neigh_lookup use the ipv4_* helpers named below.
              */
2709         .family                 =       AF_INET,
2710         .check                  =       ipv4_blackhole_dst_check,
2711         .mtu                    =       ipv4_blackhole_mtu,
2712         .default_advmss         =       ipv4_default_advmss,
2713         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2714         .redirect               =       ipv4_rt_blackhole_redirect,
2715         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2716         .neigh_lookup           =       ipv4_neigh_lookup,
2717 };
2718
2719 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2720 {
             /*
              * Build a discard-only copy of @dst_orig: a new dst using
              * ipv4_dst_blackhole_ops whose input/output handlers are
              * dst_discard/dst_discard_out, attached to @net's loopback device,
              * with the routing fields copied from the original rtable.
              *
              * @dst_orig is released on every path, including allocation failure.
              * Returns the new dst, or ERR_PTR(-ENOMEM) if dst_alloc() failed.
              */
2721         struct rtable *ort = (struct rtable *) dst_orig;
2722         struct rtable *rt;
2723
2724         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2725         if (rt) {
2726                 struct dst_entry *new = &rt->dst;
2727
2728                 new->__use = 1;
2729                 new->input = dst_discard;
2730                 new->output = dst_discard_out;
2731
                     /* dst_alloc() was given a NULL device; attach loopback and hold it. */
2732                 new->dev = net->loopback_dev;
2733                 if (new->dev)
2734                         dev_hold(new->dev);
2735
2736                 rt->rt_is_input = ort->rt_is_input;
2737                 rt->rt_iif = ort->rt_iif;
2738                 rt->rt_pmtu = ort->rt_pmtu;
2739                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2740
                     /* genid is taken from @net's current value, not copied from @ort. */
2741                 rt->rt_genid = rt_genid_ipv4(net);
2742                 rt->rt_flags = ort->rt_flags;
2743                 rt->rt_type = ort->rt_type;
2744                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2745                 rt->rt_gw_family = ort->rt_gw_family;
                     /* Copy only the gateway field that matches the gateway family. */
2746                 if (rt->rt_gw_family == AF_INET)
2747                         rt->rt_gw4 = ort->rt_gw4;
2748                 else if (rt->rt_gw_family == AF_INET6)
2749                         rt->rt_gw6 = ort->rt_gw6;
2750
2751                 INIT_LIST_HEAD(&rt->rt_uncached);
2752         }
2753
2754         dst_release(dst_orig);
2755
2756         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2757 }
2758
2759 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2760                                     const struct sock *sk)
2761 {
2762         struct rtable *route = __ip_route_output_key(net, flp4);
2763         struct dst_entry *xdst;
2764
             /*
              * A failed lookup is handed back unchanged, and a flow without a
              * protocol set skips the xfrm lookup altogether.
              */
2765         if (IS_ERR(route) || !flp4->flowi4_proto)
2766                 return route;
2767
2768         xdst = xfrm_lookup_route(net, &route->dst,
2769                                  flowi4_to_flowi(flp4),
2770                                  sk, 0);
2771
2772         return (struct rtable *)xdst;
2773 }
2774 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2775
2776 /*
      * Append one RTM_NEWROUTE netlink message describing @rt to @skb.
      *
      * @dst and @src are reported as RTA_DST/RTA_SRC; @fl4 may be NULL, in
      * which case the flow-derived attributes (TOS, PREFSRC, MARK, UID, IIF)
      * are omitted.  Returns 0 on success or -EMSGSIZE when the message does
      * not fit, in which case the partial message is cancelled.
      *
      * called with rcu_read_lock held
      */
2777 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2778                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2779                         struct sk_buff *skb, u32 portid, u32 seq,
2780                         unsigned int flags)
2781 {
2782         struct rtmsg *r;
2783         struct nlmsghdr *nlh;
2784         unsigned long expires = 0;
2785         u32 error;
2786         u32 metrics[RTAX_MAX];
2787
2788         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2789         if (!nlh)
2790                 return -EMSGSIZE;
2791
2792         r = nlmsg_data(nlh);
2793         r->rtm_family    = AF_INET;
2794         r->rtm_dst_len  = 32;
2795         r->rtm_src_len  = 0;
2796         r->rtm_tos      = fl4 ? fl4->flowi4_tos : 0;
             /*
              * Table ids of 256 and above are reported as RT_TABLE_COMPAT in the
              * header; the full id always goes into RTA_TABLE.
              */
2797         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2798         if (nla_put_u32(skb, RTA_TABLE, table_id))
2799                 goto nla_put_failure;
2800         r->rtm_type     = rt->rt_type;
2801         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2802         r->rtm_protocol = RTPROT_UNSPEC;
             /* Only the bits of rt_flags above the low 16 are exported. */
2803         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2804         if (rt->rt_flags & RTCF_NOTIFY)
2805                 r->rtm_flags |= RTM_F_NOTIFY;
2806         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2807                 r->rtm_flags |= RTCF_DOREDIRECT;
2808
2809         if (nla_put_in_addr(skb, RTA_DST, dst))
2810                 goto nla_put_failure;
2811         if (src) {
2812                 r->rtm_src_len = 32;
2813                 if (nla_put_in_addr(skb, RTA_SRC, src))
2814                         goto nla_put_failure;
2815         }
2816         if (rt->dst.dev &&
2817             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2818                 goto nla_put_failure;
2819 #ifdef CONFIG_IP_ROUTE_CLASSID
2820         if (rt->dst.tclassid &&
2821             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2822                 goto nla_put_failure;
2823 #endif
             /* For output routes, report the flow's source if it differs from @src. */
2824         if (fl4 && !rt_is_input_route(rt) &&
2825             fl4->saddr != src) {
2826                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2827                         goto nla_put_failure;
2828         }
             /* IPv4 gateways go out as RTA_GATEWAY, IPv6 gateways as RTA_VIA. */
2829         if (rt->rt_uses_gateway) {
2830                 if (rt->rt_gw_family == AF_INET &&
2831                     nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2832                         goto nla_put_failure;
2833                 } else if (rt->rt_gw_family == AF_INET6) {
2834                         int alen = sizeof(struct in6_addr);
2835                         struct nlattr *nla;
2836                         struct rtvia *via;
2837
                             /* NOTE(review): the extra 2 bytes presumably cover rtvia_family - confirm. */
2838                         nla = nla_reserve(skb, RTA_VIA, alen + 2);
2839                         if (!nla)
2840                                 goto nla_put_failure;
2841
2842                         via = nla_data(nla);
2843                         via->rtvia_family = AF_INET6;
2844                         memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2845                 }
2846         }
2847
             /* Turn the absolute expiry into time remaining; 0 if already passed. */
2848         expires = rt->dst.expires;
2849         if (expires) {
2850                 unsigned long now = jiffies;
2851
2852                 if (time_before(now, expires))
2853                         expires -= now;
2854                 else
2855                         expires = 0;
2856         }
2857
             /*
              * Work on a local copy of the metrics so the PMTU overrides below
              * only affect what is reported; they apply while expires is nonzero.
              */
2858         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2859         if (rt->rt_pmtu && expires)
2860                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2861         if (rt->rt_mtu_locked && expires)
2862                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2863         if (rtnetlink_put_metrics(skb, metrics) < 0)
2864                 goto nla_put_failure;
2865
2866         if (fl4) {
2867                 if (fl4->flowi4_mark &&
2868                     nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2869                         goto nla_put_failure;
2870
2871                 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2872                     nla_put_u32(skb, RTA_UID,
2873                                 from_kuid_munged(current_user_ns(),
2874                                                  fl4->flowi4_uid)))
2875                         goto nla_put_failure;
2876
2877                 if (rt_is_input_route(rt)) {
2878 #ifdef CONFIG_IP_MROUTE
2879                         if (ipv4_is_multicast(dst) &&
2880                             !ipv4_is_local_multicast(dst) &&
2881                             IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2882                                 int err = ipmr_get_route(net, skb,
2883                                                          fl4->saddr, fl4->daddr,
2884                                                          r, portid);
2885
                                     /*
                                      * err == 0 returns early, before nlmsg_end() is
                                      * called here.
                                      * NOTE(review): presumably ipmr_get_route() has
                                      * taken care of the reply in that case - confirm.
                                      */
2886                                 if (err <= 0) {
2887                                         if (err == 0)
2888                                                 return 0;
2889                                         goto nla_put_failure;
2890                                 }
2891                         } else
2892 #endif
                                     /* Without CONFIG_IP_MROUTE this runs for every input route. */
2893                                 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2894                                         goto nla_put_failure;
2895                 }
2896         }
2897
2898         error = rt->dst.error;
2899
2900         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2901                 goto nla_put_failure;
2902
2903         nlmsg_end(skb, nlh);
2904         return 0;
2905
     /* Undo the partially built message so @skb is left as it was on entry. */
2906 nla_put_failure:
2907         nlmsg_cancel(skb, nlh);
2908         return -EMSGSIZE;
2909 }
2910
2911 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2912                             struct netlink_callback *cb, u32 table_id,
2913                             struct fnhe_hash_bucket *bucket, int genid,
2914                             int *fa_index, int fa_start, unsigned int flags)
2915 {
             /*
              * Walk every chain of the exception hash @bucket and emit one route
              * message per usable exception.  Returns 0, or the first error from
              * rt_fill_info(); *fa_index is not advanced for the failing entry.
              */
2916         int slot;
2917
2918         for (slot = 0; slot < FNHE_HASH_SIZE; slot++) {
2919                 struct fib_nh_exception *fnhe;
2920
2921                 for (fnhe = rcu_dereference(bucket[slot].chain); fnhe;
2922                      fnhe = rcu_dereference(fnhe->fnhe_next)) {
2923                         struct rtable *rt = NULL;
2924                         int err;
2925
2926                         /*
2927                          * Only look for a cached route when the entry is at
2928                          * or past the resume point, carries the current genid
2929                          * and has not expired.
2930                          */
2931                         if (*fa_index >= fa_start &&
2932                             fnhe->fnhe_genid == genid &&
2933                             !(fnhe->fnhe_expires &&
2934                               time_after(jiffies, fnhe->fnhe_expires))) {
2935                                 rt = rcu_dereference(fnhe->fnhe_rth_input);
2936                                 if (!rt)
2937                                         rt = rcu_dereference(fnhe->fnhe_rth_output);
2938                         }
2939
2940                         if (rt) {
2941                                 err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2942                                                    table_id, NULL, skb,
2943                                                    NETLINK_CB(cb->skb).portid,
2944                                                    cb->nlh->nlmsg_seq, flags);
2945                                 if (err)
2946                                         return err;
2947                         }
2948
2949                         /* Every visited entry counts, dumped or skipped. */
2950                         (*fa_index)++;
2951                 }
2952         }
2953         return 0;
2954 }
2955
2956 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
2957                        u32 table_id, struct fib_info *fi,
2958                        int *fa_index, int fa_start, unsigned int flags)
2959 {
             /*
              * Dump the nexthop exceptions of every live path of @fi, taking the
              * RCU read lock around each path's bucket walk.  Returns 0, or the
              * first error reported by fnhe_dump_bucket().
              */
2960         struct net *net = sock_net(cb->skb->sk);
2961         int genid = fnhe_genid(net);
2962         int path;
2963
2964         for (path = 0; path < fib_info_num_path(fi); path++) {
2965                 struct fib_nh_common *nhc = fib_info_nhc(fi, path);
2966                 struct fnhe_hash_bucket *bucket;
2967                 int err = 0;
2968
                     /* Paths flagged RTNH_F_DEAD are not dumped. */
2969                 if (nhc->nhc_flags & RTNH_F_DEAD)
2970                         continue;
2971
2972                 rcu_read_lock();
2973                 bucket = rcu_dereference(nhc->nhc_exceptions);
2974                 if (bucket)
2975                         err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
2976                                                genid, fa_index, fa_start,
2977                                                flags);
2978                 rcu_read_unlock();
2979                 if (err)
2980                         return err;
2981         }
2982
2983         return 0;
2984 }
2985
2986 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2987                                                    u8 ip_proto, __be16 sport,
2988                                                    __be16 dport)
2989 {
             /*
              * Allocate an skb holding a dummy IPv4 packet (plus a minimal UDP,
              * TCP or ICMP header when @ip_proto is one of those) for a route-get
              * request.  @src, @dst, @sport and @dport are stored as given, i.e.
              * already in network byte order.  Returns NULL if allocation fails.
              *
              * NOTE(review): only the IPv4 header fields assigned below are
              * initialised; skb_put() does not zero the rest - confirm nothing on
              * the lookup path reads tot_len/ttl/id/check/tos.
              */
2990         struct sk_buff *skb;
2991         struct iphdr *iph;
2992
2993         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2994         if (!skb)
2995                 return NULL;
2996
2997         /* Reserve room for dummy headers, this skb can pass
2998          * through good chunk of routing engine.
2999          */
3000         skb_reset_mac_header(skb);
3001         skb_reset_network_header(skb);
3002         skb->protocol = htons(ETH_P_IP);
3003         iph = skb_put(skb, sizeof(struct iphdr));
3004         iph->protocol = ip_proto;
3005         iph->saddr = src;
3006         iph->daddr = dst;
3007         iph->version = 0x4;
3008         iph->frag_off = 0;
3009         iph->ihl = 0x5;
             /* The transport header starts right after the fixed 20-byte IP header. */
3010         skb_set_transport_header(skb, skb->len);
3011
3012         switch (iph->protocol) {
3013         case IPPROTO_UDP: {
3014                 struct udphdr *udph;
3015
3016                 udph = skb_put_zero(skb, sizeof(struct udphdr));
3017                 udph->source = sport;
3018                 udph->dest = dport;
                     /* The UDP length field is big-endian on the wire: convert it. */
3019                 udph->len = htons(sizeof(struct udphdr));
3020                 udph->check = 0;
3021                 break;
3022         }
3023         case IPPROTO_TCP: {
3024                 struct tcphdr *tcph;
3025
3026                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3027                 tcph->source    = sport;
3028                 tcph->dest      = dport;
                     /* doff counts 32-bit words, hence the division by 4. */
3029                 tcph->doff      = sizeof(struct tcphdr) / 4;
3030                 tcph->rst = 1;
3031                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3032                                             src, dst, 0);
3033                 break;
3034         }
3035         case IPPROTO_ICMP: {
3036                 struct icmphdr *icmph;
3037
3038                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3039                 icmph->type = ICMP_ECHO;
3040                 icmph->code = 0;
3041         }
             /* Any other protocol gets the bare IP header only. */
3042         }
3043
3044         return skb;
3045 }
3046
3047 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3048                                        const struct nlmsghdr *nlh,
3049                                        struct nlattr **tb,
3050                                        struct netlink_ext_ack *extack)
3051 {
             /*
              * Validate a route-get request and parse its attributes into @tb.
              * Returns 0 on success or a negative errno; the -EINVAL paths below
              * also set an explanatory message in @extack.
              */
3052         struct rtmsg *rtm;
3053         int i, err;
3054
3055         if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3056                 NL_SET_ERR_MSG(extack,
3057                                "ipv4: Invalid header for route get request");
3058                 return -EINVAL;
3059         }
3060
             /*
              * When netlink_strict_get_check() is false, only the deprecated
              * parser runs and none of the header or attribute checks below
              * apply.
              */
3061         if (!netlink_strict_get_check(skb))
3062                 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3063                                               rtm_ipv4_policy, extack);
3064
             /* Strict mode: prefix lengths must be 0 or 32, other header fields 0. */
3065         rtm = nlmsg_data(nlh);
3066         if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3067             (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3068             rtm->rtm_table || rtm->rtm_protocol ||
3069             rtm->rtm_scope || rtm->rtm_type) {
3070                 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3071                 return -EINVAL;
3072         }
3073
             /* Only these three request flags are accepted. */
3074         if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3075                                RTM_F_LOOKUP_TABLE |
3076                                RTM_F_FIB_MATCH)) {
3077                 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3078                 return -EINVAL;
3079         }
3080
3081         err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3082                                             rtm_ipv4_policy, extack);
3083         if (err)
3084                 return err;
3085
             /* An address attribute requires its header length to be nonzero (i.e. 32). */
3086         if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3087             (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3088                 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3089                 return -EINVAL;
3090         }
3091
             /* Reject any parsed attribute outside the supported set below. */
3092         for (i = 0; i <= RTA_MAX; i++) {
3093                 if (!tb[i])
3094                         continue;
3095
3096                 switch (i) {
3097                 case RTA_IIF:
3098                 case RTA_OIF:
3099                 case RTA_SRC:
3100                 case RTA_DST:
3101                 case RTA_IP_PROTO:
3102                 case RTA_SPORT:
3103                 case RTA_DPORT:
3104                 case RTA_MARK:
3105                 case RTA_UID:
3106                         break;
3107                 default:
3108                         NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3109                         return -EINVAL;
3110                 }
3111         }
3112
3113         return 0;
3114 }
3115
3116 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3117                              struct netlink_ext_ack *extack)
3118 {
             /*
              * Handle a route-get request: validate it, build a dummy packet for
              * the described flow, run an input lookup (when RTA_IIF is given) or
              * an output lookup, and unicast the answer to the requester.
              *
              * With RTM_F_FIB_MATCH the reply describes the matching FIB entry
              * (fib_dump_info()); otherwise it describes the resolved route
              * (rt_fill_info()).  Returns 0 or a negative errno.
              */
3119         struct net *net = sock_net(in_skb->sk);
3120         struct nlattr *tb[RTA_MAX+1];
3121         u32 table_id = RT_TABLE_MAIN;
3122         __be16 sport = 0, dport = 0;
3123         struct fib_result res = {};
3124         u8 ip_proto = IPPROTO_UDP;
3125         struct rtable *rt = NULL;
3126         struct sk_buff *skb;
3127         struct rtmsg *rtm;
3128         struct flowi4 fl4 = {};
3129         __be32 dst = 0;
3130         __be32 src = 0;
3131         kuid_t uid;
3132         u32 iif;
3133         int err;
3134         int mark;
3135
3136         err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3137         if (err < 0)
3138                 return err;
3139
3140         rtm = nlmsg_data(nlh);
3141         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3142         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3143         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3144         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
             /*
              * Without RTA_UID, input lookups carry INVALID_UID and output
              * lookups use the caller's uid.
              */
3145         if (tb[RTA_UID])
3146                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3147         else
3148                 uid = (iif ? INVALID_UID : current_uid());
3149
3150         if (tb[RTA_IP_PROTO]) {
3151                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3152                                                   &ip_proto, AF_INET, extack);
3153                 if (err)
3154                         return err;
3155         }
3156
3157         if (tb[RTA_SPORT])
3158                 sport = nla_get_be16(tb[RTA_SPORT]);
3159
3160         if (tb[RTA_DPORT])
3161                 dport = nla_get_be16(tb[RTA_DPORT]);
3162
             /* This skb is first the probe packet and later reused for the reply. */
3163         skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3164         if (!skb)
3165                 return -ENOBUFS;
3166
3167         fl4.daddr = dst;
3168         fl4.saddr = src;
3169         fl4.flowi4_tos = rtm->rtm_tos;
3170         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3171         fl4.flowi4_mark = mark;
3172         fl4.flowi4_uid = uid;
3173         if (sport)
3174                 fl4.fl4_sport = sport;
3175         if (dport)
3176                 fl4.fl4_dport = dport;
3177         fl4.flowi4_proto = ip_proto;
3178
3179         rcu_read_lock();
3180
3181         if (iif) {
                     /* Input lookup: as if the packet had arrived on device iif. */
3182                 struct net_device *dev;
3183
3184                 dev = dev_get_by_index_rcu(net, iif);
3185                 if (!dev) {
3186                         err = -ENODEV;
3187                         goto errout_rcu;
3188                 }
3189
3190                 fl4.flowi4_iif = iif; /* for rt_fill_info */
3191                 skb->dev        = dev;
3192                 skb->mark       = mark;
3193                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3194                                          dev, &res);
3195
                     /* A lookup that succeeded may still yield an error route. */
3196                 rt = skb_rtable(skb);
3197                 if (err == 0 && rt->dst.error)
3198                         err = -rt->dst.error;
3199         } else {
                     /* Output lookup: locally generated traffic. */
3200                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3201                 skb->dev = net->loopback_dev;
3202                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3203                 err = 0;
3204                 if (IS_ERR(rt))
3205                         err = PTR_ERR(rt);
3206                 else
3207                         skb_dst_set(skb, &rt->dst);
3208         }
3209
3210         if (err)
3211                 goto errout_rcu;
3212
3213         if (rtm->rtm_flags & RTM_F_NOTIFY)
3214                 rt->rt_flags |= RTCF_NOTIFY;
3215
             /* Report the table that matched; 0 when the lookup left res.table unset. */
3216         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3217                 table_id = res.table ? res.table->tb_id : 0;
3218
3219         /* reset skb for netlink reply msg */
3220         skb_trim(skb, 0);
3221         skb_reset_network_header(skb);
3222         skb_reset_transport_header(skb);
3223         skb_reset_mac_header(skb);
3224
3225         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3226                 struct fib_rt_info fri;
3227
                     /* No fib_info to report: use the route type's error, else -EHOSTUNREACH. */
3228                 if (!res.fi) {
3229                         err = fib_props[res.type].error;
3230                         if (!err)
3231                                 err = -EHOSTUNREACH;
3232                         goto errout_rcu;
3233                 }
3234                 fri.fi = res.fi;
3235                 fri.tb_id = table_id;
3236                 fri.dst = res.prefix;
3237                 fri.dst_len = res.prefixlen;
3238                 fri.tos = fl4.flowi4_tos;
3239                 fri.type = rt->rt_type;
3240                 fri.offload = 0;
3241                 fri.trap = 0;
                     /* Take offload/trap from the alias matching the result, if any. */
3242                 if (res.fa_head) {
3243                         struct fib_alias *fa;
3244
3245                         hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3246                                 u8 slen = 32 - fri.dst_len;
3247
3248                                 if (fa->fa_slen == slen &&
3249                                     fa->tb_id == fri.tb_id &&
3250                                     fa->fa_tos == fri.tos &&
3251                                     fa->fa_info == res.fi &&
3252                                     fa->fa_type == fri.type) {
3253                                         fri.offload = fa->offload;
3254                                         fri.trap = fa->trap;
3255                                         break;
3256                                 }
3257                         }
3258                 }
3259                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3260                                     nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3261         } else {
3262                 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3263                                    NETLINK_CB(in_skb).portid,
3264                                    nlh->nlmsg_seq, 0);
3265         }
3266         if (err < 0)
3267                 goto errout_rcu;
3268
3269         rcu_read_unlock();
3270
             /* NOTE(review): presumably rtnl_unicast() takes ownership of skb - confirm. */
3271         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3272
     /* Common exit; reached by fall-through on success and from errout_rcu. */
3273 errout_free:
3274         return err;
     /* Failure inside the RCU section: leave it and free the unsent skb. */
3275 errout_rcu:
3276         rcu_read_unlock();
3277         kfree_skb(skb);
3278         goto errout_free;
3279 }
3280
3281 void ip_rt_multicast_event(struct in_device *in_dev)
3282 {
3283         rt_cache_flush(dev_net(in_dev->dev));
3284 }
3285
3286 #ifdef CONFIG_SYSCTL
3287 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3288 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
3289 static int ip_rt_gc_elasticity __read_mostly    = 8;
3290 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
3291
3292 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3293                                         void __user *buffer,
3294                                         size_t *lenp, loff_t *ppos)
3295 {
3296         struct net *net = (struct net *)__ctl->extra1;
3297
3298         if (write) {
3299                 rt_cache_flush(net);
3300                 fnhe_genid_bump(net);
3301                 return 0;
3302         }
3303
3304         return -EINVAL;
3305 }
3306
3307 static struct ctl_table ipv4_route_table[] = {
3308         {
3309                 .procname       = "gc_thresh",
3310                 .data           = &ipv4_dst_ops.gc_thresh,
3311                 .maxlen         = sizeof(int),
3312                 .mode           = 0644,
3313                 .proc_handler   = proc_dointvec,
3314         },
3315         {
3316                 .procname       = "max_size",
3317                 .data           = &ip_rt_max_size,
3318                 .maxlen         = sizeof(int),
3319                 .mode           = 0644,
3320                 .proc_handler   = proc_dointvec,
3321         },
3322         {
3323                 /*  Deprecated. Use gc_min_interval_ms */
3324
3325                 .procname       = "gc_min_interval",
3326                 .data           = &ip_rt_gc_min_interval,
3327                 .maxlen         = sizeof(int),
3328                 .mode           = 0644,
3329                 .proc_handler   = proc_dointvec_jiffies,
3330         },
3331         {
3332                 .procname       = "gc_min_interval_ms",
3333                 .data           = &ip_rt_gc_min_interval,
3334                 .maxlen         = sizeof(int),
3335                 .mode           = 0644,
3336                 .proc_handler   = proc_dointvec_ms_jiffies,
3337         },
3338         {
3339                 .procname       = "gc_timeout",
3340                 .data           = &ip_rt_gc_timeout,
3341                 .maxlen         = sizeof(int),
3342                 .mode           = 0644,
3343                 .proc_handler   = proc_dointvec_jiffies,
3344         },
3345         {
3346                 .procname       = "gc_interval",
3347                 .data           = &ip_rt_gc_interval,
3348                 .maxlen         = sizeof(int),
3349                 .mode           = 0644,
3350                 .proc_handler   = proc_dointvec_jiffies,
3351         },
3352         {
3353                 .procname       = "redirect_load",
3354                 .data           = &ip_rt_redirect_load,
3355                 .maxlen         = sizeof(int),
3356                 .mode           = 0644,
3357                 .proc_handler   = proc_dointvec,
3358         },
3359         {
3360                 .procname       = "redirect_number",
3361                 .data           = &ip_rt_redirect_number,
3362                 .maxlen         = sizeof(int),
3363                 .mode           = 0644,
3364                 .proc_handler   = proc_dointvec,
3365         },
3366         {
3367                 .procname       = "redirect_silence",
3368                 .data           = &ip_rt_redirect_silence,
3369                 .maxlen         = sizeof(int),
3370                 .mode           = 0644,
3371                 .proc_handler   = proc_dointvec,
3372         },
3373         {
3374                 .procname       = "error_cost",
3375                 .data           = &ip_rt_error_cost,
3376                 .maxlen         = sizeof(int),
3377                 .mode           = 0644,
3378                 .proc_handler   = proc_dointvec,
3379         },
3380         {
3381                 .procname       = "error_burst",
3382                 .data           = &ip_rt_error_burst,
3383                 .maxlen         = sizeof(int),
3384                 .mode           = 0644,
3385                 .proc_handler   = proc_dointvec,
3386         },
3387         {
3388                 .procname       = "gc_elasticity",
3389                 .data           = &ip_rt_gc_elasticity,
3390                 .maxlen         = sizeof(int),
3391                 .mode           = 0644,
3392                 .proc_handler   = proc_dointvec,
3393         },
3394         {
3395                 .procname       = "mtu_expires",
3396                 .data           = &ip_rt_mtu_expires,
3397                 .maxlen         = sizeof(int),
3398                 .mode           = 0644,
3399                 .proc_handler   = proc_dointvec_jiffies,
3400         },
3401         {
3402                 .procname       = "min_pmtu",
3403                 .data           = &ip_rt_min_pmtu,
3404                 .maxlen         = sizeof(int),
3405                 .mode           = 0644,
3406                 .proc_handler   = proc_dointvec_minmax,
3407                 .extra1         = &ip_min_valid_pmtu,
3408         },
3409         {
3410                 .procname       = "min_adv_mss",
3411                 .data           = &ip_rt_min_advmss,
3412                 .maxlen         = sizeof(int),
3413                 .mode           = 0644,
3414                 .proc_handler   = proc_dointvec,
3415         },
3416         { }
3417 };
3418
3419 static const char ipv4_route_flush_procname[] = "flush";
3420
3421 static struct ctl_table ipv4_route_flush_table[] = {
3422         {
3423                 .procname       = ipv4_route_flush_procname,
3424                 .maxlen         = sizeof(int),
3425                 .mode           = 0200,
3426                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3427         },
3428         { },
3429 };
3430
3431 static __net_init int sysctl_route_net_init(struct net *net)
3432 {
3433         struct ctl_table *tbl;
3434
3435         tbl = ipv4_route_flush_table;
3436         if (!net_eq(net, &init_net)) {
3437                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3438                 if (!tbl)
3439                         goto err_dup;
3440
3441                 /* Don't export non-whitelisted sysctls to unprivileged users */
3442                 if (net->user_ns != &init_user_ns) {
3443                         if (tbl[0].procname != ipv4_route_flush_procname)
3444                                 tbl[0].procname = NULL;
3445                 }
3446         }
3447         tbl[0].extra1 = net;
3448
3449         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3450         if (!net->ipv4.route_hdr)
3451                 goto err_reg;
3452         return 0;
3453
3454 err_reg:
3455         if (tbl != ipv4_route_flush_table)
3456                 kfree(tbl);
3457 err_dup:
3458         return -ENOMEM;
3459 }
3460
3461 static __net_exit void sysctl_route_net_exit(struct net *net)
3462 {
3463         struct ctl_table *tbl;
3464
3465         tbl = net->ipv4.route_hdr->ctl_table_arg;
3466         unregister_net_sysctl_table(net->ipv4.route_hdr);
3467         BUG_ON(tbl == ipv4_route_flush_table);
3468         kfree(tbl);
3469 }
3470
3471 static __net_initdata struct pernet_operations sysctl_route_ops = {
3472         .init = sysctl_route_net_init,
3473         .exit = sysctl_route_net_exit,
3474 };
3475 #endif
3476
3477 static __net_init int rt_genid_init(struct net *net)
3478 {
3479         atomic_set(&net->ipv4.rt_genid, 0);
3480         atomic_set(&net->fnhe_genid, 0);
3481         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3482         return 0;
3483 }
3484
3485 static __net_initdata struct pernet_operations rt_genid_ops = {
3486         .init = rt_genid_init,
3487 };
3488
3489 static int __net_init ipv4_inetpeer_init(struct net *net)
3490 {
3491         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3492
3493         if (!bp)
3494                 return -ENOMEM;
3495         inet_peer_base_init(bp);
3496         net->ipv4.peers = bp;
3497         return 0;
3498 }
3499
3500 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3501 {
3502         struct inet_peer_base *bp = net->ipv4.peers;
3503
3504         net->ipv4.peers = NULL;
3505         inetpeer_invalidate_tree(bp);
3506         kfree(bp);
3507 }
3508
3509 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3510         .init   =       ipv4_inetpeer_init,
3511         .exit   =       ipv4_inetpeer_exit,
3512 };
3513
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU accounting area; ip_rt_init() allocates room for 256
 * struct ip_rt_acct entries per CPU.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3517
3518 int __init ip_rt_init(void)
3519 {
3520         int cpu;
3521
3522         ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3523                                   GFP_KERNEL);
3524         if (!ip_idents)
3525                 panic("IP: failed to allocate ip_idents\n");
3526
3527         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3528
3529         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3530         if (!ip_tstamps)
3531                 panic("IP: failed to allocate ip_tstamps\n");
3532
3533         for_each_possible_cpu(cpu) {
3534                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3535
3536                 INIT_LIST_HEAD(&ul->head);
3537                 spin_lock_init(&ul->lock);
3538         }
3539 #ifdef CONFIG_IP_ROUTE_CLASSID
3540         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3541         if (!ip_rt_acct)
3542                 panic("IP: failed to allocate ip_rt_acct\n");
3543 #endif
3544
3545         ipv4_dst_ops.kmem_cachep =
3546                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3547                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3548
3549         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3550
3551         if (dst_entries_init(&ipv4_dst_ops) < 0)
3552                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3553
3554         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3555                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3556
3557         ipv4_dst_ops.gc_thresh = ~0;
3558         ip_rt_max_size = INT_MAX;
3559
3560         devinet_init();
3561         ip_fib_init();
3562
3563         if (ip_rt_proc_init())
3564                 pr_err("Unable to create route proc files\n");
3565 #ifdef CONFIG_XFRM
3566         xfrm_init();
3567         xfrm4_init();
3568 #endif
3569         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3570                       RTNL_FLAG_DOIT_UNLOCKED);
3571
3572 #ifdef CONFIG_SYSCTL
3573         register_pernet_subsys(&sysctl_route_ops);
3574 #endif
3575         register_pernet_subsys(&rt_genid_ops);
3576         register_pernet_subsys(&ipv4_inetpeer_ops);
3577         return 0;
3578 }
3579
3580 #ifdef CONFIG_SYSCTL
3581 /*
3582  * We really need to sanitize the damn ipv4 init order, then all
3583  * this nonsense will go away.
3584  */
3585 void __init ip_static_sysctl_init(void)
3586 {
3587         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3588 }
3589 #endif