linux-2.6-microblaze.git: net/ipv4/route.c @ 11ddc276776e357b7998142f9e80327ea7e89c03
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD,
 *                                      though our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
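
/*
 * Illustrative note (not part of the original file): the table above is
 * indexed by the four TOS bits of the IPv4 header with the low bit dropped,
 * which is what the rt_tos2priority() helper in <net/route.h> does:
 *
 *         return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 */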

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

/* The IPv4 routing cache was removed in 3.6, so /proc/net/rt_cache is kept
 * only for compatibility and never shows more than the header line.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", 0444, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", 0444,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create_single("rt_acct", 0, net->proc_net,
                        rt_acct_proc_show);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        struct neighbour *n;

        rcu_read_lock_bh();

        if (likely(rt->rt_gw_family == AF_INET)) {
                n = ip_neigh_gw4(dev, rt->rt_gw4);
        } else if (rt->rt_gw_family == AF_INET6) {
                n = ip_neigh_gw6(dev, &rt->rt_gw6);
        } else {
                __be32 pkey;

                pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
                n = ip_neigh_gw4(dev, pkey);
        }

        if (n && !refcount_inc_not_zero(&n->refcnt))
                n = NULL;

        rcu_read_unlock_bh();

        return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;

        if (rt->rt_gw_family == AF_INET) {
                pkey = (const __be32 *)&rt->rt_gw4;
        } else if (rt->rt_gw_family == AF_INET6) {
                return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
        } else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
                return;
        }
        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = READ_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 new, delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
        do {
                old = (u32)atomic_read(p_id);
                new = old + delta + segs;
        } while (atomic_cmpxchg(p_id, old, new) != old);

        return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
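
/*
 * Worked example (illustrative, not part of the original file): if this
 * generator slot was last used 100 jiffies ago, up to 99 extra IDs
 * (prandom_u32_max(100)) are skipped before the caller's block is
 * reserved, so consecutive IDs no longer reveal how many packets were
 * sent in between. A caller reserving IDs for three segments:
 *
 *         u32 first_id = ip_idents_reserve(hash, 3);
 *         // IDs first_id, first_id + 1 and first_id + 2 now belong to the caller.
 */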

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash, id;

        /* Note: the on-demand keying below is racy, but that is okay;
         * at worst the key is initialized concurrently more than once.
         */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4.ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}
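
/*
 * Illustrative note (not part of the original file): jhash_1word() mixes
 * the destination address with a boot-time random seed so bucket placement
 * is unpredictable to outsiders, and hash_32() folds the result down to
 * FNHE_HASH_SHIFT bits. find_exception() below uses it as the bucket index:
 *
 *         fnhe = rcu_dereference(hash[fnhe_hashfun(daddr)].chain);
 */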

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gw_family = AF_INET;
                rt->rt_gw4 = fnhe->fnhe_gw;
        }
}

static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
                                  __be32 gw, u32 pmtu, bool lock,
                                  unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nhc->nhc_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nhc->nhc_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nhc->nhc_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so that anyone caching one rechecks whether this
                 * exception applies to it.
                 */
                rt = rcu_dereference(nhc->nhc_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh_common *nhc = FIB_RES_NHC(res);

                                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything and
         * set peer->rate_last to the time of the last redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}
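
/*
 * Worked example (illustrative, not part of the original file): with the
 * defaults above (ip_rt_redirect_load = HZ/50, ip_rt_redirect_number = 9),
 * the next redirect is only sent once
 *
 *         jiffies > peer->rate_last + (ip_rt_redirect_load << peer->rate_tokens)
 *
 * so the enforced gap doubles each time (40 ms, 80 ms, ...); after nine
 * unanswered redirects nothing is sent until the flow stays quiet for
 * ip_rt_redirect_silence = (HZ/50) << 10, roughly 20 seconds.
 */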

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
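
/*
 * Worked example (illustrative, not part of the original file): the peer's
 * token bucket above gains one token per jiffy, is capped at
 * ip_rt_error_burst (5 * HZ), and each ICMP error costs ip_rt_error_cost
 * (HZ) tokens, so an idle source may trigger a burst of five errors and
 * is then limited to roughly one ICMP error per second.
 */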

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        u32 old_mtu = ipv4_mtu(dst);
        struct fib_result res;
        bool lock = false;

        if (ip_mtu_locked(dst))
                return;

        if (old_mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh_common *nhc = FIB_RES_NHC(res);

                update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}
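
/*
 * Worked example (illustrative, not part of the original file): with
 * ip_rt_min_pmtu = 552 (512 + 20 + 20 above), an ICMP fragmentation-needed
 * message quoting an MTU of 300 does not shrink the path MTU to 300: the
 * exception is clamped to min(old_mtu, 552) and 'lock' is set, so later
 * updates on this route return early via the ip_mtu_locked() check.
 */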

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        u32 mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
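
/*
 * Usage sketch (illustrative, not part of the original file): an ICMP
 * error handler that has extracted the next-hop MTU from a received
 * fragmentation-needed message (here called 'info') could propagate it
 * with:
 *
 *         ipv4_update_pmtu(skb, dev_net(skb->dev), info, 0, iph->protocol);
 */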

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD.
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}
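
/*
 * Usage sketch (illustrative, not part of the original file): callers go
 * through dst_check() from <net/dst.h>, which only invokes ->check() when
 * ->obsolete is non-zero; since IPv4 routes always carry
 * DST_OBSOLETE_FORCE_CHK, a cached route is revalidated on every use:
 *
 *         if (!dst_check(&rt->dst, 0))
 *                 rt = NULL;  // stale: fall back to a full route lookup
 */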

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        int res;

        /* Recompile ip options since IPCB may not be valid anymore.
         * Also check we have a reasonable ipv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        if (ip_hdr(skb)->ihl > 5) {
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it stays out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct iphdr *iph = ip_hdr(skb);
                struct flowi4 fl4 = {
                        .daddr = iph->daddr,
                        .saddr = iph->saddr,
                        .flowi4_tos = RT_TOS(iph->tos),
                        .flowi4_oif = rt->dst.dev->ifindex,
                        .flowi4_iif = skb->dev->ifindex,
                        .flowi4_mark = skb->mark,
                };

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}
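
/*
 * Worked example (illustrative, not part of the original file): for a
 * 1500-byte path MTU the advertised MSS is 1500 - 40 = 1460 bytes; the
 * max_t()/min() pair above bounds the result between ip_rt_min_advmss
 * (256) and IPV4_MAX_PMTU minus the 40 bytes of IPv4 + TCP headers.
 */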

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_gw_family && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nhc->nhc_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        hash += hval;

        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
                                               __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr) {
                        if (fnhe->fnhe_expires &&
                            time_after(jiffies, fnhe->fnhe_expires)) {
                                ip_del_fnhe(nhc, daddr);
                                break;
                        }
                        return fnhe;
                }
        }
        return NULL;
}
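
/*
 * Usage sketch (illustrative, not part of the original file): lookup paths
 * consult the exception table under RCU before trusting the FIB defaults,
 * along these lines:
 *
 *         rcu_read_lock();
 *         fnhe = find_exception(nhc, daddr);
 *         if (fnhe)
 *                 rt_bind_exception(rt, fnhe, daddr, do_cache);
 *         rcu_read_unlock();
 */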

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
        struct fib_nh_common *nhc = res->nhc;
        struct net_device *dev = nhc->nhc_dev;
        struct fib_info *fi = res->fi;
        u32 mtu = 0;

        if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
            fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
                mtu = fi->fib_mtu;

        if (likely(!mtu)) {
                struct fib_nh_exception *fnhe;

                fnhe = find_exception(nhc, daddr);
                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
                        mtu = fnhe->fnhe_pmtu;
        }

        if (likely(!mtu))
                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

        return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}
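
/*
 * Worked example (illustrative, not part of the original file): a route
 * with a locked MTU metric of 1400 yields 1400 regardless of exceptions;
 * otherwise a live nexthop exception of, say, 1300 wins; failing both, a
 * 1500-byte egress device yields 1500, minus any lwtunnel encapsulation
 * headroom.
 */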
1418
1419 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1420                               __be32 daddr, const bool do_cache)
1421 {
1422         bool ret = false;
1423
1424         spin_lock_bh(&fnhe_lock);
1425
1426         if (daddr == fnhe->fnhe_daddr) {
1427                 struct rtable __rcu **porig;
1428                 struct rtable *orig;
1429                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1430
1431                 if (rt_is_input_route(rt))
1432                         porig = &fnhe->fnhe_rth_input;
1433                 else
1434                         porig = &fnhe->fnhe_rth_output;
1435                 orig = rcu_dereference(*porig);
1436
1437                 if (fnhe->fnhe_genid != genid) {
1438                         fnhe->fnhe_genid = genid;
1439                         fnhe->fnhe_gw = 0;
1440                         fnhe->fnhe_pmtu = 0;
1441                         fnhe->fnhe_expires = 0;
1442                         fnhe->fnhe_mtu_locked = false;
1443                         fnhe_flush_routes(fnhe);
1444                         orig = NULL;
1445                 }
1446                 fill_route_from_fnhe(rt, fnhe);
1447                 if (!rt->rt_gw4) {
1448                         rt->rt_gw4 = daddr;
1449                         rt->rt_gw_family = AF_INET;
1450                 }
1451
1452                 if (do_cache) {
1453                         dst_hold(&rt->dst);
1454                         rcu_assign_pointer(*porig, rt);
1455                         if (orig) {
1456                                 dst_dev_put(&orig->dst);
1457                                 dst_release(&orig->dst);
1458                         }
1459                         ret = true;
1460                 }
1461
1462                 fnhe->fnhe_stamp = jiffies;
1463         }
1464         spin_unlock_bh(&fnhe_lock);
1465
1466         return ret;
1467 }
1468
1469 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1470 {
1471         struct rtable *orig, *prev, **p;
1472         bool ret = true;
1473
1474         if (rt_is_input_route(rt)) {
1475                 p = (struct rtable **)&nhc->nhc_rth_input;
1476         } else {
1477                 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1478         }
1479         orig = *p;
1480
1481         /* hold dst before doing cmpxchg() to avoid race condition
1482          * on this dst
1483          */
1484         dst_hold(&rt->dst);
1485         prev = cmpxchg(p, orig, rt);
1486         if (prev == orig) {
1487                 if (orig) {
1488                         dst_dev_put(&orig->dst);
1489                         dst_release(&orig->dst);
1490                 }
1491         } else {
1492                 dst_release(&rt->dst);
1493                 ret = false;
1494         }
1495
1496         return ret;
1497 }
1498
1499 struct uncached_list {
1500         spinlock_t              lock;
1501         struct list_head        head;
1502 };
1503
1504 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1505
1506 void rt_add_uncached_list(struct rtable *rt)
1507 {
1508         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1509
1510         rt->rt_uncached_list = ul;
1511
1512         spin_lock_bh(&ul->lock);
1513         list_add_tail(&rt->rt_uncached, &ul->head);
1514         spin_unlock_bh(&ul->lock);
1515 }
1516
1517 void rt_del_uncached_list(struct rtable *rt)
1518 {
1519         if (!list_empty(&rt->rt_uncached)) {
1520                 struct uncached_list *ul = rt->rt_uncached_list;
1521
1522                 spin_lock_bh(&ul->lock);
1523                 list_del(&rt->rt_uncached);
1524                 spin_unlock_bh(&ul->lock);
1525         }
1526 }
1527
1528 static void ipv4_dst_destroy(struct dst_entry *dst)
1529 {
1530         struct rtable *rt = (struct rtable *)dst;
1531
1532         ip_dst_metrics_put(dst);
1533         rt_del_uncached_list(rt);
1534 }
1535
1536 void rt_flush_dev(struct net_device *dev)
1537 {
1538         struct net *net = dev_net(dev);
1539         struct rtable *rt;
1540         int cpu;
1541
1542         for_each_possible_cpu(cpu) {
1543                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1544
1545                 spin_lock_bh(&ul->lock);
1546                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1547                         if (rt->dst.dev != dev)
1548                                 continue;
1549                         rt->dst.dev = net->loopback_dev;
1550                         dev_hold(rt->dst.dev);
1551                         dev_put(dev);
1552                 }
1553                 spin_unlock_bh(&ul->lock);
1554         }
1555 }
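/* rt_flush_dev() re-parents still-referenced uncached routes onto the
 * loopback device, so an unregistering netdev is never pinned by, or
 * left dangling from, a cached rt->dst.dev pointer.
 */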
1556
1557 static bool rt_cache_valid(const struct rtable *rt)
1558 {
1559         return  rt &&
1560                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1561                 !rt_is_expired(rt);
1562 }
1563
1564 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1565                            const struct fib_result *res,
1566                            struct fib_nh_exception *fnhe,
1567                            struct fib_info *fi, u16 type, u32 itag,
1568                            const bool do_cache)
1569 {
1570         bool cached = false;
1571
1572         if (fi) {
1573                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1574
1575                 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1576                         rt->rt_gw_family = nhc->nhc_gw_family;
1577                         /* only INET and INET6 are supported */
1578                         if (likely(nhc->nhc_gw_family == AF_INET))
1579                                 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1580                         else
1581                                 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1582                 }
1583
1584                 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1585
1586 #ifdef CONFIG_IP_ROUTE_CLASSID
1587                 {
1588                         struct fib_nh *nh;
1589
1590                         nh = container_of(nhc, struct fib_nh, nh_common);
1591                         rt->dst.tclassid = nh->nh_tclassid;
1592                 }
1593 #endif
1594                 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1595                 if (unlikely(fnhe))
1596                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1597                 else if (do_cache)
1598                         cached = rt_cache_route(nhc, rt);
1599                 if (unlikely(!cached)) {
1600                         /* Routes we intend to cache in nexthop exception or
1601                          * FIB nexthop have the DST_NOCACHE bit clear.
1602                          * However, if we are unsuccessful at storing this
1603                          * route into the cache we really need to set it.
1604                          */
1605                         if (!rt->rt_gw4) {
1606                                 rt->rt_gw_family = AF_INET;
1607                                 rt->rt_gw4 = daddr;
1608                         }
1609                         rt_add_uncached_list(rt);
1610                 }
1611         } else
1612                 rt_add_uncached_list(rt);
1613
1614 #ifdef CONFIG_IP_ROUTE_CLASSID
1615 #ifdef CONFIG_IP_MULTIPLE_TABLES
1616         set_class_tag(rt, res->tclassid);
1617 #endif
1618         set_class_tag(rt, itag);
1619 #endif
1620 }
1621
1622 struct rtable *rt_dst_alloc(struct net_device *dev,
1623                             unsigned int flags, u16 type,
1624                             bool nopolicy, bool noxfrm, bool will_cache)
1625 {
1626         struct rtable *rt;
1627
1628         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1629                        (will_cache ? 0 : DST_HOST) |
1630                        (nopolicy ? DST_NOPOLICY : 0) |
1631                        (noxfrm ? DST_NOXFRM : 0));
1632
1633         if (rt) {
1634                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1635                 rt->rt_flags = flags;
1636                 rt->rt_type = type;
1637                 rt->rt_is_input = 0;
1638                 rt->rt_iif = 0;
1639                 rt->rt_pmtu = 0;
1640                 rt->rt_mtu_locked = 0;
1641                 rt->rt_gw_family = 0;
1642                 rt->rt_gw4 = 0;
1643                 INIT_LIST_HEAD(&rt->rt_uncached);
1644
1645                 rt->dst.output = ip_output;
1646                 if (flags & RTCF_LOCAL)
1647                         rt->dst.input = ip_local_deliver;
1648         }
1649
1650         return rt;
1651 }
1652 EXPORT_SYMBOL(rt_dst_alloc);
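/* Minimal caller sketch for rt_dst_alloc() (illustrative only;
 * ip_route_input_mc() below is a real in-tree user).  Error handling
 * and field setup are reduced to the bare minimum:
 *
 *	struct rtable *rt;
 *
 *	rt = rt_dst_alloc(dev, RTCF_BROADCAST | RTCF_LOCAL, RTN_BROADCAST,
 *			  false, false, false);
 *	if (!rt)
 *		return -ENOBUFS;
 *	rt->rt_is_input = 1;
 *	skb_dst_set(skb, &rt->dst);
 */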
1653
1654 /* called in rcu_read_lock() section */
1655 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1656                           u8 tos, struct net_device *dev,
1657                           struct in_device *in_dev, u32 *itag)
1658 {
1659         int err;
1660
1661         /* Primary sanity checks. */
1662         if (!in_dev)
1663                 return -EINVAL;
1664
1665         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1666             skb->protocol != htons(ETH_P_IP))
1667                 return -EINVAL;
1668
1669         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1670                 return -EINVAL;
1671
1672         if (ipv4_is_zeronet(saddr)) {
1673                 if (!ipv4_is_local_multicast(daddr) &&
1674                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1675                         return -EINVAL;
1676         } else {
1677                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1678                                           in_dev, itag);
1679                 if (err < 0)
1680                         return err;
1681         }
1682         return 0;
1683 }
1684
1685 /* called in rcu_read_lock() section */
1686 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1687                              u8 tos, struct net_device *dev, int our)
1688 {
1689         struct in_device *in_dev = __in_dev_get_rcu(dev);
1690         unsigned int flags = RTCF_MULTICAST;
1691         struct rtable *rth;
1692         u32 itag = 0;
1693         int err;
1694
1695         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1696         if (err)
1697                 return err;
1698
1699         if (our)
1700                 flags |= RTCF_LOCAL;
1701
1702         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1703                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1704         if (!rth)
1705                 return -ENOBUFS;
1706
1707 #ifdef CONFIG_IP_ROUTE_CLASSID
1708         rth->dst.tclassid = itag;
1709 #endif
1710         rth->dst.output = ip_rt_bug;
1711         rth->rt_is_input = 1;
1712
1713 #ifdef CONFIG_IP_MROUTE
1714         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1715                 rth->dst.input = ip_mr_input;
1716 #endif
1717         RT_CACHE_STAT_INC(in_slow_mc);
1718
1719         skb_dst_set(skb, &rth->dst);
1720         return 0;
1721 }
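/* Multicast input routes are deliberately left uncached: dst.output is
 * wired to ip_rt_bug() because they must never be used for transmit,
 * and dst.input is switched to ip_mr_input() only when multicast
 * forwarding is enabled for a non link-local group.
 */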
1722
1723
1724 static void ip_handle_martian_source(struct net_device *dev,
1725                                      struct in_device *in_dev,
1726                                      struct sk_buff *skb,
1727                                      __be32 daddr,
1728                                      __be32 saddr)
1729 {
1730         RT_CACHE_STAT_INC(in_martian_src);
1731 #ifdef CONFIG_IP_ROUTE_VERBOSE
1732         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1733                 /*
1734                  *      RFC 1812 recommendation: if the source is
1735                  *      martian, the only hint is the MAC header.
1736                  */
1737                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1738                         &daddr, &saddr, dev->name);
1739                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1740                         print_hex_dump(KERN_WARNING, "ll header: ",
1741                                        DUMP_PREFIX_OFFSET, 16, 1,
1742                                        skb_mac_header(skb),
1743                                        dev->hard_header_len, false);
1744                 }
1745         }
1746 #endif
1747 }
1748
1749 /* called in rcu_read_lock() section */
1750 static int __mkroute_input(struct sk_buff *skb,
1751                            const struct fib_result *res,
1752                            struct in_device *in_dev,
1753                            __be32 daddr, __be32 saddr, u32 tos)
1754 {
1755         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1756         struct net_device *dev = nhc->nhc_dev;
1757         struct fib_nh_exception *fnhe;
1758         struct rtable *rth;
1759         int err;
1760         struct in_device *out_dev;
1761         bool do_cache;
1762         u32 itag = 0;
1763
1764         /* get a working reference to the output device */
1765         out_dev = __in_dev_get_rcu(dev);
1766         if (!out_dev) {
1767                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1768                 return -EINVAL;
1769         }
1770
1771         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1772                                   in_dev->dev, in_dev, &itag);
1773         if (err < 0) {
1774                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1775                                          saddr);
1776
1777                 goto cleanup;
1778         }
1779
1780         do_cache = res->fi && !itag;
1781         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1782             skb->protocol == htons(ETH_P_IP)) {
1783                 __be32 gw;
1784
1785                 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1786                 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1787                     inet_addr_onlink(out_dev, saddr, gw))
1788                         IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1789         }
1790
1791         if (skb->protocol != htons(ETH_P_IP)) {
1792                 /* Not IP (i.e. ARP). Do not create a route if it is
1793                  * invalid for proxy ARP. DNAT routes are always valid.
1794                  *
1795                  * The proxy ARP feature has been extended to allow ARP
1796                  * replies back on the same interface, to support
1797                  * Private VLAN switch technologies. See arp.c.
1798                  */
1799                 if (out_dev == in_dev &&
1800                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1801                         err = -EINVAL;
1802                         goto cleanup;
1803                 }
1804         }
1805
1806         fnhe = find_exception(nhc, daddr);
1807         if (do_cache) {
1808                 if (fnhe)
1809                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1810                 else
1811                         rth = rcu_dereference(nhc->nhc_rth_input);
1812                 if (rt_cache_valid(rth)) {
1813                         skb_dst_set_noref(skb, &rth->dst);
1814                         goto out;
1815                 }
1816         }
1817
1818         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1819                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1820                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1821         if (!rth) {
1822                 err = -ENOBUFS;
1823                 goto cleanup;
1824         }
1825
1826         rth->rt_is_input = 1;
1827         RT_CACHE_STAT_INC(in_slow_tot);
1828
1829         rth->dst.input = ip_forward;
1830
1831         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1832                        do_cache);
1833         lwtunnel_set_redirect(&rth->dst);
1834         skb_dst_set(skb, &rth->dst);
1835 out:
1836         err = 0;
1837  cleanup:
1838         return err;
1839 }
1840
1841 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1842 /* To make ICMP packets follow the right flow, the multipath hash is
1843  * calculated from the inner IP addresses.
1844  */
1845 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1846                                  struct flow_keys *hash_keys)
1847 {
1848         const struct iphdr *outer_iph = ip_hdr(skb);
1849         const struct iphdr *key_iph = outer_iph;
1850         const struct iphdr *inner_iph;
1851         const struct icmphdr *icmph;
1852         struct iphdr _inner_iph;
1853         struct icmphdr _icmph;
1854
1855         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1856                 goto out;
1857
1858         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1859                 goto out;
1860
1861         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1862                                    &_icmph);
1863         if (!icmph)
1864                 goto out;
1865
1866         if (icmph->type != ICMP_DEST_UNREACH &&
1867             icmph->type != ICMP_REDIRECT &&
1868             icmph->type != ICMP_TIME_EXCEEDED &&
1869             icmph->type != ICMP_PARAMETERPROB)
1870                 goto out;
1871
1872         inner_iph = skb_header_pointer(skb,
1873                                        outer_iph->ihl * 4 + sizeof(_icmph),
1874                                        sizeof(_inner_iph), &_inner_iph);
1875         if (!inner_iph)
1876                 goto out;
1877
1878         key_iph = inner_iph;
1879 out:
1880         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1881         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1882 }
1883
1884 /* if skb is set it will be used and fl4 can be NULL */
1885 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1886                        const struct sk_buff *skb, struct flow_keys *flkeys)
1887 {
1888         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1889         struct flow_keys hash_keys;
1890         u32 mhash;
1891
1892         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1893         case 0:
1894                 memset(&hash_keys, 0, sizeof(hash_keys));
1895                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1896                 if (skb) {
1897                         ip_multipath_l3_keys(skb, &hash_keys);
1898                 } else {
1899                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1900                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1901                 }
1902                 break;
1903         case 1:
1904                 /* skb is currently provided only when forwarding */
1905                 if (skb) {
1906                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1907                         struct flow_keys keys;
1908
1909                         /* short-circuit if we already have L4 hash present */
1910                         if (skb->l4_hash)
1911                                 return skb_get_hash_raw(skb) >> 1;
1912
1913                         memset(&hash_keys, 0, sizeof(hash_keys));
1914
1915                         if (!flkeys) {
1916                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1917                                 flkeys = &keys;
1918                         }
1919
1920                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1921                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1922                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1923                         hash_keys.ports.src = flkeys->ports.src;
1924                         hash_keys.ports.dst = flkeys->ports.dst;
1925                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1926                 } else {
1927                         memset(&hash_keys, 0, sizeof(hash_keys));
1928                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1929                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1930                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1931                         hash_keys.ports.src = fl4->fl4_sport;
1932                         hash_keys.ports.dst = fl4->fl4_dport;
1933                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1934                 }
1935                 break;
1936         }
1937         mhash = flow_hash_from_keys(&hash_keys);
1938
1939         if (multipath_hash)
1940                 mhash = jhash_2words(mhash, multipath_hash, 0);
1941
1942         return mhash >> 1;
1943 }
1944 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
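/* The policy switch in fib_multipath_hash() is driven by the
 * net.ipv4.fib_multipath_hash_policy sysctl: 0 hashes on the L3
 * addresses only, 1 on the L4 five-tuple.  For example, to spread
 * flows across nexthops by ports as well:
 *
 *	# sysctl -w net.ipv4.fib_multipath_hash_policy=1
 */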
1945
1946 static int ip_mkroute_input(struct sk_buff *skb,
1947                             struct fib_result *res,
1948                             struct in_device *in_dev,
1949                             __be32 daddr, __be32 saddr, u32 tos,
1950                             struct flow_keys *hkeys)
1951 {
1952 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1953         if (res->fi && res->fi->fib_nhs > 1) {
1954                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1955
1956                 fib_select_multipath(res, h);
1957         }
1958 #endif
1959
1960         /* create a routing cache entry */
1961         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1962 }
1963
1964 /*
1965  *      NOTE. We drop all packets that have a local source address,
1966  *      because every properly looped-back packet must already have
1967  *      the correct destination attached by the output routine.
1968  *
1969  *      This approach solves two big problems:
1970  *      1. Non-simplex devices are handled properly.
1971  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1972  *      Called with rcu_read_lock().
1973  */
1974
1975 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1976                                u8 tos, struct net_device *dev,
1977                                struct fib_result *res)
1978 {
1979         struct in_device *in_dev = __in_dev_get_rcu(dev);
1980         struct flow_keys *flkeys = NULL, _flkeys;
1981         struct net    *net = dev_net(dev);
1982         struct ip_tunnel_info *tun_info;
1983         int             err = -EINVAL;
1984         unsigned int    flags = 0;
1985         u32             itag = 0;
1986         struct rtable   *rth;
1987         struct flowi4   fl4;
1988         bool do_cache;
1989
1990         /* IP on this device is disabled. */
1991
1992         if (!in_dev)
1993                 goto out;
1994
1995         /* Check for the weirdest martians, which cannot be detected
1996          * by fib_lookup.
1997          */
1998
1999         tun_info = skb_tunnel_info(skb);
2000         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2001                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2002         else
2003                 fl4.flowi4_tun_key.tun_id = 0;
2004         skb_dst_drop(skb);
2005
2006         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2007                 goto martian_source;
2008
2009         res->fi = NULL;
2010         res->table = NULL;
2011         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2012                 goto brd_input;
2013
2014         /* Accept zero addresses only for limited broadcast;
2015          * I do not even know whether to fix this. Waiting for complaints :-)
2016          */
2017         if (ipv4_is_zeronet(saddr))
2018                 goto martian_source;
2019
2020         if (ipv4_is_zeronet(daddr))
2021                 goto martian_destination;
2022
2023         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
2024          * and calls it at most once when daddr and/or saddr is a loopback address.
2025          */
2026         if (ipv4_is_loopback(daddr)) {
2027                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2028                         goto martian_destination;
2029         } else if (ipv4_is_loopback(saddr)) {
2030                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2031                         goto martian_source;
2032         }
2033
2034         /*
2035          *      Now we are ready to route the packet.
2036          */
2037         fl4.flowi4_oif = 0;
2038         fl4.flowi4_iif = dev->ifindex;
2039         fl4.flowi4_mark = skb->mark;
2040         fl4.flowi4_tos = tos;
2041         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2042         fl4.flowi4_flags = 0;
2043         fl4.daddr = daddr;
2044         fl4.saddr = saddr;
2045         fl4.flowi4_uid = sock_net_uid(net, NULL);
2046
2047         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2048                 flkeys = &_flkeys;
2049         } else {
2050                 fl4.flowi4_proto = 0;
2051                 fl4.fl4_sport = 0;
2052                 fl4.fl4_dport = 0;
2053         }
2054
2055         err = fib_lookup(net, &fl4, res, 0);
2056         if (err != 0) {
2057                 if (!IN_DEV_FORWARD(in_dev))
2058                         err = -EHOSTUNREACH;
2059                 goto no_route;
2060         }
2061
2062         if (res->type == RTN_BROADCAST) {
2063                 if (IN_DEV_BFORWARD(in_dev))
2064                         goto make_route;
2065                 goto brd_input;
2066         }
2067
2068         if (res->type == RTN_LOCAL) {
2069                 err = fib_validate_source(skb, saddr, daddr, tos,
2070                                           0, dev, in_dev, &itag);
2071                 if (err < 0)
2072                         goto martian_source;
2073                 goto local_input;
2074         }
2075
2076         if (!IN_DEV_FORWARD(in_dev)) {
2077                 err = -EHOSTUNREACH;
2078                 goto no_route;
2079         }
2080         if (res->type != RTN_UNICAST)
2081                 goto martian_destination;
2082
2083 make_route:
2084         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2085 out:    return err;
2086
2087 brd_input:
2088         if (skb->protocol != htons(ETH_P_IP))
2089                 goto e_inval;
2090
2091         if (!ipv4_is_zeronet(saddr)) {
2092                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2093                                           in_dev, &itag);
2094                 if (err < 0)
2095                         goto martian_source;
2096         }
2097         flags |= RTCF_BROADCAST;
2098         res->type = RTN_BROADCAST;
2099         RT_CACHE_STAT_INC(in_brd);
2100
2101 local_input:
2102         do_cache = false;
2103         if (res->fi) {
2104                 if (!itag) {
2105                         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2106
2107                         rth = rcu_dereference(nhc->nhc_rth_input);
2108                         if (rt_cache_valid(rth)) {
2109                                 skb_dst_set_noref(skb, &rth->dst);
2110                                 err = 0;
2111                                 goto out;
2112                         }
2113                         do_cache = true;
2114                 }
2115         }
2116
2117         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2118                            flags | RTCF_LOCAL, res->type,
2119                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2120         if (!rth)
2121                 goto e_nobufs;
2122
2123         rth->dst.output = ip_rt_bug;
2124 #ifdef CONFIG_IP_ROUTE_CLASSID
2125         rth->dst.tclassid = itag;
2126 #endif
2127         rth->rt_is_input = 1;
2128
2129         RT_CACHE_STAT_INC(in_slow_tot);
2130         if (res->type == RTN_UNREACHABLE) {
2131                 rth->dst.input = ip_error;
2132                 rth->dst.error = -err;
2133                 rth->rt_flags   &= ~RTCF_LOCAL;
2134         }
2135
2136         if (do_cache) {
2137                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2138
2139                 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2140                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2141                         WARN_ON(rth->dst.input == lwtunnel_input);
2142                         rth->dst.lwtstate->orig_input = rth->dst.input;
2143                         rth->dst.input = lwtunnel_input;
2144                 }
2145
2146                 if (unlikely(!rt_cache_route(nhc, rth)))
2147                         rt_add_uncached_list(rth);
2148         }
2149         skb_dst_set(skb, &rth->dst);
2150         err = 0;
2151         goto out;
2152
2153 no_route:
2154         RT_CACHE_STAT_INC(in_no_route);
2155         res->type = RTN_UNREACHABLE;
2156         res->fi = NULL;
2157         res->table = NULL;
2158         goto local_input;
2159
2160         /*
2161          *      Do not cache martian addresses: they should be logged (RFC1812)
2162          */
2163 martian_destination:
2164         RT_CACHE_STAT_INC(in_martian_dst);
2165 #ifdef CONFIG_IP_ROUTE_VERBOSE
2166         if (IN_DEV_LOG_MARTIANS(in_dev))
2167                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2168                                      &daddr, &saddr, dev->name);
2169 #endif
2170
2171 e_inval:
2172         err = -EINVAL;
2173         goto out;
2174
2175 e_nobufs:
2176         err = -ENOBUFS;
2177         goto out;
2178
2179 martian_source:
2180         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2181         goto out;
2182 }
2183
2184 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2185                          u8 tos, struct net_device *dev)
2186 {
2187         struct fib_result res;
2188         int err;
2189
2190         tos &= IPTOS_RT_MASK;
2191         rcu_read_lock();
2192         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2193         rcu_read_unlock();
2194
2195         return err;
2196 }
2197 EXPORT_SYMBOL(ip_route_input_noref);
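/* Receive-path usage sketch (illustrative; ip_rcv_finish_core() does
 * essentially this, after which dst_input() invokes rt->dst.input):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err;
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *	return dst_input(skb);
 */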
2198
2199 /* called with rcu_read_lock held */
2200 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2201                        u8 tos, struct net_device *dev, struct fib_result *res)
2202 {
2203         /* Multicast recognition logic was moved from the route cache
2204            to here. The problem was that too many Ethernet cards have
2205            broken/missing hardware multicast filters :-( As a result, a
2206            host on a multicast network acquires a lot of useless route
2207            cache entries, e.g. SDR messages from all over the world. Now
2208            we try to get rid of them. Provided the software IP multicast
2209            filter is organized reasonably (at least, hashed), it does not
2210            result in a slowdown compared with route cache reject entries.
2211            Note that multicast routers are not affected, because a route
2212            cache entry is created eventually.
2213          */
2214         if (ipv4_is_multicast(daddr)) {
2215                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2216                 int our = 0;
2217                 int err = -EINVAL;
2218
2219                 if (!in_dev)
2220                         return err;
2221                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2222                                       ip_hdr(skb)->protocol);
2223
2224                 /* check l3 master if no match yet */
2225                 if (!our && netif_is_l3_slave(dev)) {
2226                         struct in_device *l3_in_dev;
2227
2228                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2229                         if (l3_in_dev)
2230                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2231                                                       ip_hdr(skb)->protocol);
2232                 }
2233
2234                 if (our
2235 #ifdef CONFIG_IP_MROUTE
2236                         ||
2237                     (!ipv4_is_local_multicast(daddr) &&
2238                      IN_DEV_MFORWARD(in_dev))
2239 #endif
2240                    ) {
2241                         err = ip_route_input_mc(skb, daddr, saddr,
2242                                                 tos, dev, our);
2243                 }
2244                 return err;
2245         }
2246
2247         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2248 }
2249
2250 /* called with rcu_read_lock() */
2251 static struct rtable *__mkroute_output(const struct fib_result *res,
2252                                        const struct flowi4 *fl4, int orig_oif,
2253                                        struct net_device *dev_out,
2254                                        unsigned int flags)
2255 {
2256         struct fib_info *fi = res->fi;
2257         struct fib_nh_exception *fnhe;
2258         struct in_device *in_dev;
2259         u16 type = res->type;
2260         struct rtable *rth;
2261         bool do_cache;
2262
2263         in_dev = __in_dev_get_rcu(dev_out);
2264         if (!in_dev)
2265                 return ERR_PTR(-EINVAL);
2266
2267         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2268                 if (ipv4_is_loopback(fl4->saddr) &&
2269                     !(dev_out->flags & IFF_LOOPBACK) &&
2270                     !netif_is_l3_master(dev_out))
2271                         return ERR_PTR(-EINVAL);
2272
2273         if (ipv4_is_lbcast(fl4->daddr))
2274                 type = RTN_BROADCAST;
2275         else if (ipv4_is_multicast(fl4->daddr))
2276                 type = RTN_MULTICAST;
2277         else if (ipv4_is_zeronet(fl4->daddr))
2278                 return ERR_PTR(-EINVAL);
2279
2280         if (dev_out->flags & IFF_LOOPBACK)
2281                 flags |= RTCF_LOCAL;
2282
2283         do_cache = true;
2284         if (type == RTN_BROADCAST) {
2285                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2286                 fi = NULL;
2287         } else if (type == RTN_MULTICAST) {
2288                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2289                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2290                                      fl4->flowi4_proto))
2291                         flags &= ~RTCF_LOCAL;
2292                 else
2293                         do_cache = false;
2294                 /* If a multicast route does not exist, use the
2295                  * default one, but do not use a gateway in this case.
2296                  * Yes, it is a hack.
2297                  */
2298                 if (fi && res->prefixlen < 4)
2299                         fi = NULL;
2300         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2301                    (orig_oif != dev_out->ifindex)) {
2302                 /* For local routes that require a particular output interface
2303                  * we do not want to cache the result.  Caching the result
2304                  * causes incorrect behaviour when there are multiple source
2305                  * addresses on the interface: if the intended recipient is
2306                  * waiting on that interface for the packet, it won't receive
2307                  * it, because the packet will be delivered on the loopback
2308                  * interface and the IP_PKTINFO ipi_ifindex will be set to
2309                  * the loopback interface as well.
2310                  */
2311                 do_cache = false;
2312         }
2313
2314         fnhe = NULL;
2315         do_cache &= fi != NULL;
2316         if (fi) {
2317                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2318                 struct rtable __rcu **prth;
2319
2320                 fnhe = find_exception(nhc, fl4->daddr);
2321                 if (!do_cache)
2322                         goto add;
2323                 if (fnhe) {
2324                         prth = &fnhe->fnhe_rth_output;
2325                 } else {
2326                         if (unlikely(fl4->flowi4_flags &
2327                                      FLOWI_FLAG_KNOWN_NH &&
2328                                      !(nhc->nhc_gw_family &&
2329                                        nhc->nhc_scope == RT_SCOPE_LINK))) {
2330                                 do_cache = false;
2331                                 goto add;
2332                         }
2333                         prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2334                 }
2335                 rth = rcu_dereference(*prth);
2336                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2337                         return rth;
2338         }
2339
2340 add:
2341         rth = rt_dst_alloc(dev_out, flags, type,
2342                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2343                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2344                            do_cache);
2345         if (!rth)
2346                 return ERR_PTR(-ENOBUFS);
2347
2348         rth->rt_iif = orig_oif;
2349
2350         RT_CACHE_STAT_INC(out_slow_tot);
2351
2352         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2353                 if (flags & RTCF_LOCAL &&
2354                     !(dev_out->flags & IFF_LOOPBACK)) {
2355                         rth->dst.output = ip_mc_output;
2356                         RT_CACHE_STAT_INC(out_slow_mc);
2357                 }
2358 #ifdef CONFIG_IP_MROUTE
2359                 if (type == RTN_MULTICAST) {
2360                         if (IN_DEV_MFORWARD(in_dev) &&
2361                             !ipv4_is_local_multicast(fl4->daddr)) {
2362                                 rth->dst.input = ip_mr_input;
2363                                 rth->dst.output = ip_mc_output;
2364                         }
2365                 }
2366 #endif
2367         }
2368
2369         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2370         lwtunnel_set_redirect(&rth->dst);
2371
2372         return rth;
2373 }
2374
2375 /*
2376  * Major route resolver routine.
2377  */
2378
2379 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2380                                         const struct sk_buff *skb)
2381 {
2382         __u8 tos = RT_FL_TOS(fl4);
2383         struct fib_result res = {
2384                 .type           = RTN_UNSPEC,
2385                 .fi             = NULL,
2386                 .table          = NULL,
2387                 .tclassid       = 0,
2388         };
2389         struct rtable *rth;
2390
2391         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2392         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2393         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2394                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2395
2396         rcu_read_lock();
2397         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2398         rcu_read_unlock();
2399
2400         return rth;
2401 }
2402 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
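/* Output lookup sketch (illustrative): many callers go through the
 * __ip_route_output_key()/ip_route_output_flow() wrappers, which boil
 * down to:
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.flowi4_proto	= IPPROTO_UDP,
 *	};
 *	struct rtable *rt;
 *
 *	rt = ip_route_output_key_hash(net, &fl4, NULL);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */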
2403
2404 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2405                                             struct fib_result *res,
2406                                             const struct sk_buff *skb)
2407 {
2408         struct net_device *dev_out = NULL;
2409         int orig_oif = fl4->flowi4_oif;
2410         unsigned int flags = 0;
2411         struct rtable *rth;
2412         int err = -ENETUNREACH;
2413
2414         if (fl4->saddr) {
2415                 rth = ERR_PTR(-EINVAL);
2416                 if (ipv4_is_multicast(fl4->saddr) ||
2417                     ipv4_is_lbcast(fl4->saddr) ||
2418                     ipv4_is_zeronet(fl4->saddr))
2419                         goto out;
2420
2421                 /* I removed the check for oif == dev_out->oif here.
2422                    It was wrong for two reasons:
2423                    1. ip_dev_find(net, saddr) can return the wrong iface
2424                       if saddr is assigned to multiple interfaces.
2425                    2. Moreover, we are allowed to send packets with the
2426                       saddr of another iface. --ANK
2427                  */
2428
2429                 if (fl4->flowi4_oif == 0 &&
2430                     (ipv4_is_multicast(fl4->daddr) ||
2431                      ipv4_is_lbcast(fl4->daddr))) {
2432                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2433                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2434                         if (!dev_out)
2435                                 goto out;
2436
2437                         /* Special hack: the user can direct multicasts
2438                            and limited broadcast via the necessary interface
2439                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2440                            This hack is not just for fun, it allows
2441                            vic, vat and friends to work.
2442                            They bind a socket to loopback, set the ttl to
2443                            zero and expect that it will work.
2444                            From the viewpoint of the routing cache they are
2445                            broken: we are not allowed to build a multicast
2446                            path with a loopback source addr (the routing
2447                            cache cannot know that the ttl is zero, so the
2448                            packet will not leave this host and the route is
2449                            valid). Luckily, this hack is a good workaround.
2450                          */
2451
2452                         fl4->flowi4_oif = dev_out->ifindex;
2453                         goto make_route;
2454                 }
2455
2456                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2457                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2458                         if (!__ip_dev_find(net, fl4->saddr, false))
2459                                 goto out;
2460                 }
2461         }
2462
2463
2464         if (fl4->flowi4_oif) {
2465                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2466                 rth = ERR_PTR(-ENODEV);
2467                 if (!dev_out)
2468                         goto out;
2469
2470                 /* RACE: Check return value of inet_select_addr instead. */
2471                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2472                         rth = ERR_PTR(-ENETUNREACH);
2473                         goto out;
2474                 }
2475                 if (ipv4_is_local_multicast(fl4->daddr) ||
2476                     ipv4_is_lbcast(fl4->daddr) ||
2477                     fl4->flowi4_proto == IPPROTO_IGMP) {
2478                         if (!fl4->saddr)
2479                                 fl4->saddr = inet_select_addr(dev_out, 0,
2480                                                               RT_SCOPE_LINK);
2481                         goto make_route;
2482                 }
2483                 if (!fl4->saddr) {
2484                         if (ipv4_is_multicast(fl4->daddr))
2485                                 fl4->saddr = inet_select_addr(dev_out, 0,
2486                                                               fl4->flowi4_scope);
2487                         else if (!fl4->daddr)
2488                                 fl4->saddr = inet_select_addr(dev_out, 0,
2489                                                               RT_SCOPE_HOST);
2490                 }
2491         }
2492
2493         if (!fl4->daddr) {
2494                 fl4->daddr = fl4->saddr;
2495                 if (!fl4->daddr)
2496                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2497                 dev_out = net->loopback_dev;
2498                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2499                 res->type = RTN_LOCAL;
2500                 flags |= RTCF_LOCAL;
2501                 goto make_route;
2502         }
2503
2504         err = fib_lookup(net, fl4, res, 0);
2505         if (err) {
2506                 res->fi = NULL;
2507                 res->table = NULL;
2508                 if (fl4->flowi4_oif &&
2509                     (ipv4_is_multicast(fl4->daddr) ||
2510                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2511                         /* Apparently, the routing tables are wrong.
2512                            Assume that the destination is on-link.
2513
2514                            WHY? DW.
2515                            Because we are allowed to send to an iface
2516                            even if it has NO routes and NO assigned
2517                            addresses. When oif is specified, the routing
2518                            tables are looked up with only one purpose:
2519                            to catch whether the destination is gatewayed
2520                            rather than direct. Moreover, if MSG_DONTROUTE
2521                            is set, we send the packet, ignoring both the
2522                            routing tables and the ifaddr state. --ANK
2523
2524                            We could do this even if oif is unknown,
2525                            as IPv6 likely does, but we do not.
2526                          */
2528
2529                         if (fl4->saddr == 0)
2530                                 fl4->saddr = inet_select_addr(dev_out, 0,
2531                                                               RT_SCOPE_LINK);
2532                         res->type = RTN_UNICAST;
2533                         goto make_route;
2534                 }
2535                 rth = ERR_PTR(err);
2536                 goto out;
2537         }
2538
2539         if (res->type == RTN_LOCAL) {
2540                 if (!fl4->saddr) {
2541                         if (res->fi->fib_prefsrc)
2542                                 fl4->saddr = res->fi->fib_prefsrc;
2543                         else
2544                                 fl4->saddr = fl4->daddr;
2545                 }
2546
2547                 /* L3 master device is the loopback for that domain */
2548                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2549                         net->loopback_dev;
2550
2551                 /* make sure orig_oif points to fib result device even
2552                  * though packet rx/tx happens over loopback or l3mdev
2553                  */
2554                 orig_oif = FIB_RES_OIF(*res);
2555
2556                 fl4->flowi4_oif = dev_out->ifindex;
2557                 flags |= RTCF_LOCAL;
2558                 goto make_route;
2559         }
2560
2561         fib_select_path(net, res, fl4, skb);
2562
2563         dev_out = FIB_RES_DEV(*res);
2564         fl4->flowi4_oif = dev_out->ifindex;
2565
2566
2567 make_route:
2568         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2569
2570 out:
2571         return rth;
2572 }
2573
2574 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2575 {
2576         return NULL;
2577 }
2578
2579 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2580 {
2581         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2582
2583         return mtu ? : dst->dev->mtu;
2584 }
2585
2586 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2587                                           struct sk_buff *skb, u32 mtu)
2588 {
2589 }
2590
2591 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2592                                        struct sk_buff *skb)
2593 {
2594 }
2595
2596 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2597                                           unsigned long old)
2598 {
2599         return NULL;
2600 }
2601
2602 static struct dst_ops ipv4_dst_blackhole_ops = {
2603         .family                 =       AF_INET,
2604         .check                  =       ipv4_blackhole_dst_check,
2605         .mtu                    =       ipv4_blackhole_mtu,
2606         .default_advmss         =       ipv4_default_advmss,
2607         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2608         .redirect               =       ipv4_rt_blackhole_redirect,
2609         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2610         .neigh_lookup           =       ipv4_neigh_lookup,
2611 };
2612
2613 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2614 {
2615         struct rtable *ort = (struct rtable *) dst_orig;
2616         struct rtable *rt;
2617
2618         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2619         if (rt) {
2620                 struct dst_entry *new = &rt->dst;
2621
2622                 new->__use = 1;
2623                 new->input = dst_discard;
2624                 new->output = dst_discard_out;
2625
2626                 new->dev = net->loopback_dev;
2627                 if (new->dev)
2628                         dev_hold(new->dev);
2629
2630                 rt->rt_is_input = ort->rt_is_input;
2631                 rt->rt_iif = ort->rt_iif;
2632                 rt->rt_pmtu = ort->rt_pmtu;
2633                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2634
2635                 rt->rt_genid = rt_genid_ipv4(net);
2636                 rt->rt_flags = ort->rt_flags;
2637                 rt->rt_type = ort->rt_type;
2638                 rt->rt_gw_family = ort->rt_gw_family;
2639                 if (rt->rt_gw_family == AF_INET)
2640                         rt->rt_gw4 = ort->rt_gw4;
2641                 else if (rt->rt_gw_family == AF_INET6)
2642                         rt->rt_gw6 = ort->rt_gw6;
2643
2644                 INIT_LIST_HEAD(&rt->rt_uncached);
2645         }
2646
2647         dst_release(dst_orig);
2648
2649         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2650 }
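/* A blackhole route copies the identity of the original dst but
 * discards all traffic (dst_discard/dst_discard_out) and ignores
 * PMTU/redirect updates; xfrm_lookup() hands one back when packets
 * must be dropped quietly, e.g. while IPsec states are still being
 * resolved.
 */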
2651
2652 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2653                                     const struct sock *sk)
2654 {
2655         struct rtable *rt = __ip_route_output_key(net, flp4);
2656
2657         if (IS_ERR(rt))
2658                 return rt;
2659
2660         if (flp4->flowi4_proto)
2661                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2662                                                         flowi4_to_flowi(flp4),
2663                                                         sk, 0);
2664
2665         return rt;
2666 }
2667 EXPORT_SYMBOL_GPL(ip_route_output_flow);
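/* Connected-socket sketch (illustrative; tcp_v4_connect() does
 * something along these lines):
 *
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	sk_setup_caps(sk, &rt->dst);
 */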
2668
2669 /* called with rcu_read_lock held */
2670 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2671                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2672                         struct sk_buff *skb, u32 portid, u32 seq)
2673 {
2674         struct rtmsg *r;
2675         struct nlmsghdr *nlh;
2676         unsigned long expires = 0;
2677         u32 error;
2678         u32 metrics[RTAX_MAX];
2679
2680         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2681         if (!nlh)
2682                 return -EMSGSIZE;
2683
2684         r = nlmsg_data(nlh);
2685         r->rtm_family    = AF_INET;
2686         r->rtm_dst_len  = 32;
2687         r->rtm_src_len  = 0;
2688         r->rtm_tos      = fl4->flowi4_tos;
2689         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2690         if (nla_put_u32(skb, RTA_TABLE, table_id))
2691                 goto nla_put_failure;
2692         r->rtm_type     = rt->rt_type;
2693         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2694         r->rtm_protocol = RTPROT_UNSPEC;
2695         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2696         if (rt->rt_flags & RTCF_NOTIFY)
2697                 r->rtm_flags |= RTM_F_NOTIFY;
2698         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2699                 r->rtm_flags |= RTCF_DOREDIRECT;
2700
2701         if (nla_put_in_addr(skb, RTA_DST, dst))
2702                 goto nla_put_failure;
2703         if (src) {
2704                 r->rtm_src_len = 32;
2705                 if (nla_put_in_addr(skb, RTA_SRC, src))
2706                         goto nla_put_failure;
2707         }
2708         if (rt->dst.dev &&
2709             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2710                 goto nla_put_failure;
2711 #ifdef CONFIG_IP_ROUTE_CLASSID
2712         if (rt->dst.tclassid &&
2713             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2714                 goto nla_put_failure;
2715 #endif
2716         if (!rt_is_input_route(rt) &&
2717             fl4->saddr != src) {
2718                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2719                         goto nla_put_failure;
2720         }
2721         if (rt->rt_gw_family == AF_INET &&
2722             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2723                 goto nla_put_failure;
2724         } else if (rt->rt_gw_family == AF_INET6) {
2725                 int alen = sizeof(struct in6_addr);
2726                 struct nlattr *nla;
2727                 struct rtvia *via;
2728
2729                 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2730                 if (!nla)
2731                         goto nla_put_failure;
2732
2733                 via = nla_data(nla);
2734                 via->rtvia_family = AF_INET6;
2735                 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2736         }
2737
2738         expires = rt->dst.expires;
2739         if (expires) {
2740                 unsigned long now = jiffies;
2741
2742                 if (time_before(now, expires))
2743                         expires -= now;
2744                 else
2745                         expires = 0;
2746         }
2747
2748         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2749         if (rt->rt_pmtu && expires)
2750                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2751         if (rt->rt_mtu_locked && expires)
2752                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2753         if (rtnetlink_put_metrics(skb, metrics) < 0)
2754                 goto nla_put_failure;
2755
2756         if (fl4->flowi4_mark &&
2757             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2758                 goto nla_put_failure;
2759
2760         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2761             nla_put_u32(skb, RTA_UID,
2762                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2763                 goto nla_put_failure;
2764
2765         error = rt->dst.error;
2766
2767         if (rt_is_input_route(rt)) {
2768 #ifdef CONFIG_IP_MROUTE
2769                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2770                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2771                         int err = ipmr_get_route(net, skb,
2772                                                  fl4->saddr, fl4->daddr,
2773                                                  r, portid);
2774
2775                         if (err <= 0) {
2776                                 if (err == 0)
2777                                         return 0;
2778                                 goto nla_put_failure;
2779                         }
2780                 } else
2781 #endif
2782                         if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2783                                 goto nla_put_failure;
2784         }
2785
2786         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2787                 goto nla_put_failure;
2788
2789         nlmsg_end(skb, nlh);
2790         return 0;
2791
2792 nla_put_failure:
2793         nlmsg_cancel(skb, nlh);
2794         return -EMSGSIZE;
2795 }
2796
2797 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2798                                                    u8 ip_proto, __be16 sport,
2799                                                    __be16 dport)
2800 {
2801         struct sk_buff *skb;
2802         struct iphdr *iph;
2803
2804         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2805         if (!skb)
2806                 return NULL;
2807
2808         /* Reserve room for dummy headers; this skb can pass
2809          * through a good chunk of the routing engine.
2810          */
2811         skb_reset_mac_header(skb);
2812         skb_reset_network_header(skb);
2813         skb->protocol = htons(ETH_P_IP);
2814         iph = skb_put(skb, sizeof(struct iphdr));
2815         iph->protocol = ip_proto;
2816         iph->saddr = src;
2817         iph->daddr = dst;
2818         iph->version = 0x4;
2819         iph->frag_off = 0;
2820         iph->ihl = 0x5;
2821         skb_set_transport_header(skb, skb->len);
2822
2823         switch (iph->protocol) {
2824         case IPPROTO_UDP: {
2825                 struct udphdr *udph;
2826
2827                 udph = skb_put_zero(skb, sizeof(struct udphdr));
2828                 udph->source = sport;
2829                 udph->dest = dport;
2830                 udph->len = sizeof(struct udphdr);
2831                 udph->check = 0;
2832                 break;
2833         }
2834         case IPPROTO_TCP: {
2835                 struct tcphdr *tcph;
2836
2837                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2838                 tcph->source    = sport;
2839                 tcph->dest      = dport;
2840                 tcph->doff      = sizeof(struct tcphdr) / 4;
2841                 tcph->rst = 1;
2842                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2843                                             src, dst, 0);
2844                 break;
2845         }
2846         case IPPROTO_ICMP: {
2847                 struct icmphdr *icmph;
2848
2849                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2850                 icmph->type = ICMP_ECHO;
2851                 icmph->code = 0;
2852         }
2853         }
2854
2855         return skb;
2856 }
2857
static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
				       const struct nlmsghdr *nlh,
				       struct nlattr **tb,
				       struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG(extack,
			       "ipv4: Invalid header for route get request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv4_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
	    rtm->rtm_table || rtm->rtm_protocol ||
	    rtm->rtm_scope || rtm->rtm_type) {
		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
		return -EINVAL;
	}

	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
			       RTM_F_LOOKUP_TABLE |
			       RTM_F_FIB_MATCH)) {
		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv4_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_IIF:
		case RTA_OIF:
		case RTA_SRC:
		case RTA_DST:
		case RTA_IP_PROTO:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_MARK:
		case RTA_UID:
			break;
		default:
			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
			return -EINVAL;
		}
	}

	return 0;
}

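/* doit() handler for RTM_GETROUTE, the request behind "ip route get".
 * A dummy packet is built for the flow described by the attributes and
 * resolved through the input path (if RTA_IIF is given) or the output
 * path; the same skb is then trimmed and reused for the netlink reply.
 */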
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi4 fl4 = {};
	__be32 dst = 0;
	__be32 src = 0;
	kuid_t uid;
	u32 iif;
	int err;
	int mark;

	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		return err;

	rtm = nlmsg_data(nlh);
	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, AF_INET, extack);
		if (err)
			return err;
	}

	if (tb[RTA_SPORT])
		sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		dport = nla_get_be16(tb[RTA_DPORT]);

	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
	if (!skb)
		return -ENOBUFS;

	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	if (sport)
		fl4.fl4_sport = sport;
	if (dport)
		fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

	rcu_read_lock();

	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev	= dev;
		skb->mark	= mark;
		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
					 dev, &res);

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
				    rt->rt_type, res.prefix, res.prefixlen,
				    fl4.flowi4_tos, res.fi, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;
errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}

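/* Called when a device's multicast configuration changes; simply
 * invalidates this netns's cached routes.
 */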
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly    = 8;
static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;

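/* Write-only handler backing /proc/sys/net/ipv4/route/flush.  Any write,
 * e.g. "echo 1 > /proc/sys/net/ipv4/route/flush", invalidates the
 * netns's cached routes and next-hop exceptions; reads fail with
 * -EINVAL.
 */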
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}

static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/*  Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

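/* Global tunables under /proc/sys/net/ipv4/route/ (sysctl
 * net.ipv4.route.*), registered once for init_net by
 * ip_static_sysctl_init() below; only the "flush" entry is per netns.
 */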
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

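/* Register the "flush" sysctl for a netns.  Every namespace except
 * init_net gets its own copy of the template so the tables are not
 * shared, and the entry is hidden from namespaces not owned by
 * init_user_ns by clearing its procname.
 */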
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			tbl[0].procname = NULL;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

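/* Seed the per-netns generation counters.  Bumping rt_genid invalidates
 * cached dsts, bumping fnhe_genid invalidates cached next-hop
 * exceptions; dev_addr_genid starts from a random value.
 */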
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

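/* Per-netns inetpeer base: long-lived per-remote-host state (for
 * instance ICMP rate limiting) hangs off this tree and is torn down
 * with the namespace.
 */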
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

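/* Boot-time setup: allocate the IP ID arrays, the per-cpu uncached-route
 * lists and the dst slab, initialize devinet, the FIB, /proc and xfrm,
 * and register the RTM_GETROUTE handler plus the pernet subsystems.
 */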
int __init ip_rt_init(void)
{
	int cpu;

	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
				  GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif