// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu,
                                           bool confirm_neigh);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

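/* Hook the IPv4 routing layer into the generic destination cache.
 * Every IPv4 dst_entry is dispatched through these operations for
 * validation, PMTU updates, redirects and neighbour resolution.
 */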
static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

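/* TOS-to-priority mapping used by rt_tos2priority() to pick a packet
 * scheduler band from the TOS bits of the IP header; the table is
 * indexed with IPTOS_TOS(tos) >> 1.
 */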
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct proc_ops rt_cache_proc_ops = {
        .proc_open      = rt_cache_seq_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
};

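/* /proc/net/stat/rt_cache iterator: position 0 yields the header
 * token; position n + 1 yields the counters of possible CPU n.
 */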
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        (*pos)++;
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct proc_ops rt_cpu_proc_ops = {
        .proc_open      = rt_cpu_seq_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", 0444, net->proc_net,
                          &rt_cache_proc_ops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", 0444,
                          net->proc_net_stat, &rt_cpu_proc_ops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create_single("rt_acct", 0, net->proc_net,
                        rt_acct_proc_show);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

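/* Resolve the neighbour entry for a route: prefer the cached IPv4 or
 * IPv6 gateway, and fall back to the packet's (or caller's) destination
 * address for directly connected hosts.
 */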
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        struct neighbour *n;

        rcu_read_lock_bh();

        if (likely(rt->rt_gw_family == AF_INET)) {
                n = ip_neigh_gw4(dev, rt->rt_gw4);
        } else if (rt->rt_gw_family == AF_INET6) {
                n = ip_neigh_gw6(dev, &rt->rt_gw6);
        } else {
                __be32 pkey;

                pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
                n = ip_neigh_gw4(dev, pkey);
        }

        if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
                n = NULL;

        rcu_read_unlock_bh();

        return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;

        if (rt->rt_gw_family == AF_INET) {
                pkey = (const __be32 *)&rt->rt_gw4;
        } else if (rt->rt_gw_family == AF_INET6) {
                return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
        } else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
                return;
        }
        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used.  This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = READ_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);
        /* If UBSAN reports an error here, please make sure your compiler
         * supports -fno-strict-overflow before reporting it; this was a
         * bug in UBSAN that has been fixed in GCC 8.
         */
        return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash, id;

        /* Note the following code is not safe, but this is okay. */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4.ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

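/* Build a flow key for route lookups triggered from ICMP handling.
 * When a socket is supplied, its bound device, mark, TOS and protocol
 * override whatever was derived from the packet headers.  A typical
 * caller (see e.g. ipv4_update_pmtu() below) does:
 *
 *      __build_flow_key(net, &fl4, NULL, iph, oif, tos, prot, mark, 0);
 *      rt = __ip_route_output_key(net, &fl4);
 */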
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_uses_gateway = 1;
                rt->rt_gw_family = AF_INET;
                rt->rt_gw4 = fnhe->fnhe_gw;
        }
}

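/* Record (or refresh) a per-destination nexthop exception carrying
 * redirect and/or PMTU state.  Chains are capped at FNHE_RECLAIM_DEPTH
 * entries; beyond that the stalest entry is recycled instead of
 * allocating a new one.
 */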
static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
                                  __be32 gw, u32 pmtu, bool lock,
                                  unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nhc->nhc_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nhc->nhc_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nhc->nhc_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nhc->nhc_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;

                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

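/* Handle an ICMP redirect: validate the advertised gateway, make sure
 * it resolves to a neighbour, and store it as a nexthop exception so
 * that subsequent lookups use the new gateway.
 */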
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh_common *nhc = FIB_RES_NHC(res);

                                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, after which we stop sending
 *         them altogether, assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start sending
 *         redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything.
         * Set dst.rate_last to the last seen redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->n_redirects == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->n_redirects)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->n_redirects == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

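/* Generate a rate-limited ICMP destination-unreachable error matching
 * rt->dst.error and drop the packet.  The token bucket lives in the
 * inet_peer entry of the offending source address.
 */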
static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

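/* Learn a reduced path MTU for this flow.  Values below ip_rt_min_pmtu
 * are clamped and the resulting exception is locked, so a forged
 * ICMP_FRAG_NEEDED cannot drive the MTU arbitrarily low.
 */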
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        u32 old_mtu = ipv4_mtu(dst);
        struct fib_result res;
        bool lock = false;

        if (ip_mtu_locked(dst))
                return;

        if (old_mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh_common *nhc = FIB_RES_NHC(res);

                update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu,
                              bool confirm_neigh)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        u32 mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD.
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        int res;

        /* Recompile ip options since IPCB may not be valid anymore.
         * Also check we have a reasonable ipv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        if (ip_hdr(skb)->ihl > 5) {
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct iphdr *iph = ip_hdr(skb);
                struct flowi4 fl4 = {
                        .daddr = iph->daddr,
                        .saddr = iph->saddr,
                        .flowi4_tos = RT_TOS(iph->tos),
                        .flowi4_oif = rt->dst.dev->ifindex,
                        .flowi4_iif = skb->dev->ifindex,
                        .flowi4_mark = skb->mark,
                };

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

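/* Effective MTU of a route, in order of preference: an unexpired
 * learned PMTU, the RTAX_MTU metric, then the egress device MTU,
 * minus any lwtunnel encapsulation headroom.
 */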
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nhc->nhc_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        hash += hval;

        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
                                               __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr) {
                        if (fnhe->fnhe_expires &&
                            time_after(jiffies, fnhe->fnhe_expires)) {
                                ip_del_fnhe(nhc, daddr);
                                break;
                        }
                        return fnhe;
                }
        }
        return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
        struct fib_nh_common *nhc = res->nhc;
        struct net_device *dev = nhc->nhc_dev;
        struct fib_info *fi = res->fi;
        u32 mtu = 0;

        if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
            fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
                mtu = fi->fib_mtu;

        if (likely(!mtu)) {
                struct fib_nh_exception *fnhe;

                fnhe = find_exception(nhc, daddr);
                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
                        mtu = fnhe->fnhe_pmtu;
        }

        if (likely(!mtu))
                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

        return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

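/* Apply a nexthop exception to a freshly created route and, if caching
 * was requested, park the route in the exception's input or output
 * slot so later lookups can reuse it.
 */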
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe->fnhe_mtu_locked = false;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gw4) {
                        rt->rt_gw4 = daddr;
                        rt->rt_gw_family = AF_INET;
                }

                if (do_cache) {
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

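/* Cache a route in the nexthop (input slot, or this CPU's output slot).
 * The slot is swapped in with cmpxchg(); if the race is lost the new
 * route simply stays uncached.
 */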
static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nhc->nhc_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
        }
        orig = *p;

        /* hold dst before doing cmpxchg() to avoid race condition
         * on this dst
         */
        dst_hold(&rt->dst);
        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig) {
                        rt_add_uncached_list(orig);
                        dst_release(&orig->dst);
                }
        } else {
                dst_release(&rt->dst);
                ret = false;
        }

        return ret;
}

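/* Routes that could not be cached in a nexthop are kept on per-cpu
 * lists so that rt_flush_dev() can still find them when their device
 * goes away.
 */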
1497 struct uncached_list {
1498         spinlock_t              lock;
1499         struct list_head        head;
1500 };
1501
1502 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1503
1504 void rt_add_uncached_list(struct rtable *rt)
1505 {
1506         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1507
1508         rt->rt_uncached_list = ul;
1509
1510         spin_lock_bh(&ul->lock);
1511         list_add_tail(&rt->rt_uncached, &ul->head);
1512         spin_unlock_bh(&ul->lock);
1513 }
1514
1515 void rt_del_uncached_list(struct rtable *rt)
1516 {
1517         if (!list_empty(&rt->rt_uncached)) {
1518                 struct uncached_list *ul = rt->rt_uncached_list;
1519
1520                 spin_lock_bh(&ul->lock);
1521                 list_del(&rt->rt_uncached);
1522                 spin_unlock_bh(&ul->lock);
1523         }
1524 }
1525
1526 static void ipv4_dst_destroy(struct dst_entry *dst)
1527 {
1528         struct rtable *rt = (struct rtable *)dst;
1529
1530         ip_dst_metrics_put(dst);
1531         rt_del_uncached_list(rt);
1532 }
1533
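/* Detach all uncached routes from @dev: walk every per-cpu uncached
 * list and re-point matching entries at blackhole_netdev, moving the
 * device reference with them, so @dev can be unregistered without
 * waiting for its dst entries to die.
 */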
1534 void rt_flush_dev(struct net_device *dev)
1535 {
1536         struct rtable *rt;
1537         int cpu;
1538
1539         for_each_possible_cpu(cpu) {
1540                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1541
1542                 spin_lock_bh(&ul->lock);
1543                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1544                         if (rt->dst.dev != dev)
1545                                 continue;
1546                         rt->dst.dev = blackhole_netdev;
1547                         dev_hold(rt->dst.dev);
1548                         dev_put(dev);
1549                 }
1550                 spin_unlock_bh(&ul->lock);
1551         }
1552 }
1553
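/* A cached route is reusable only while it is still marked
 * DST_OBSOLETE_FORCE_CHK and its generation id matches the current
 * one, i.e. it has not been invalidated by a rt_genid bump.
 */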
1554 static bool rt_cache_valid(const struct rtable *rt)
1555 {
1556         return  rt &&
1557                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1558                 !rt_is_expired(rt);
1559 }
1560
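/* Fill in the nexthop-derived fields of @rt (gateway, metrics, classid,
 * lwtunnel state) from the FIB result and, when @do_cache is set, try
 * to cache the route in the nexthop exception or in the per-nexthop
 * cache.  Routes that end up uncached are put on the uncached list so
 * they can still be tracked and released.
 */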
1561 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1562                            const struct fib_result *res,
1563                            struct fib_nh_exception *fnhe,
1564                            struct fib_info *fi, u16 type, u32 itag,
1565                            const bool do_cache)
1566 {
1567         bool cached = false;
1568
1569         if (fi) {
1570                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1571
1572                 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1573                         rt->rt_uses_gateway = 1;
1574                         rt->rt_gw_family = nhc->nhc_gw_family;
1575                         /* only INET and INET6 are supported */
1576                         if (likely(nhc->nhc_gw_family == AF_INET))
1577                                 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1578                         else
1579                                 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1580                 }
1581
1582                 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1583
1584 #ifdef CONFIG_IP_ROUTE_CLASSID
1585                 if (nhc->nhc_family == AF_INET) {
1586                         struct fib_nh *nh;
1587
1588                         nh = container_of(nhc, struct fib_nh, nh_common);
1589                         rt->dst.tclassid = nh->nh_tclassid;
1590                 }
1591 #endif
1592                 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1593                 if (unlikely(fnhe))
1594                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1595                 else if (do_cache)
1596                         cached = rt_cache_route(nhc, rt);
1597                 if (unlikely(!cached)) {
1598                         /* Routes we intend to cache in nexthop exception or
1599                          * FIB nexthop have the DST_NOCACHE bit clear.
1600                          * However, if we are unsuccessful at storing this
1601                          * route into the cache we really need to set it.
1602                          */
1603                         if (!rt->rt_gw4) {
1604                                 rt->rt_gw_family = AF_INET;
1605                                 rt->rt_gw4 = daddr;
1606                         }
1607                         rt_add_uncached_list(rt);
1608                 }
1609         } else
1610                 rt_add_uncached_list(rt);
1611
1612 #ifdef CONFIG_IP_ROUTE_CLASSID
1613 #ifdef CONFIG_IP_MULTIPLE_TABLES
1614         set_class_tag(rt, res->tclassid);
1615 #endif
1616         set_class_tag(rt, itag);
1617 #endif
1618 }
1619
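/* Allocate a new IPv4 rtable and initialize it to safe defaults:
 * output is ip_output and, for RTCF_LOCAL routes, input is
 * ip_local_deliver.  Callers fill in the remaining fields.
 */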
1620 struct rtable *rt_dst_alloc(struct net_device *dev,
1621                             unsigned int flags, u16 type,
1622                             bool nopolicy, bool noxfrm)
1623 {
1624         struct rtable *rt;
1625
1626         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1627                        (nopolicy ? DST_NOPOLICY : 0) |
1628                        (noxfrm ? DST_NOXFRM : 0));
1629
1630         if (rt) {
1631                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1632                 rt->rt_flags = flags;
1633                 rt->rt_type = type;
1634                 rt->rt_is_input = 0;
1635                 rt->rt_iif = 0;
1636                 rt->rt_pmtu = 0;
1637                 rt->rt_mtu_locked = 0;
1638                 rt->rt_uses_gateway = 0;
1639                 rt->rt_gw_family = 0;
1640                 rt->rt_gw4 = 0;
1641                 INIT_LIST_HEAD(&rt->rt_uncached);
1642
1643                 rt->dst.output = ip_output;
1644                 if (flags & RTCF_LOCAL)
1645                         rt->dst.input = ip_local_deliver;
1646         }
1647
1648         return rt;
1649 }
1650 EXPORT_SYMBOL(rt_dst_alloc);
1651
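/* Duplicate @rt for @dev, copying the routing fields and dst handlers
 * and taking a fresh reference on the lwtunnel state, so the caller
 * gets a private copy of an existing route.
 */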
1652 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1653 {
1654         struct rtable *new_rt;
1655
1656         new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1657                            rt->dst.flags);
1658
1659         if (new_rt) {
1660                 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1661                 new_rt->rt_flags = rt->rt_flags;
1662                 new_rt->rt_type = rt->rt_type;
1663                 new_rt->rt_is_input = rt->rt_is_input;
1664                 new_rt->rt_iif = rt->rt_iif;
1665                 new_rt->rt_pmtu = rt->rt_pmtu;
1666                 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1667                 new_rt->rt_gw_family = rt->rt_gw_family;
1668                 if (rt->rt_gw_family == AF_INET)
1669                         new_rt->rt_gw4 = rt->rt_gw4;
1670                 else if (rt->rt_gw_family == AF_INET6)
1671                         new_rt->rt_gw6 = rt->rt_gw6;
1672                 INIT_LIST_HEAD(&new_rt->rt_uncached);
1673
1674                 new_rt->dst.input = rt->dst.input;
1675                 new_rt->dst.output = rt->dst.output;
1676                 new_rt->dst.error = rt->dst.error;
1677                 new_rt->dst.lastuse = jiffies;
1678                 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1679         }
1680         return new_rt;
1681 }
1682 EXPORT_SYMBOL(rt_dst_clone);
1683
1684 /* called in rcu_read_lock() section */
1685 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1686                           u8 tos, struct net_device *dev,
1687                           struct in_device *in_dev, u32 *itag)
1688 {
1689         int err;
1690
1691         /* Primary sanity checks. */
1692         if (!in_dev)
1693                 return -EINVAL;
1694
1695         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1696             skb->protocol != htons(ETH_P_IP))
1697                 return -EINVAL;
1698
1699         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1700                 return -EINVAL;
1701
1702         if (ipv4_is_zeronet(saddr)) {
1703                 if (!ipv4_is_local_multicast(daddr) &&
1704                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1705                         return -EINVAL;
1706         } else {
1707                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1708                                           in_dev, itag);
1709                 if (err < 0)
1710                         return err;
1711         }
1712         return 0;
1713 }
1714
1715 /* called in rcu_read_lock() section */
1716 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1717                              u8 tos, struct net_device *dev, int our)
1718 {
1719         struct in_device *in_dev = __in_dev_get_rcu(dev);
1720         unsigned int flags = RTCF_MULTICAST;
1721         struct rtable *rth;
1722         u32 itag = 0;
1723         int err;
1724
1725         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1726         if (err)
1727                 return err;
1728
1729         if (our)
1730                 flags |= RTCF_LOCAL;
1731
1732         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1733                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1734         if (!rth)
1735                 return -ENOBUFS;
1736
1737 #ifdef CONFIG_IP_ROUTE_CLASSID
1738         rth->dst.tclassid = itag;
1739 #endif
1740         rth->dst.output = ip_rt_bug;
1741         rth->rt_is_input = 1;
1742
1743 #ifdef CONFIG_IP_MROUTE
1744         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1745                 rth->dst.input = ip_mr_input;
1746 #endif
1747         RT_CACHE_STAT_INC(in_slow_mc);
1748
1749         skb_dst_set(skb, &rth->dst);
1750         return 0;
1751 }
1752
1753
1754 static void ip_handle_martian_source(struct net_device *dev,
1755                                      struct in_device *in_dev,
1756                                      struct sk_buff *skb,
1757                                      __be32 daddr,
1758                                      __be32 saddr)
1759 {
1760         RT_CACHE_STAT_INC(in_martian_src);
1761 #ifdef CONFIG_IP_ROUTE_VERBOSE
1762         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1763                 /*
1764                  *      RFC1812 recommendation: if the source is martian,
1765                  *      the only hint is the MAC header.
1766                  */
1767                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1768                         &daddr, &saddr, dev->name);
1769                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1770                         print_hex_dump(KERN_WARNING, "ll header: ",
1771                                        DUMP_PREFIX_OFFSET, 16, 1,
1772                                        skb_mac_header(skb),
1773                                        dev->hard_header_len, false);
1774                 }
1775         }
1776 #endif
1777 }
1778
1779 /* called in rcu_read_lock() section */
1780 static int __mkroute_input(struct sk_buff *skb,
1781                            const struct fib_result *res,
1782                            struct in_device *in_dev,
1783                            __be32 daddr, __be32 saddr, u32 tos)
1784 {
1785         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1786         struct net_device *dev = nhc->nhc_dev;
1787         struct fib_nh_exception *fnhe;
1788         struct rtable *rth;
1789         int err;
1790         struct in_device *out_dev;
1791         bool do_cache;
1792         u32 itag = 0;
1793
1794         /* get a working reference to the output device */
1795         out_dev = __in_dev_get_rcu(dev);
1796         if (!out_dev) {
1797                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1798                 return -EINVAL;
1799         }
1800
1801         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1802                                   in_dev->dev, in_dev, &itag);
1803         if (err < 0) {
1804                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1805                                          saddr);
1806
1807                 goto cleanup;
1808         }
1809
1810         do_cache = res->fi && !itag;
1811         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1812             skb->protocol == htons(ETH_P_IP)) {
1813                 __be32 gw;
1814
1815                 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1816                 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1817                     inet_addr_onlink(out_dev, saddr, gw))
1818                         IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1819         }
1820
1821         if (skb->protocol != htons(ETH_P_IP)) {
1822                 /* Not IP (i.e. ARP). Do not create a route if it is
1823                  * invalid for proxy arp. DNAT routes are always valid.
1824                  *
1825                  * The proxy arp feature has been extended to allow ARP
1826                  * replies back on the same interface, to support
1827                  * Private VLAN switch technologies. See arp.c.
1828                  */
1829                 if (out_dev == in_dev &&
1830                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1831                         err = -EINVAL;
1832                         goto cleanup;
1833                 }
1834         }
1835
1836         fnhe = find_exception(nhc, daddr);
1837         if (do_cache) {
1838                 if (fnhe)
1839                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1840                 else
1841                         rth = rcu_dereference(nhc->nhc_rth_input);
1842                 if (rt_cache_valid(rth)) {
1843                         skb_dst_set_noref(skb, &rth->dst);
1844                         goto out;
1845                 }
1846         }
1847
1848         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1849                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1850                            IN_DEV_CONF_GET(out_dev, NOXFRM));
1851         if (!rth) {
1852                 err = -ENOBUFS;
1853                 goto cleanup;
1854         }
1855
1856         rth->rt_is_input = 1;
1857         RT_CACHE_STAT_INC(in_slow_tot);
1858
1859         rth->dst.input = ip_forward;
1860
1861         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1862                        do_cache);
1863         lwtunnel_set_redirect(&rth->dst);
1864         skb_dst_set(skb, &rth->dst);
1865 out:
1866         err = 0;
1867  cleanup:
1868         return err;
1869 }
1870
1871 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1872 /* To make ICMP packets follow the right flow, the multipath hash is
1873  * calculated from the inner IP addresses.
1874  */
1875 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1876                                  struct flow_keys *hash_keys)
1877 {
1878         const struct iphdr *outer_iph = ip_hdr(skb);
1879         const struct iphdr *key_iph = outer_iph;
1880         const struct iphdr *inner_iph;
1881         const struct icmphdr *icmph;
1882         struct iphdr _inner_iph;
1883         struct icmphdr _icmph;
1884
1885         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1886                 goto out;
1887
1888         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1889                 goto out;
1890
1891         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1892                                    &_icmph);
1893         if (!icmph)
1894                 goto out;
1895
1896         if (!icmp_is_err(icmph->type))
1897                 goto out;
1898
1899         inner_iph = skb_header_pointer(skb,
1900                                        outer_iph->ihl * 4 + sizeof(_icmph),
1901                                        sizeof(_inner_iph), &_inner_iph);
1902         if (!inner_iph)
1903                 goto out;
1904
1905         key_iph = inner_iph;
1906 out:
1907         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1908         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1909 }
1910
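/* The hash policy below mirrors net.ipv4.fib_multipath_hash_policy:
 *   0 - layer 3 (addresses only, using the inner IP header of ICMP
 *       errors where applicable),
 *   1 - layer 4 (5-tuple),
 *   2 - layer 3 or, for encapsulated packets, the inner layer 3.
 * As a rough illustration, a host can switch to L4 hashing with:
 *   sysctl -w net.ipv4.fib_multipath_hash_policy=1
 */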
1911 /* if skb is set it will be used and fl4 can be NULL */
1912 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1913                        const struct sk_buff *skb, struct flow_keys *flkeys)
1914 {
1915         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1916         struct flow_keys hash_keys;
1917         u32 mhash;
1918
1919         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1920         case 0:
1921                 memset(&hash_keys, 0, sizeof(hash_keys));
1922                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1923                 if (skb) {
1924                         ip_multipath_l3_keys(skb, &hash_keys);
1925                 } else {
1926                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1927                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1928                 }
1929                 break;
1930         case 1:
1931                 /* skb is currently provided only when forwarding */
1932                 if (skb) {
1933                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1934                         struct flow_keys keys;
1935
1936                         /* short-circuit if we already have L4 hash present */
1937                         if (skb->l4_hash)
1938                                 return skb_get_hash_raw(skb) >> 1;
1939
1940                         memset(&hash_keys, 0, sizeof(hash_keys));
1941
1942                         if (!flkeys) {
1943                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1944                                 flkeys = &keys;
1945                         }
1946
1947                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1948                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1949                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1950                         hash_keys.ports.src = flkeys->ports.src;
1951                         hash_keys.ports.dst = flkeys->ports.dst;
1952                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1953                 } else {
1954                         memset(&hash_keys, 0, sizeof(hash_keys));
1955                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1956                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1957                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1958                         hash_keys.ports.src = fl4->fl4_sport;
1959                         hash_keys.ports.dst = fl4->fl4_dport;
1960                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1961                 }
1962                 break;
1963         case 2:
1964                 memset(&hash_keys, 0, sizeof(hash_keys));
1965                 /* skb is currently provided only when forwarding */
1966                 if (skb) {
1967                         struct flow_keys keys;
1968
1969                         skb_flow_dissect_flow_keys(skb, &keys, 0);
1970                         /* Inner can be v4 or v6 */
1971                         if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1972                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1973                                 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1974                                 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1975                         } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1976                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1977                                 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1978                                 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1979                                 hash_keys.tags.flow_label = keys.tags.flow_label;
1980                                 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1981                         } else {
1982                                 /* Same as case 0 */
1983                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1984                                 ip_multipath_l3_keys(skb, &hash_keys);
1985                         }
1986                 } else {
1987                         /* Same as case 0 */
1988                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1989                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1990                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1991                 }
1992                 break;
1993         }
1994         mhash = flow_hash_from_keys(&hash_keys);
1995
1996         if (multipath_hash)
1997                 mhash = jhash_2words(mhash, multipath_hash, 0);
1998
1999         return mhash >> 1;
2000 }
2001 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2002
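/* Resolve the forwarding route for @skb: when the FIB entry has more
 * than one path, pick a nexthop from the multipath hash, then build
 * (or reuse) the dst via __mkroute_input().
 */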
2003 static int ip_mkroute_input(struct sk_buff *skb,
2004                             struct fib_result *res,
2005                             struct in_device *in_dev,
2006                             __be32 daddr, __be32 saddr, u32 tos,
2007                             struct flow_keys *hkeys)
2008 {
2009 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2010         if (res->fi && fib_info_num_path(res->fi) > 1) {
2011                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2012
2013                 fib_select_multipath(res, h);
2014         }
2015 #endif
2016
2017         /* create a routing cache entry */
2018         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2019 }
2020
2021 /* Implements the same saddr-related checks as ip_route_input_slow(),
2022  * assuming daddr is valid and the destination is not a local broadcast one.
2023  * Uses the provided hint instead of performing a route lookup.
2024  */
2025 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2026                       u8 tos, struct net_device *dev,
2027                       const struct sk_buff *hint)
2028 {
2029         struct in_device *in_dev = __in_dev_get_rcu(dev);
2030         struct rtable *rt = (struct rtable *)hint;
2031         struct net *net = dev_net(dev);
2032         int err = -EINVAL;
2033         u32 tag = 0;
2034
2035         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2036                 goto martian_source;
2037
2038         if (ipv4_is_zeronet(saddr))
2039                 goto martian_source;
2040
2041         if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2042                 goto martian_source;
2043
2044         if (rt->rt_type != RTN_LOCAL)
2045                 goto skip_validate_source;
2046
2047         tos &= IPTOS_RT_MASK;
2048         err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2049         if (err < 0)
2050                 goto martian_source;
2051
2052 skip_validate_source:
2053         skb_dst_copy(skb, hint);
2054         return 0;
2055
2056 martian_source:
2057         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2058         return err;
2059 }
2060
2061 /*
2062  *      NOTE. We drop all packets that have a local source
2063  *      address, because every properly looped back packet
2064  *      must already have the correct destination attached by the output
2065  *      routine.  Changes in the enforced policies must also be applied
2066  *      to ip_route_use_hint().
2067  *
2068  *      This approach solves two big problems:
2069  *      1. Non-simplex devices are handled properly.
2070  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2071  *      called with rcu_read_lock()
2072  */
2073
2074 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2075                                u8 tos, struct net_device *dev,
2076                                struct fib_result *res)
2077 {
2078         struct in_device *in_dev = __in_dev_get_rcu(dev);
2079         struct flow_keys *flkeys = NULL, _flkeys;
2080         struct net    *net = dev_net(dev);
2081         struct ip_tunnel_info *tun_info;
2082         int             err = -EINVAL;
2083         unsigned int    flags = 0;
2084         u32             itag = 0;
2085         struct rtable   *rth;
2086         struct flowi4   fl4;
2087         bool do_cache = true;
2088
2089         /* IP on this device is disabled. */
2090
2091         if (!in_dev)
2092                 goto out;
2093
2094         /* Check for the most weird martians, which cannot be detected
2095            by fib_lookup.
2096          */
2097
2098         tun_info = skb_tunnel_info(skb);
2099         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2100                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2101         else
2102                 fl4.flowi4_tun_key.tun_id = 0;
2103         skb_dst_drop(skb);
2104
2105         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2106                 goto martian_source;
2107
2108         res->fi = NULL;
2109         res->table = NULL;
2110         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2111                 goto brd_input;
2112
2113         /* Accept zero addresses only to limited broadcast;
2114          * I am not even sure whether to fix this or not. Waiting for complaints :-)
2115          */
2116         if (ipv4_is_zeronet(saddr))
2117                 goto martian_source;
2118
2119         if (ipv4_is_zeronet(daddr))
2120                 goto martian_destination;
2121
2122         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2123          * and calls it at most once, when daddr and/or saddr is loopback.
2124          */
2125         if (ipv4_is_loopback(daddr)) {
2126                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2127                         goto martian_destination;
2128         } else if (ipv4_is_loopback(saddr)) {
2129                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2130                         goto martian_source;
2131         }
2132
2133         /*
2134          *      Now we are ready to route packet.
2135          */
2136         fl4.flowi4_oif = 0;
2137         fl4.flowi4_iif = dev->ifindex;
2138         fl4.flowi4_mark = skb->mark;
2139         fl4.flowi4_tos = tos;
2140         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2141         fl4.flowi4_flags = 0;
2142         fl4.daddr = daddr;
2143         fl4.saddr = saddr;
2144         fl4.flowi4_uid = sock_net_uid(net, NULL);
2145
2146         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2147                 flkeys = &_flkeys;
2148         } else {
2149                 fl4.flowi4_proto = 0;
2150                 fl4.fl4_sport = 0;
2151                 fl4.fl4_dport = 0;
2152         }
2153
2154         err = fib_lookup(net, &fl4, res, 0);
2155         if (err != 0) {
2156                 if (!IN_DEV_FORWARD(in_dev))
2157                         err = -EHOSTUNREACH;
2158                 goto no_route;
2159         }
2160
2161         if (res->type == RTN_BROADCAST) {
2162                 if (IN_DEV_BFORWARD(in_dev))
2163                         goto make_route;
2164                 /* do not cache if bc_forwarding is enabled */
2165                 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2166                         do_cache = false;
2167                 goto brd_input;
2168         }
2169
2170         if (res->type == RTN_LOCAL) {
2171                 err = fib_validate_source(skb, saddr, daddr, tos,
2172                                           0, dev, in_dev, &itag);
2173                 if (err < 0)
2174                         goto martian_source;
2175                 goto local_input;
2176         }
2177
2178         if (!IN_DEV_FORWARD(in_dev)) {
2179                 err = -EHOSTUNREACH;
2180                 goto no_route;
2181         }
2182         if (res->type != RTN_UNICAST)
2183                 goto martian_destination;
2184
2185 make_route:
2186         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2187 out:    return err;
2188
2189 brd_input:
2190         if (skb->protocol != htons(ETH_P_IP))
2191                 goto e_inval;
2192
2193         if (!ipv4_is_zeronet(saddr)) {
2194                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2195                                           in_dev, &itag);
2196                 if (err < 0)
2197                         goto martian_source;
2198         }
2199         flags |= RTCF_BROADCAST;
2200         res->type = RTN_BROADCAST;
2201         RT_CACHE_STAT_INC(in_brd);
2202
2203 local_input:
2204         do_cache &= res->fi && !itag;
2205         if (do_cache) {
2206                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2207
2208                 rth = rcu_dereference(nhc->nhc_rth_input);
2209                 if (rt_cache_valid(rth)) {
2210                         skb_dst_set_noref(skb, &rth->dst);
2211                         err = 0;
2212                         goto out;
2213                 }
2214         }
2215
2216         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2217                            flags | RTCF_LOCAL, res->type,
2218                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2219         if (!rth)
2220                 goto e_nobufs;
2221
2222         rth->dst.output = ip_rt_bug;
2223 #ifdef CONFIG_IP_ROUTE_CLASSID
2224         rth->dst.tclassid = itag;
2225 #endif
2226         rth->rt_is_input = 1;
2227
2228         RT_CACHE_STAT_INC(in_slow_tot);
2229         if (res->type == RTN_UNREACHABLE) {
2230                 rth->dst.input = ip_error;
2231                 rth->dst.error = -err;
2232                 rth->rt_flags &= ~RTCF_LOCAL;
2233         }
2234
2235         if (do_cache) {
2236                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2237
2238                 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2239                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2240                         WARN_ON(rth->dst.input == lwtunnel_input);
2241                         rth->dst.lwtstate->orig_input = rth->dst.input;
2242                         rth->dst.input = lwtunnel_input;
2243                 }
2244
2245                 if (unlikely(!rt_cache_route(nhc, rth)))
2246                         rt_add_uncached_list(rth);
2247         }
2248         skb_dst_set(skb, &rth->dst);
2249         err = 0;
2250         goto out;
2251
2252 no_route:
2253         RT_CACHE_STAT_INC(in_no_route);
2254         res->type = RTN_UNREACHABLE;
2255         res->fi = NULL;
2256         res->table = NULL;
2257         goto local_input;
2258
2259         /*
2260          *      Do not cache martian addresses: they should be logged (RFC1812)
2261          */
2262 martian_destination:
2263         RT_CACHE_STAT_INC(in_martian_dst);
2264 #ifdef CONFIG_IP_ROUTE_VERBOSE
2265         if (IN_DEV_LOG_MARTIANS(in_dev))
2266                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2267                                      &daddr, &saddr, dev->name);
2268 #endif
2269
2270 e_inval:
2271         err = -EINVAL;
2272         goto out;
2273
2274 e_nobufs:
2275         err = -ENOBUFS;
2276         goto out;
2277
2278 martian_source:
2279         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2280         goto out;
2281 }
2282
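/* Public entry point for input route lookup; runs ip_route_input_rcu()
 * under rcu_read_lock().  Cached routes may be attached to @skb without
 * a reference (hence _noref), so the dst is only valid in an
 * rcu-protected context.
 */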
2283 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2284                          u8 tos, struct net_device *dev)
2285 {
2286         struct fib_result res;
2287         int err;
2288
2289         tos &= IPTOS_RT_MASK;
2290         rcu_read_lock();
2291         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2292         rcu_read_unlock();
2293
2294         return err;
2295 }
2296 EXPORT_SYMBOL(ip_route_input_noref);
2297
2298 /* called with rcu_read_lock held */
2299 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2300                        u8 tos, struct net_device *dev, struct fib_result *res)
2301 {
2302         /* Multicast recognition logic is moved from the route cache to here.
2303            The problem was that too many Ethernet cards have broken/missing
2304            hardware multicast filters :-( As a result, a host on a multicast
2305            network acquires a lot of useless route cache entries, e.g. for
2306            SDR messages from all over the world. Now we try to get rid of
2307            them. Really, provided the software IP multicast filter is
2308            organized reasonably (at least, hashed), it does not result in a
2309            slowdown compared with route cache reject entries.
2310            Note that multicast routers are not affected, because a
2311            route cache entry is created eventually.
2312          */
2313         if (ipv4_is_multicast(daddr)) {
2314                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2315                 int our = 0;
2316                 int err = -EINVAL;
2317
2318                 if (!in_dev)
2319                         return err;
2320                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2321                                       ip_hdr(skb)->protocol);
2322
2323                 /* check l3 master if no match yet */
2324                 if (!our && netif_is_l3_slave(dev)) {
2325                         struct in_device *l3_in_dev;
2326
2327                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2328                         if (l3_in_dev)
2329                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2330                                                       ip_hdr(skb)->protocol);
2331                 }
2332
2333                 if (our
2334 #ifdef CONFIG_IP_MROUTE
2335                         ||
2336                     (!ipv4_is_local_multicast(daddr) &&
2337                      IN_DEV_MFORWARD(in_dev))
2338 #endif
2339                    ) {
2340                         err = ip_route_input_mc(skb, daddr, saddr,
2341                                                 tos, dev, our);
2342                 }
2343                 return err;
2344         }
2345
2346         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2347 }
2348
2349 /* called with rcu_read_lock() */
2350 static struct rtable *__mkroute_output(const struct fib_result *res,
2351                                        const struct flowi4 *fl4, int orig_oif,
2352                                        struct net_device *dev_out,
2353                                        unsigned int flags)
2354 {
2355         struct fib_info *fi = res->fi;
2356         struct fib_nh_exception *fnhe;
2357         struct in_device *in_dev;
2358         u16 type = res->type;
2359         struct rtable *rth;
2360         bool do_cache;
2361
2362         in_dev = __in_dev_get_rcu(dev_out);
2363         if (!in_dev)
2364                 return ERR_PTR(-EINVAL);
2365
2366         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2367                 if (ipv4_is_loopback(fl4->saddr) &&
2368                     !(dev_out->flags & IFF_LOOPBACK) &&
2369                     !netif_is_l3_master(dev_out))
2370                         return ERR_PTR(-EINVAL);
2371
2372         if (ipv4_is_lbcast(fl4->daddr))
2373                 type = RTN_BROADCAST;
2374         else if (ipv4_is_multicast(fl4->daddr))
2375                 type = RTN_MULTICAST;
2376         else if (ipv4_is_zeronet(fl4->daddr))
2377                 return ERR_PTR(-EINVAL);
2378
2379         if (dev_out->flags & IFF_LOOPBACK)
2380                 flags |= RTCF_LOCAL;
2381
2382         do_cache = true;
2383         if (type == RTN_BROADCAST) {
2384                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2385                 fi = NULL;
2386         } else if (type == RTN_MULTICAST) {
2387                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2388                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2389                                      fl4->flowi4_proto))
2390                         flags &= ~RTCF_LOCAL;
2391                 else
2392                         do_cache = false;
2393                 /* If a multicast route does not exist, use the
2394                  * default one, but do not use a gateway in this case.
2395                  * Yes, it is a hack.
2396                  */
2397                 if (fi && res->prefixlen < 4)
2398                         fi = NULL;
2399         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2400                    (orig_oif != dev_out->ifindex)) {
2401                 /* For local routes that require a particular output interface
2402                  * we do not want to cache the result.  Caching the result
2403                  * causes incorrect behaviour when there are multiple source
2404                  * addresses on the interface, the end result being that if the
2405                  * intended recipient is waiting on that interface for the
2406                  * packet he won't receive it because it will be delivered on
2407                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2408                  * be set to the loopback interface as well.
2409                  */
2410                 do_cache = false;
2411         }
2412
2413         fnhe = NULL;
2414         do_cache &= fi != NULL;
2415         if (fi) {
2416                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2417                 struct rtable __rcu **prth;
2418
2419                 fnhe = find_exception(nhc, fl4->daddr);
2420                 if (!do_cache)
2421                         goto add;
2422                 if (fnhe) {
2423                         prth = &fnhe->fnhe_rth_output;
2424                 } else {
2425                         if (unlikely(fl4->flowi4_flags &
2426                                      FLOWI_FLAG_KNOWN_NH &&
2427                                      !(nhc->nhc_gw_family &&
2428                                        nhc->nhc_scope == RT_SCOPE_LINK))) {
2429                                 do_cache = false;
2430                                 goto add;
2431                         }
2432                         prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2433                 }
2434                 rth = rcu_dereference(*prth);
2435                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2436                         return rth;
2437         }
2438
2439 add:
2440         rth = rt_dst_alloc(dev_out, flags, type,
2441                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2442                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2443         if (!rth)
2444                 return ERR_PTR(-ENOBUFS);
2445
2446         rth->rt_iif = orig_oif;
2447
2448         RT_CACHE_STAT_INC(out_slow_tot);
2449
2450         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2451                 if (flags & RTCF_LOCAL &&
2452                     !(dev_out->flags & IFF_LOOPBACK)) {
2453                         rth->dst.output = ip_mc_output;
2454                         RT_CACHE_STAT_INC(out_slow_mc);
2455                 }
2456 #ifdef CONFIG_IP_MROUTE
2457                 if (type == RTN_MULTICAST) {
2458                         if (IN_DEV_MFORWARD(in_dev) &&
2459                             !ipv4_is_local_multicast(fl4->daddr)) {
2460                                 rth->dst.input = ip_mr_input;
2461                                 rth->dst.output = ip_mc_output;
2462                         }
2463                 }
2464 #endif
2465         }
2466
2467         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2468         lwtunnel_set_redirect(&rth->dst);
2469
2470         return rth;
2471 }
2472
2473 /*
2474  * Major route resolver routine.
2475  */
2476
2477 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2478                                         const struct sk_buff *skb)
2479 {
2480         __u8 tos = RT_FL_TOS(fl4);
2481         struct fib_result res = {
2482                 .type           = RTN_UNSPEC,
2483                 .fi             = NULL,
2484                 .table          = NULL,
2485                 .tclassid       = 0,
2486         };
2487         struct rtable *rth;
2488
2489         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2490         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2491         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2492                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2493
2494         rcu_read_lock();
2495         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2496         rcu_read_unlock();
2497
2498         return rth;
2499 }
2500 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2501
2502 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2503                                             struct fib_result *res,
2504                                             const struct sk_buff *skb)
2505 {
2506         struct net_device *dev_out = NULL;
2507         int orig_oif = fl4->flowi4_oif;
2508         unsigned int flags = 0;
2509         struct rtable *rth;
2510         int err;
2511
2512         if (fl4->saddr) {
2513                 if (ipv4_is_multicast(fl4->saddr) ||
2514                     ipv4_is_lbcast(fl4->saddr) ||
2515                     ipv4_is_zeronet(fl4->saddr)) {
2516                         rth = ERR_PTR(-EINVAL);
2517                         goto out;
2518                 }
2519
2520                 rth = ERR_PTR(-ENETUNREACH);
2521
2522                 /* I removed the check for oif == dev_out->oif here.
2523                    It was wrong for two reasons:
2524                    1. ip_dev_find(net, saddr) can return the wrong iface if
2525                       saddr is assigned to multiple interfaces.
2526                    2. Moreover, we are allowed to send packets with the saddr
2527                       of another iface. --ANK
2528                  */
2529
2530                 if (fl4->flowi4_oif == 0 &&
2531                     (ipv4_is_multicast(fl4->daddr) ||
2532                      ipv4_is_lbcast(fl4->daddr))) {
2533                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2534                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2535                         if (!dev_out)
2536                                 goto out;
2537
2538                         /* Special hack: the user can direct multicasts
2539                            and limited broadcast via the necessary interface
2540                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2541                            This hack is not just for fun, it allows
2542                            vic, vat and friends to work.
2543                            They bind a socket to loopback, set ttl to zero
2544                            and expect that it will work.
2545                            From the viewpoint of the routing cache they are
2546                            broken, because we are not allowed to build a
2547                            multicast path with a loopback source addr (look,
2548                            the routing cache cannot know that ttl is zero, so
2549                            the packet will not leave this host and the route
2550                            is valid). Luckily, this hack is a good workaround.
2551                          */
2552
2553                         fl4->flowi4_oif = dev_out->ifindex;
2554                         goto make_route;
2555                 }
2556
2557                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2558                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2559                         if (!__ip_dev_find(net, fl4->saddr, false))
2560                                 goto out;
2561                 }
2562         }
2563
2564
2565         if (fl4->flowi4_oif) {
2566                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2567                 rth = ERR_PTR(-ENODEV);
2568                 if (!dev_out)
2569                         goto out;
2570
2571                 /* RACE: Check return value of inet_select_addr instead. */
2572                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2573                         rth = ERR_PTR(-ENETUNREACH);
2574                         goto out;
2575                 }
2576                 if (ipv4_is_local_multicast(fl4->daddr) ||
2577                     ipv4_is_lbcast(fl4->daddr) ||
2578                     fl4->flowi4_proto == IPPROTO_IGMP) {
2579                         if (!fl4->saddr)
2580                                 fl4->saddr = inet_select_addr(dev_out, 0,
2581                                                               RT_SCOPE_LINK);
2582                         goto make_route;
2583                 }
2584                 if (!fl4->saddr) {
2585                         if (ipv4_is_multicast(fl4->daddr))
2586                                 fl4->saddr = inet_select_addr(dev_out, 0,
2587                                                               fl4->flowi4_scope);
2588                         else if (!fl4->daddr)
2589                                 fl4->saddr = inet_select_addr(dev_out, 0,
2590                                                               RT_SCOPE_HOST);
2591                 }
2592         }
2593
2594         if (!fl4->daddr) {
2595                 fl4->daddr = fl4->saddr;
2596                 if (!fl4->daddr)
2597                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2598                 dev_out = net->loopback_dev;
2599                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2600                 res->type = RTN_LOCAL;
2601                 flags |= RTCF_LOCAL;
2602                 goto make_route;
2603         }
2604
2605         err = fib_lookup(net, fl4, res, 0);
2606         if (err) {
2607                 res->fi = NULL;
2608                 res->table = NULL;
2609                 if (fl4->flowi4_oif &&
2610                     (ipv4_is_multicast(fl4->daddr) ||
2611                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2612                         /* Apparently, the routing tables are wrong.
2613                            Assume that the destination is on-link.
2614
2615                            WHY? DW.
2616                            Because we are allowed to send to an iface
2617                            even if it has NO routes and NO assigned
2618                            addresses. When oif is specified, the routing
2619                            tables are looked up with only one purpose:
2620                            to check whether the destination is gatewayed,
2621                            rather than direct. Moreover, if MSG_DONTROUTE
2622                            is set, we send the packet, ignoring both the
2623                            routing tables and ifaddr state. --ANK
2624
2625
2626                            We could do this even when oif is unknown,
2627                            as IPv6 likely does, but we do not.
2628                          */
2629
2630                         if (fl4->saddr == 0)
2631                                 fl4->saddr = inet_select_addr(dev_out, 0,
2632                                                               RT_SCOPE_LINK);
2633                         res->type = RTN_UNICAST;
2634                         goto make_route;
2635                 }
2636                 rth = ERR_PTR(err);
2637                 goto out;
2638         }
2639
2640         if (res->type == RTN_LOCAL) {
2641                 if (!fl4->saddr) {
2642                         if (res->fi->fib_prefsrc)
2643                                 fl4->saddr = res->fi->fib_prefsrc;
2644                         else
2645                                 fl4->saddr = fl4->daddr;
2646                 }
2647
2648                 /* L3 master device is the loopback for that domain */
2649                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2650                         net->loopback_dev;
2651
2652                 /* make sure orig_oif points to fib result device even
2653                  * though packet rx/tx happens over loopback or l3mdev
2654                  */
2655                 orig_oif = FIB_RES_OIF(*res);
2656
2657                 fl4->flowi4_oif = dev_out->ifindex;
2658                 flags |= RTCF_LOCAL;
2659                 goto make_route;
2660         }
2661
2662         fib_select_path(net, res, fl4, skb);
2663
2664         dev_out = FIB_RES_DEV(*res);
2665         fl4->flowi4_oif = dev_out->ifindex;
2666
2667
2668 make_route:
2669         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2670
2671 out:
2672         return rth;
2673 }
2674
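/* Blackhole dst_ops back the routes created by ipv4_blackhole_route()
 * below: the check op always fails, PMTU/redirect updates are no-ops
 * and no metrics are ever written, so the dst stays inert while the
 * packets using it are discarded.
 */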
2675 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2676 {
2677         return NULL;
2678 }
2679
2680 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2681 {
2682         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2683
2684         return mtu ? : dst->dev->mtu;
2685 }
2686
2687 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2688                                           struct sk_buff *skb, u32 mtu,
2689                                           bool confirm_neigh)
2690 {
2691 }
2692
2693 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2694                                        struct sk_buff *skb)
2695 {
2696 }
2697
2698 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2699                                           unsigned long old)
2700 {
2701         return NULL;
2702 }
2703
2704 static struct dst_ops ipv4_dst_blackhole_ops = {
2705         .family                 =       AF_INET,
2706         .check                  =       ipv4_blackhole_dst_check,
2707         .mtu                    =       ipv4_blackhole_mtu,
2708         .default_advmss         =       ipv4_default_advmss,
2709         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2710         .redirect               =       ipv4_rt_blackhole_redirect,
2711         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2712         .neigh_lookup           =       ipv4_neigh_lookup,
2713 };
2714
2715 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2716 {
2717         struct rtable *ort = (struct rtable *) dst_orig;
2718         struct rtable *rt;
2719
2720         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2721         if (rt) {
2722                 struct dst_entry *new = &rt->dst;
2723
2724                 new->__use = 1;
2725                 new->input = dst_discard;
2726                 new->output = dst_discard_out;
2727
2728                 new->dev = net->loopback_dev;
2729                 if (new->dev)
2730                         dev_hold(new->dev);
2731
2732                 rt->rt_is_input = ort->rt_is_input;
2733                 rt->rt_iif = ort->rt_iif;
2734                 rt->rt_pmtu = ort->rt_pmtu;
2735                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2736
2737                 rt->rt_genid = rt_genid_ipv4(net);
2738                 rt->rt_flags = ort->rt_flags;
2739                 rt->rt_type = ort->rt_type;
2740                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2741                 rt->rt_gw_family = ort->rt_gw_family;
2742                 if (rt->rt_gw_family == AF_INET)
2743                         rt->rt_gw4 = ort->rt_gw4;
2744                 else if (rt->rt_gw_family == AF_INET6)
2745                         rt->rt_gw6 = ort->rt_gw6;
2746
2747                 INIT_LIST_HEAD(&rt->rt_uncached);
2748         }
2749
2750         dst_release(dst_orig);
2751
2752         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2753 }
2754
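/* Output route lookup that also consults the xfrm layer: when a
 * protocol is given, xfrm_lookup_route() may replace the plain dst
 * with a transformation bundle for a matching policy.
 */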
2755 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2756                                     const struct sock *sk)
2757 {
2758         struct rtable *rt = __ip_route_output_key(net, flp4);
2759
2760         if (IS_ERR(rt))
2761                 return rt;
2762
2763         if (flp4->flowi4_proto)
2764                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2765                                                         flowi4_to_flowi(flp4),
2766                                                         sk, 0);
2767
2768         return rt;
2769 }
2770 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2771
2772 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2773                                       struct net_device *dev,
2774                                       struct net *net, __be32 *saddr,
2775                                       const struct ip_tunnel_info *info,
2776                                       u8 protocol, bool use_cache)
2777 {
2778 #ifdef CONFIG_DST_CACHE
2779         struct dst_cache *dst_cache;
2780 #endif
2781         struct rtable *rt = NULL;
2782         struct flowi4 fl4;
2783         __u8 tos;
2784
2785 #ifdef CONFIG_DST_CACHE
2786         dst_cache = (struct dst_cache *)&info->dst_cache;
2787         if (use_cache) {
2788                 rt = dst_cache_get_ip4(dst_cache, saddr);
2789                 if (rt)
2790                         return rt;
2791         }
2792 #endif
2793         memset(&fl4, 0, sizeof(fl4));
2794         fl4.flowi4_mark = skb->mark;
2795         fl4.flowi4_proto = protocol;
2796         fl4.daddr = info->key.u.ipv4.dst;
2797         fl4.saddr = info->key.u.ipv4.src;
2798         tos = info->key.tos;
2799         fl4.flowi4_tos = RT_TOS(tos);
2800
2801         rt = ip_route_output_key(net, &fl4);
2802         if (IS_ERR(rt)) {
2803                 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2804                 return ERR_PTR(-ENETUNREACH);
2805         }
2806         if (rt->dst.dev == dev) { /* is this necessary? */
2807                 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2808                 ip_rt_put(rt);
2809                 return ERR_PTR(-ELOOP);
2810         }
2811 #ifdef CONFIG_DST_CACHE
2812         if (use_cache)
2813                 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2814 #endif
2815         *saddr = fl4.saddr;
2816         return rt;
2817 }
2818 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2819
2820 /* called with rcu_read_lock held */
2821 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2822                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2823                         struct sk_buff *skb, u32 portid, u32 seq,
2824                         unsigned int flags)
2825 {
2826         struct rtmsg *r;
2827         struct nlmsghdr *nlh;
2828         unsigned long expires = 0;
2829         u32 error;
2830         u32 metrics[RTAX_MAX];
2831
2832         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2833         if (!nlh)
2834                 return -EMSGSIZE;
2835
2836         r = nlmsg_data(nlh);
2837         r->rtm_family    = AF_INET;
2838         r->rtm_dst_len  = 32;
2839         r->rtm_src_len  = 0;
2840         r->rtm_tos      = fl4 ? fl4->flowi4_tos : 0;
2841         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2842         if (nla_put_u32(skb, RTA_TABLE, table_id))
2843                 goto nla_put_failure;
2844         r->rtm_type     = rt->rt_type;
2845         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2846         r->rtm_protocol = RTPROT_UNSPEC;
2847         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2848         if (rt->rt_flags & RTCF_NOTIFY)
2849                 r->rtm_flags |= RTM_F_NOTIFY;
2850         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2851                 r->rtm_flags |= RTCF_DOREDIRECT;
2852
2853         if (nla_put_in_addr(skb, RTA_DST, dst))
2854                 goto nla_put_failure;
2855         if (src) {
2856                 r->rtm_src_len = 32;
2857                 if (nla_put_in_addr(skb, RTA_SRC, src))
2858                         goto nla_put_failure;
2859         }
2860         if (rt->dst.dev &&
2861             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2862                 goto nla_put_failure;
2863 #ifdef CONFIG_IP_ROUTE_CLASSID
2864         if (rt->dst.tclassid &&
2865             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2866                 goto nla_put_failure;
2867 #endif
2868         if (fl4 && !rt_is_input_route(rt) &&
2869             fl4->saddr != src) {
2870                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2871                         goto nla_put_failure;
2872         }
2873         if (rt->rt_uses_gateway) {
2874                 if (rt->rt_gw_family == AF_INET &&
2875                     nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2876                         goto nla_put_failure;
2877                 } else if (rt->rt_gw_family == AF_INET6) {
2878                         int alen = sizeof(struct in6_addr);
2879                         struct nlattr *nla;
2880                         struct rtvia *via;
2881
2882                         nla = nla_reserve(skb, RTA_VIA, alen + 2);
2883                         if (!nla)
2884                                 goto nla_put_failure;
2885
2886                         via = nla_data(nla);
2887                         via->rtvia_family = AF_INET6;
2888                         memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2889                 }
2890         }
2891
	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rt->rt_mtu_locked && expires)
		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4) {
		if (fl4->flowi4_mark &&
		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
			goto nla_put_failure;

		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
		    nla_put_u32(skb, RTA_UID,
				from_kuid_munged(current_user_ns(),
						 fl4->flowi4_uid)))
			goto nla_put_failure;

		if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
			if (ipv4_is_multicast(dst) &&
			    !ipv4_is_local_multicast(dst) &&
			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
				int err = ipmr_get_route(net, skb,
							 fl4->saddr, fl4->daddr,
							 r, portid);

				if (err <= 0) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				}
			} else
#endif
				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
					goto nla_put_failure;
		}
	}

	error = rt->dst.error;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

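/* Walk one next-hop exception hash table and emit an RTM_NEWROUTE
 * entry for every exception that is current (matching genid, not yet
 * expired) and still has a cached route.  @fa_index/@fa_start allow
 * the dump to resume where the previous netlink callback stopped.
 */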
static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
			    struct netlink_callback *cb, u32 table_id,
			    struct fnhe_hash_bucket *bucket, int genid,
			    int *fa_index, int fa_start, unsigned int flags)
{
	int i;

	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
			struct rtable *rt;
			int err;

			if (*fa_index < fa_start)
				goto next;

			if (fnhe->fnhe_genid != genid)
				goto next;

			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires))
				goto next;

			rt = rcu_dereference(fnhe->fnhe_rth_input);
			if (!rt)
				rt = rcu_dereference(fnhe->fnhe_rth_output);
			if (!rt)
				goto next;

			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
					   table_id, NULL, skb,
					   NETLINK_CB(cb->skb).portid,
					   cb->nlh->nlmsg_seq, flags);
			if (err)
				return err;
next:
			(*fa_index)++;
		}
	}

	return 0;
}

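/* Dump the exception caches of every live next hop of @fi.  Each
 * bucket walk runs in its own RCU read-side section; a non-zero
 * return from fnhe_dump_bucket() (typically a full skb) stops the
 * dump so it can be resumed later.
 */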
int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
		       u32 table_id, struct fib_info *fi,
		       int *fa_index, int fa_start, unsigned int flags)
{
	struct net *net = sock_net(cb->skb->sk);
	int nhsel, genid = fnhe_genid(net);

	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
		struct fnhe_hash_bucket *bucket;
		int err;

		if (nhc->nhc_flags & RTNH_F_DEAD)
			continue;

		rcu_read_lock();
		bucket = rcu_dereference(nhc->nhc_exceptions);
		err = 0;
		if (bucket)
			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
					       genid, fa_index, fa_start,
					       flags);
		rcu_read_unlock();
		if (err)
			return err;
	}

	return 0;
}

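/* Fabricate a minimal packet for an RTM_GETROUTE request so the
 * lookup can exercise the same path a real packet would take.  Only
 * the fields the routing code inspects are filled in: an IPv4 header
 * plus, for UDP/TCP/ICMP, a zeroed transport header carrying the
 * requested ports (or an ICMP echo).
 */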
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
						   u8 ip_proto, __be16 sport,
						   __be16 dport)
{
	struct sk_buff *skb;
	struct iphdr *iph;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return NULL;

	/* Reserve room for dummy headers; this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IP);
	iph = skb_put(skb, sizeof(struct iphdr));
	iph->protocol = ip_proto;
	iph->saddr = src;
	iph->daddr = dst;
	iph->version = 0x4;
	iph->frag_off = 0;
	iph->ihl = 0x5;
	skb_set_transport_header(skb, skb->len);

	switch (iph->protocol) {
	case IPPROTO_UDP: {
		struct udphdr *udph;

		udph = skb_put_zero(skb, sizeof(struct udphdr));
		udph->source = sport;
		udph->dest = dport;
		udph->len = htons(sizeof(struct udphdr));
		udph->check = 0;
		break;
	}
	case IPPROTO_TCP: {
		struct tcphdr *tcph;

		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
		tcph->source	= sport;
		tcph->dest	= dport;
		tcph->doff	= sizeof(struct tcphdr) / 4;
		tcph->rst = 1;
		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
					    src, dst, 0);
		break;
	}
	case IPPROTO_ICMP: {
		struct icmphdr *icmph;

		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
		icmph->type = ICMP_ECHO;
		icmph->code = 0;
	}
	}

	return skb;
}

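/* Validate an RTM_GETROUTE request.  Sockets that opted in to strict
 * checking get full validation of the header fields, flags and
 * attributes; legacy sockets fall back to the lenient deprecated
 * parse for backward compatibility.
 */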
static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
				       const struct nlmsghdr *nlh,
				       struct nlattr **tb,
				       struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG(extack,
			       "ipv4: Invalid header for route get request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv4_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
	    rtm->rtm_table || rtm->rtm_protocol ||
	    rtm->rtm_scope || rtm->rtm_type) {
		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
		return -EINVAL;
	}

	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
			       RTM_F_LOOKUP_TABLE |
			       RTM_F_FIB_MATCH)) {
		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv4_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_IIF:
		case RTA_OIF:
		case RTA_SRC:
		case RTA_DST:
		case RTA_IP_PROTO:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_MARK:
		case RTA_UID:
			break;
		default:
			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
			return -EINVAL;
		}
	}

	return 0;
}

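/* doit handler for RTM_GETROUTE: resolve a single route and unicast
 * the answer back to the requester.  This is what e.g. iproute2's
 * "ip route get 192.0.2.1" ends up calling.  With RTA_IIF the request
 * is routed as if it were incoming traffic on that interface,
 * otherwise as locally generated output.
 */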
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi4 fl4 = {};
	__be32 dst = 0;
	__be32 src = 0;
	kuid_t uid;
	u32 iif;
	int err;
	int mark;

	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		return err;

	rtm = nlmsg_data(nlh);
	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, AF_INET, extack);
		if (err)
			return err;
	}

	if (tb[RTA_SPORT])
		sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		dport = nla_get_be16(tb[RTA_DPORT]);

	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
	if (!skb)
		return -ENOBUFS;

	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	if (sport)
		fl4.fl4_sport = sport;
	if (dport)
		fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

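	/* The lookup and the reply construction run inside a single RCU
	 * read-side section, so the resulting dst can be used here
	 * without taking a reference.
	 */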
	rcu_read_lock();

	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev	= dev;
		skb->mark	= mark;
		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
					 dev, &res);

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

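	/* With RTM_F_FIB_MATCH the reply describes the FIB entry the
	 * lookup matched (via fib_dump_info()) rather than the
	 * resulting dst; the fib_alias scan below recovers the
	 * offload/trap bits of that exact entry.
	 */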
	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		struct fib_rt_info fri;

		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		fri.fi = res.fi;
		fri.tb_id = table_id;
		fri.dst = res.prefix;
		fri.dst_len = res.prefixlen;
		fri.tos = fl4.flowi4_tos;
		fri.type = rt->rt_type;
		fri.offload = 0;
		fri.trap = 0;
		if (res.fa_head) {
			struct fib_alias *fa;

			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
				u8 slen = 32 - fri.dst_len;

				if (fa->fa_slen == slen &&
				    fa->tb_id == fri.tb_id &&
				    fa->fa_tos == fri.tos &&
				    fa->fa_info == res.fi &&
				    fa->fa_type == fri.type) {
					fri.offload = fa->offload;
					fri.trap = fa->trap;
					break;
				}
			}
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid,
				   nlh->nlmsg_seq, 0);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;
errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;

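/* Write-only handler behind /proc/sys/net/ipv4/route/flush.  Writing
 * any value, e.g.
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * flushes the routing cache and invalidates all cached next-hop
 * exceptions by bumping the fnhe generation id; reads are rejected.
 */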
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}

static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static const char ipv4_route_flush_procname[] = "flush";

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= ipv4_route_flush_procname,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

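/* Only the "flush" control is registered per namespace; the global
 * tuning knobs in ipv4_route_table are registered once for init_net
 * by ip_static_sysctl_init() below.  Namespaces other than init_net
 * operate on a private copy of the flush table.
 */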
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export non-whitelisted sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns) {
			if (tbl[0].procname != ipv4_route_flush_procname)
				tbl[0].procname = NULL;
		}
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

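/* Seed the per-namespace generation counters.  Bumping rt_genid
 * invalidates every cached route in the netns, and fnhe_genid does
 * the same for next-hop exceptions; dev_addr_genid starts at a
 * random value rather than 0.
 */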
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

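/* Each namespace gets its own inet_peer tree, used for e.g. per-peer
 * ICMP rate-limiting state keyed by remote address; teardown must
 * invalidate the tree before the base is freed.
 */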
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

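/* Boot-time initialization of the IPv4 routing layer: the IP ident
 * arrays, the per-cpu uncached-route lists, the dst slab caches and
 * entry counters, /proc files, XFRM, the RTM_GETROUTE handler and
 * the per-namespace subsystems above.  Any allocation failure here
 * is fatal, hence the panic() calls.
 */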
int __init ip_rt_init(void)
{
	int cpu;

	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
				  GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif