// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

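/* Most of the tunables below are exposed to userspace as sysctls under
 * /proc/sys/net/ipv4/route/ (the sysctl table is registered elsewhere
 * in this file); the values here are only the boot-time defaults.
 */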
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;

/*
 *      Interface to generic destination cache.
 */

INDIRECT_CALLABLE_SCOPE
struct dst_entry        *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
INDIRECT_CALLABLE_SCOPE
unsigned int            ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu,
                                           bool confirm_neigh);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);

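/* IPv4 keeps route metrics in the shared fib_info rather than in the
 * dst itself, so the generic dst layer should never ask us to
 * copy-on-write them; warn loudly if it ever does.
 */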
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

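/* ip_tos2prio maps the four TOS bits (indexed by tos >> 1, see
 * rt_tos2priority()) to pfifo_fast priority bands.  Entries come in
 * pairs because the lowest TOS bit (the old "minimize cost" bit, since
 * reused by ECN) does not change the priority class.
 */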
#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
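/* The IPv4 routing cache was removed in v3.6, but /proc/net/rt_cache is
 * kept around for backward compatibility; it now prints only the
 * header line.
 */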
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct proc_ops rt_cache_proc_ops = {
        .proc_open      = rt_cache_seq_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        (*pos)++;
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct proc_ops rt_cpu_proc_ops = {
        .proc_open      = rt_cpu_seq_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", 0444, net->proc_net,
                          &rt_cache_proc_ops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", 0444,
                          net->proc_net_stat, &rt_cpu_proc_ops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create_single("rt_acct", 0, net->proc_net,
                        rt_acct_proc_show);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        struct neighbour *n;

        rcu_read_lock_bh();

        if (likely(rt->rt_gw_family == AF_INET)) {
                n = ip_neigh_gw4(dev, rt->rt_gw4);
        } else if (rt->rt_gw_family == AF_INET6) {
                n = ip_neigh_gw6(dev, &rt->rt_gw6);
        } else {
                __be32 pkey;

                pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
                n = ip_neigh_gw4(dev, pkey);
        }

        if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
                n = NULL;

        rcu_read_unlock_bh();

        return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;

        if (rt->rt_gw_family == AF_INET) {
                pkey = (const __be32 *)&rt->rt_gw4;
        } else if (rt->rt_gw_family == AF_INET6) {
                return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
        } else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
                return;
        }
        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;
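/* ip_idents/ip_tstamps implement IP_IDENTS_SZ independent IP-ID
 * generators; __ip_select_ident() below spreads flows across them with
 * a keyed siphash of (daddr, saddr, protocol).
 */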

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = READ_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* If UBSAN reports an error here, please make sure your compiler
         * supports -fno-strict-overflow before reporting it; that was a
         * bug in UBSAN which has been fixed in GCC-8.
         */
        return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash, id;

        /* The lazy initialization below is racy, but that is okay:
         * concurrent writers may race on ip_id_key, and whatever key
         * results is still unpredictable, which is all we need.
         */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4.ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

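/* Build a flowi4 for routing an ICMP-quoted packet: lookup keys come
 * from the offending IP header, except that a supplied socket overrides
 * the bound device, mark, TOS and protocol.
 */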
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

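/* Called when a bucket chain has grown past FNHE_RECLAIM_DEPTH: recycle
 * the exception with the oldest fnhe_stamp instead of allocating yet
 * another entry.
 */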
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32)daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_uses_gateway = 1;
                rt->rt_gw_family = AF_INET;
                rt->rt_gw4 = fnhe->fnhe_gw;
        }
}

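/* Record (or refresh) a per-destination exception on a nexthop, as
 * learned from an ICMP redirect (gw) or fragmentation-needed (pmtu)
 * message.  Routes already cached against the nexthop are invalidated
 * so they re-evaluate the exception on their next dst_check().
 */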
static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
                                  __be32 gw, u32 pmtu, bool lock,
                                  unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nhc->nhc_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nhc->nhc_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nhc->nhc_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nhc->nhc_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

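/* Validate an ICMP redirect before acting on it: the message must carry
 * one of the four REDIR codes, the route must currently use the old
 * gateway, the interface must accept redirects, and the new gateway
 * must be a sane on-link unicast address.  Only then is a nexthop
 * exception recorded via update_or_create_fnhe().
 */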
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh_common *nhc;

                                fib_select_path(net, &res, fl4, skb);
                                nhc = FIB_RES_NHC(res);
                                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
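/* With the default tunables above (ip_rt_redirect_load = HZ/50,
 * ip_rt_redirect_number = 9) the gap enforced before each subsequent
 * redirect doubles from 40 ms up to ~5.1 s; after nine unanswered
 * redirects we stay quiet until ip_rt_redirect_silence (~20.5 s) passes
 * without further redirect-worthy traffic.
 */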

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->n_redirects == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->n_redirects)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->n_redirects == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

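/* Generate a rate-limited ICMP destination-unreachable for a packet
 * whose input route carries an error.  A per-peer token bucket
 * (ip_rt_error_burst / ip_rt_error_cost) bounds how fast we answer any
 * single source.
 */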
static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

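/* Core PMTU update: remember the new MTU in a nexthop exception for
 * ip_rt_mtu_expires.  An MTU below ip_rt_min_pmtu is not honoured
 * directly; the value is clamped and the exception is locked against
 * further PMTU updates.
 */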
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct net *net = dev_net(dst->dev);
        struct fib_result res;
        bool lock = false;
        u32 old_mtu;

        if (ip_mtu_locked(dst))
                return;

        old_mtu = ipv4_mtu(dst);
        if (old_mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(net, fl4, &res, 0) == 0) {
                struct fib_nh_common *nhc;

                fib_select_path(net, &res, fl4, NULL);
                nhc = FIB_RES_NHC(res);
                update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu,
                              bool confirm_neigh)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);

        /* Don't make lookup fail for bridged encapsulations */
        if (skb && netif_is_any_bridge_port(skb->dev))
                fl4.flowi4_oif = 0;

        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        u32 mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
                                                         u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD.
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}
EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        int res;

        /* Recompile ip options since IPCB may not be valid anymore.
         * Also check we have a reasonable ipv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        if (ip_hdr(skb)->ihl > 5) {
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct iphdr *iph = ip_hdr(skb);
                struct flowi4 fl4 = {
                        .daddr = iph->daddr,
                        .saddr = iph->saddr,
                        .flowi4_tos = RT_TOS(iph->tos),
                        .flowi4_oif = rt->dst.dev->ifindex,
                        .flowi4_iif = skb->dev->ifindex,
                        .flowi4_mark = skb->mark,
                };

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *)dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
EXPORT_INDIRECT_CALLABLE(ipv4_mtu);

static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nhc->nhc_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        hash += hval;

        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
                                               __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr) {
                        if (fnhe->fnhe_expires &&
                            time_after(jiffies, fnhe->fnhe_expires)) {
                                ip_del_fnhe(nhc, daddr);
                                break;
                        }
                        return fnhe;
                }
        }
        return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
        struct fib_nh_common *nhc = res->nhc;
        struct net_device *dev = nhc->nhc_dev;
        struct fib_info *fi = res->fi;
        u32 mtu = 0;

        if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
            fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
                mtu = fi->fib_mtu;

        if (likely(!mtu)) {
                struct fib_nh_exception *fnhe;

                fnhe = find_exception(nhc, daddr);
                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
                        mtu = fnhe->fnhe_pmtu;
        }

        if (likely(!mtu))
                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

        return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

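/* Attach @rt to the exception's cached input or output route slot and
 * copy the exception data (gw/pmtu/expiry) into it, all under
 * fnhe_lock so we cannot race with ip_del_fnhe() or
 * update_or_create_fnhe().
 */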
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe->fnhe_mtu_locked = false;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gw4) {
                        rt->rt_gw4 = daddr;
                        rt->rt_gw_family = AF_INET;
                }

                if (do_cache) {
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nhc->nhc_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
        }
        orig = *p;

        /* hold dst before doing cmpxchg() to avoid race condition
         * on this dst
         */
        dst_hold(&rt->dst);
        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig) {
                        rt_add_uncached_list(orig);
                        dst_release(&orig->dst);
                }
        } else {
                dst_release(&rt->dst);
                ret = false;
        }

        return ret;
}

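/* Routes that are not (or can no longer be) cached in a FIB nexthop are
 * strung on a per-cpu "uncached" list, so rt_flush_dev() can still find
 * and retarget them when their device goes away.
 */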
struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

void rt_add_uncached_list(struct rtable *rt)
{
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

        rt->rt_uncached_list = ul;

        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->rt_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);
}

void rt_del_uncached_list(struct rtable *rt)
{
        if (!list_empty(&rt->rt_uncached)) {
                struct uncached_list *ul = rt->rt_uncached_list;

                spin_lock_bh(&ul->lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&ul->lock);
        }
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;

        ip_dst_metrics_put(dst);
        rt_del_uncached_list(rt);
}

1550 void rt_flush_dev(struct net_device *dev)
1551 {
1552         struct rtable *rt;
1553         int cpu;
1554
1555         for_each_possible_cpu(cpu) {
1556                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1557
1558                 spin_lock_bh(&ul->lock);
1559                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1560                         if (rt->dst.dev != dev)
1561                                 continue;
1562                         rt->dst.dev = blackhole_netdev;
1563                         dev_hold(rt->dst.dev);
1564                         dev_put(dev);
1565                 }
1566                 spin_unlock_bh(&ul->lock);
1567         }
1568 }
1569
1570 static bool rt_cache_valid(const struct rtable *rt)
1571 {
1572         return  rt &&
1573                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1574                 !rt_is_expired(rt);
1575 }
1576
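/* Fill in the nexthop-derived fields of @rt from the fib result: gateway
 * (IPv4 or IPv6), metrics, classid and lwtunnel state. Then try to cache
 * the route, preferring the exception @fnhe over the regular nexthop
 * cache; a route that cannot be cached goes on the uncached list instead.
 */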
1577 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1578                            const struct fib_result *res,
1579                            struct fib_nh_exception *fnhe,
1580                            struct fib_info *fi, u16 type, u32 itag,
1581                            const bool do_cache)
1582 {
1583         bool cached = false;
1584
1585         if (fi) {
1586                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1587
1588                 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1589                         rt->rt_uses_gateway = 1;
1590                         rt->rt_gw_family = nhc->nhc_gw_family;
1591                         /* only INET and INET6 are supported */
1592                         if (likely(nhc->nhc_gw_family == AF_INET))
1593                                 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1594                         else
1595                                 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1596                 }
1597
1598                 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1599
1600 #ifdef CONFIG_IP_ROUTE_CLASSID
1601                 if (nhc->nhc_family == AF_INET) {
1602                         struct fib_nh *nh;
1603
1604                         nh = container_of(nhc, struct fib_nh, nh_common);
1605                         rt->dst.tclassid = nh->nh_tclassid;
1606                 }
1607 #endif
1608                 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1609                 if (unlikely(fnhe))
1610                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1611                 else if (do_cache)
1612                         cached = rt_cache_route(nhc, rt);
1613                 if (unlikely(!cached)) {
1614                         /* Routes we intend to cache in the nexthop exception
1615                          * or FIB nexthop normally land in a cache slot above;
1616                          * if we were unsuccessful at storing this route into
1617                          * the cache, track it on the uncached list instead.
1618                          */
1619                         if (!rt->rt_gw4) {
1620                                 rt->rt_gw_family = AF_INET;
1621                                 rt->rt_gw4 = daddr;
1622                         }
1623                         rt_add_uncached_list(rt);
1624                 }
1625         } else
1626                 rt_add_uncached_list(rt);
1627
1628 #ifdef CONFIG_IP_ROUTE_CLASSID
1629 #ifdef CONFIG_IP_MULTIPLE_TABLES
1630         set_class_tag(rt, res->tclassid);
1631 #endif
1632         set_class_tag(rt, itag);
1633 #endif
1634 }
1635
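/* Allocate a fresh IPv4 rtable and initialize it to sane defaults:
 * current genid, no PMTU or gateway state, output via ip_output() and,
 * for RTCF_LOCAL routes, input via ip_local_deliver().
 */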
1636 struct rtable *rt_dst_alloc(struct net_device *dev,
1637                             unsigned int flags, u16 type,
1638                             bool nopolicy, bool noxfrm)
1639 {
1640         struct rtable *rt;
1641
1642         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1643                        (nopolicy ? DST_NOPOLICY : 0) |
1644                        (noxfrm ? DST_NOXFRM : 0));
1645
1646         if (rt) {
1647                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1648                 rt->rt_flags = flags;
1649                 rt->rt_type = type;
1650                 rt->rt_is_input = 0;
1651                 rt->rt_iif = 0;
1652                 rt->rt_pmtu = 0;
1653                 rt->rt_mtu_locked = 0;
1654                 rt->rt_uses_gateway = 0;
1655                 rt->rt_gw_family = 0;
1656                 rt->rt_gw4 = 0;
1657                 INIT_LIST_HEAD(&rt->rt_uncached);
1658
1659                 rt->dst.output = ip_output;
1660                 if (flags & RTCF_LOCAL)
1661                         rt->dst.input = ip_local_deliver;
1662         }
1663
1664         return rt;
1665 }
1666 EXPORT_SYMBOL(rt_dst_alloc);
1667
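/* Duplicate @rt onto @dev: copy the routing fields and dst handlers into
 * a newly allocated rtable, taking an extra reference on any lwtunnel
 * state. The clone starts with a fresh genid and lastuse timestamp.
 */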
1668 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1669 {
1670         struct rtable *new_rt;
1671
1672         new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1673                            rt->dst.flags);
1674
1675         if (new_rt) {
1676                 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1677                 new_rt->rt_flags = rt->rt_flags;
1678                 new_rt->rt_type = rt->rt_type;
1679                 new_rt->rt_is_input = rt->rt_is_input;
1680                 new_rt->rt_iif = rt->rt_iif;
1681                 new_rt->rt_pmtu = rt->rt_pmtu;
1682                 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1683                 new_rt->rt_gw_family = rt->rt_gw_family;
1684                 if (rt->rt_gw_family == AF_INET)
1685                         new_rt->rt_gw4 = rt->rt_gw4;
1686                 else if (rt->rt_gw_family == AF_INET6)
1687                         new_rt->rt_gw6 = rt->rt_gw6;
1688                 INIT_LIST_HEAD(&new_rt->rt_uncached);
1689
1690                 new_rt->dst.input = rt->dst.input;
1691                 new_rt->dst.output = rt->dst.output;
1692                 new_rt->dst.error = rt->dst.error;
1693                 new_rt->dst.lastuse = jiffies;
1694                 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1695         }
1696         return new_rt;
1697 }
1698 EXPORT_SYMBOL(rt_dst_clone);
1699
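/* Sanity-check the source address of a multicast packet received on
 * @dev: multicast, broadcast and (unless route_localnet is enabled)
 * loopback sources are rejected outright; a zero source is tolerated
 * only for IGMP and local-group traffic; anything else goes through
 * fib_validate_source().
 */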
1700 /* called in rcu_read_lock() section */
1701 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1702                           u8 tos, struct net_device *dev,
1703                           struct in_device *in_dev, u32 *itag)
1704 {
1705         int err;
1706
1707         /* Primary sanity checks. */
1708         if (!in_dev)
1709                 return -EINVAL;
1710
1711         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1712             skb->protocol != htons(ETH_P_IP))
1713                 return -EINVAL;
1714
1715         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1716                 return -EINVAL;
1717
1718         if (ipv4_is_zeronet(saddr)) {
1719                 if (!ipv4_is_local_multicast(daddr) &&
1720                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1721                         return -EINVAL;
1722         } else {
1723                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1724                                           in_dev, itag);
1725                 if (err < 0)
1726                         return err;
1727         }
1728         return 0;
1729 }
1730
1731 /* called in rcu_read_lock() section */
1732 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1733                              u8 tos, struct net_device *dev, int our)
1734 {
1735         struct in_device *in_dev = __in_dev_get_rcu(dev);
1736         unsigned int flags = RTCF_MULTICAST;
1737         struct rtable *rth;
1738         u32 itag = 0;
1739         int err;
1740
1741         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1742         if (err)
1743                 return err;
1744
1745         if (our)
1746                 flags |= RTCF_LOCAL;
1747
1748         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1749                            IN_DEV_ORCONF(in_dev, NOPOLICY), false);
1750         if (!rth)
1751                 return -ENOBUFS;
1752
1753 #ifdef CONFIG_IP_ROUTE_CLASSID
1754         rth->dst.tclassid = itag;
1755 #endif
1756         rth->dst.output = ip_rt_bug;
1757         rth->rt_is_input = 1;
1758
1759 #ifdef CONFIG_IP_MROUTE
1760         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1761                 rth->dst.input = ip_mr_input;
1762 #endif
1763         RT_CACHE_STAT_INC(in_slow_mc);
1764
1765         skb_dst_set(skb, &rth->dst);
1766         return 0;
1767 }
1768
1770 static void ip_handle_martian_source(struct net_device *dev,
1771                                      struct in_device *in_dev,
1772                                      struct sk_buff *skb,
1773                                      __be32 daddr,
1774                                      __be32 saddr)
1775 {
1776         RT_CACHE_STAT_INC(in_martian_src);
1777 #ifdef CONFIG_IP_ROUTE_VERBOSE
1778         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1779                 /*
1780                  *      RFC1812 recommendation, if source is martian,
1781                  *      the only hint is MAC header.
1782                  */
1783                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1784                         &daddr, &saddr, dev->name);
1785                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1786                         print_hex_dump(KERN_WARNING, "ll header: ",
1787                                        DUMP_PREFIX_OFFSET, 16, 1,
1788                                        skb_mac_header(skb),
1789                                        dev->hard_header_len, false);
1790                 }
1791         }
1792 #endif
1793 }
1794
1795 /* called in rcu_read_lock() section */
1796 static int __mkroute_input(struct sk_buff *skb,
1797                            const struct fib_result *res,
1798                            struct in_device *in_dev,
1799                            __be32 daddr, __be32 saddr, u32 tos)
1800 {
1801         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1802         struct net_device *dev = nhc->nhc_dev;
1803         struct fib_nh_exception *fnhe;
1804         struct rtable *rth;
1805         int err;
1806         struct in_device *out_dev;
1807         bool do_cache;
1808         u32 itag = 0;
1809
1810         /* get a working reference to the output device */
1811         out_dev = __in_dev_get_rcu(dev);
1812         if (!out_dev) {
1813                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1814                 return -EINVAL;
1815         }
1816
1817         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1818                                   in_dev->dev, in_dev, &itag);
1819         if (err < 0) {
1820                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1821                                          saddr);
1822
1823                 goto cleanup;
1824         }
1825
1826         do_cache = res->fi && !itag;
1827         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1828             skb->protocol == htons(ETH_P_IP)) {
1829                 __be32 gw;
1830
1831                 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1832                 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1833                     inet_addr_onlink(out_dev, saddr, gw))
1834                         IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1835         }
1836
1837         if (skb->protocol != htons(ETH_P_IP)) {
1838                 /* Not IP (i.e. ARP). Do not create a route if it is
1839                  * invalid for proxy arp. DNAT routes are always valid.
1840                  *
1841                  * The proxy arp feature has been extended to allow ARP
1842                  * replies back on the same interface, to support
1843                  * Private VLAN switch technologies. See arp.c.
1844                  */
1845                 if (out_dev == in_dev &&
1846                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1847                         err = -EINVAL;
1848                         goto cleanup;
1849                 }
1850         }
1851
1852         fnhe = find_exception(nhc, daddr);
1853         if (do_cache) {
1854                 if (fnhe)
1855                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1856                 else
1857                         rth = rcu_dereference(nhc->nhc_rth_input);
1858                 if (rt_cache_valid(rth)) {
1859                         skb_dst_set_noref(skb, &rth->dst);
1860                         goto out;
1861                 }
1862         }
1863
1864         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1865                            IN_DEV_ORCONF(in_dev, NOPOLICY),
1866                            IN_DEV_ORCONF(out_dev, NOXFRM));
1867         if (!rth) {
1868                 err = -ENOBUFS;
1869                 goto cleanup;
1870         }
1871
1872         rth->rt_is_input = 1;
1873         RT_CACHE_STAT_INC(in_slow_tot);
1874
1875         rth->dst.input = ip_forward;
1876
1877         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1878                        do_cache);
1879         lwtunnel_set_redirect(&rth->dst);
1880         skb_dst_set(skb, &rth->dst);
1881 out:
1882         err = 0;
1883  cleanup:
1884         return err;
1885 }
1886
1887 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1888 /* To make ICMP packets follow the right flow, the multipath hash is
1889  * calculated from the inner IP addresses.
1890  */
1891 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1892                                  struct flow_keys *hash_keys)
1893 {
1894         const struct iphdr *outer_iph = ip_hdr(skb);
1895         const struct iphdr *key_iph = outer_iph;
1896         const struct iphdr *inner_iph;
1897         const struct icmphdr *icmph;
1898         struct iphdr _inner_iph;
1899         struct icmphdr _icmph;
1900
1901         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1902                 goto out;
1903
1904         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1905                 goto out;
1906
1907         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1908                                    &_icmph);
1909         if (!icmph)
1910                 goto out;
1911
1912         if (!icmp_is_err(icmph->type))
1913                 goto out;
1914
1915         inner_iph = skb_header_pointer(skb,
1916                                        outer_iph->ihl * 4 + sizeof(_icmph),
1917                                        sizeof(_inner_iph), &_inner_iph);
1918         if (!inner_iph)
1919                 goto out;
1920
1921         key_iph = inner_iph;
1922 out:
1923         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1924         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1925 }
1926
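/* The multipath hash policy (the fib_multipath_hash_policy sysctl)
 * selects which fields feed the flow hash below: 0 hashes the L3
 * addresses (using the inner payload of ICMP errors), 1 hashes the L4
 * five-tuple, and 2 dissects into encapsulation and hashes the inner
 * L3 addresses.
 */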
1927 /* if skb is set it will be used and fl4 can be NULL */
1928 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1929                        const struct sk_buff *skb, struct flow_keys *flkeys)
1930 {
1931         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1932         struct flow_keys hash_keys;
1933         u32 mhash;
1934
1935         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1936         case 0:
1937                 memset(&hash_keys, 0, sizeof(hash_keys));
1938                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1939                 if (skb) {
1940                         ip_multipath_l3_keys(skb, &hash_keys);
1941                 } else {
1942                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1943                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1944                 }
1945                 break;
1946         case 1:
1947                 /* skb is currently provided only when forwarding */
1948                 if (skb) {
1949                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1950                         struct flow_keys keys;
1951
1952                         /* short-circuit if we already have L4 hash present */
1953                         if (skb->l4_hash)
1954                                 return skb_get_hash_raw(skb) >> 1;
1955
1956                         memset(&hash_keys, 0, sizeof(hash_keys));
1957
1958                         if (!flkeys) {
1959                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1960                                 flkeys = &keys;
1961                         }
1962
1963                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1964                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1965                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1966                         hash_keys.ports.src = flkeys->ports.src;
1967                         hash_keys.ports.dst = flkeys->ports.dst;
1968                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1969                 } else {
1970                         memset(&hash_keys, 0, sizeof(hash_keys));
1971                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1972                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1973                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1974                         hash_keys.ports.src = fl4->fl4_sport;
1975                         hash_keys.ports.dst = fl4->fl4_dport;
1976                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1977                 }
1978                 break;
1979         case 2:
1980                 memset(&hash_keys, 0, sizeof(hash_keys));
1981                 /* skb is currently provided only when forwarding */
1982                 if (skb) {
1983                         struct flow_keys keys;
1984
1985                         skb_flow_dissect_flow_keys(skb, &keys, 0);
1986                         /* Inner can be v4 or v6 */
1987                         if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1988                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1989                                 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1990                                 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1991                         } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1992                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1993                                 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1994                                 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1995                                 hash_keys.tags.flow_label = keys.tags.flow_label;
1996                                 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1997                         } else {
1998                                 /* Same as case 0 */
1999                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2000                                 ip_multipath_l3_keys(skb, &hash_keys);
2001                         }
2002                 } else {
2003                         /* Same as case 0 */
2004                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2005                         hash_keys.addrs.v4addrs.src = fl4->saddr;
2006                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
2007                 }
2008                 break;
2009         }
2010         mhash = flow_hash_from_keys(&hash_keys);
2011
2012         if (multipath_hash)
2013                 mhash = jhash_2words(mhash, multipath_hash, 0);
2014
2015         return mhash >> 1;
2016 }
2017 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2018
2019 static int ip_mkroute_input(struct sk_buff *skb,
2020                             struct fib_result *res,
2021                             struct in_device *in_dev,
2022                             __be32 daddr, __be32 saddr, u32 tos,
2023                             struct flow_keys *hkeys)
2024 {
2025 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2026         if (res->fi && fib_info_num_path(res->fi) > 1) {
2027                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2028
2029                 fib_select_multipath(res, h);
2030         }
2031 #endif
2032
2033         /* create a routing cache entry */
2034         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2035 }
2036
2037 /* Implements all the saddr-related checks as ip_route_input_slow(),
2038  * assuming daddr is valid and the destination is not a local broadcast one.
2039  * Uses the provided hint instead of performing a route lookup.
2040  */
2041 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2042                       u8 tos, struct net_device *dev,
2043                       const struct sk_buff *hint)
2044 {
2045         struct in_device *in_dev = __in_dev_get_rcu(dev);
2046         struct rtable *rt = skb_rtable(hint);
2047         struct net *net = dev_net(dev);
2048         int err = -EINVAL;
2049         u32 tag = 0;
2050
2051         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2052                 goto martian_source;
2053
2054         if (ipv4_is_zeronet(saddr))
2055                 goto martian_source;
2056
2057         if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2058                 goto martian_source;
2059
2060         if (rt->rt_type != RTN_LOCAL)
2061                 goto skip_validate_source;
2062
2063         tos &= IPTOS_RT_MASK;
2064         err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2065         if (err < 0)
2066                 goto martian_source;
2067
2068 skip_validate_source:
2069         skb_dst_copy(skb, hint);
2070         return 0;
2071
2072 martian_source:
2073         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2074         return err;
2075 }
2076
2077 /*
2078  *      NOTE. We drop all packets that have local source
2079  *      addresses, because every properly looped-back packet
2080  *      must have the correct destination already attached by the output routine.
2081  *      Changes in the enforced policies must also be applied to
2082  *      ip_route_use_hint().
2083  *
2084  *      This approach solves two big problems:
2085  *      1. Non-simplex devices are handled properly.
2086  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2087  *      called with rcu_read_lock()
2088  */
2089
2090 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2091                                u8 tos, struct net_device *dev,
2092                                struct fib_result *res)
2093 {
2094         struct in_device *in_dev = __in_dev_get_rcu(dev);
2095         struct flow_keys *flkeys = NULL, _flkeys;
2096         struct net    *net = dev_net(dev);
2097         struct ip_tunnel_info *tun_info;
2098         int             err = -EINVAL;
2099         unsigned int    flags = 0;
2100         u32             itag = 0;
2101         struct rtable   *rth;
2102         struct flowi4   fl4;
2103         bool do_cache = true;
2104
2105         /* IP on this device is disabled. */
2106
2107         if (!in_dev)
2108                 goto out;
2109
2110         /* Check for the weirdest martians, which cannot be detected
2111            by fib_lookup.
2112          */
2113
2114         tun_info = skb_tunnel_info(skb);
2115         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2116                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2117         else
2118                 fl4.flowi4_tun_key.tun_id = 0;
2119         skb_dst_drop(skb);
2120
2121         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2122                 goto martian_source;
2123
2124         res->fi = NULL;
2125         res->table = NULL;
2126         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2127                 goto brd_input;
2128
2129         /* Accept zero addresses only to limited broadcast;
2130          * I do not even know whether to fix it or not. Waiting for complaints :-)
2131          */
2132         if (ipv4_is_zeronet(saddr))
2133                 goto martian_source;
2134
2135         if (ipv4_is_zeronet(daddr))
2136                 goto martian_destination;
2137
2138         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2139          * and calls it at most once when daddr and/or saddr are loopback addresses
2140          */
2141         if (ipv4_is_loopback(daddr)) {
2142                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2143                         goto martian_destination;
2144         } else if (ipv4_is_loopback(saddr)) {
2145                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2146                         goto martian_source;
2147         }
2148
2149         /*
2150          *      Now we are ready to route packet.
2151          */
2152         fl4.flowi4_oif = 0;
2153         fl4.flowi4_iif = dev->ifindex;
2154         fl4.flowi4_mark = skb->mark;
2155         fl4.flowi4_tos = tos;
2156         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2157         fl4.flowi4_flags = 0;
2158         fl4.daddr = daddr;
2159         fl4.saddr = saddr;
2160         fl4.flowi4_uid = sock_net_uid(net, NULL);
2161         fl4.flowi4_multipath_hash = 0;
2162
2163         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2164                 flkeys = &_flkeys;
2165         } else {
2166                 fl4.flowi4_proto = 0;
2167                 fl4.fl4_sport = 0;
2168                 fl4.fl4_dport = 0;
2169         }
2170
2171         err = fib_lookup(net, &fl4, res, 0);
2172         if (err != 0) {
2173                 if (!IN_DEV_FORWARD(in_dev))
2174                         err = -EHOSTUNREACH;
2175                 goto no_route;
2176         }
2177
2178         if (res->type == RTN_BROADCAST) {
2179                 if (IN_DEV_BFORWARD(in_dev))
2180                         goto make_route;
2181                 /* do not cache if bc_forwarding is enabled */
2182                 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2183                         do_cache = false;
2184                 goto brd_input;
2185         }
2186
2187         if (res->type == RTN_LOCAL) {
2188                 err = fib_validate_source(skb, saddr, daddr, tos,
2189                                           0, dev, in_dev, &itag);
2190                 if (err < 0)
2191                         goto martian_source;
2192                 goto local_input;
2193         }
2194
2195         if (!IN_DEV_FORWARD(in_dev)) {
2196                 err = -EHOSTUNREACH;
2197                 goto no_route;
2198         }
2199         if (res->type != RTN_UNICAST)
2200                 goto martian_destination;
2201
2202 make_route:
2203         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2204 out:    return err;
2205
2206 brd_input:
2207         if (skb->protocol != htons(ETH_P_IP))
2208                 goto e_inval;
2209
2210         if (!ipv4_is_zeronet(saddr)) {
2211                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2212                                           in_dev, &itag);
2213                 if (err < 0)
2214                         goto martian_source;
2215         }
2216         flags |= RTCF_BROADCAST;
2217         res->type = RTN_BROADCAST;
2218         RT_CACHE_STAT_INC(in_brd);
2219
2220 local_input:
2221         do_cache &= res->fi && !itag;
2222         if (do_cache) {
2223                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2224
2225                 rth = rcu_dereference(nhc->nhc_rth_input);
2226                 if (rt_cache_valid(rth)) {
2227                         skb_dst_set_noref(skb, &rth->dst);
2228                         err = 0;
2229                         goto out;
2230                 }
2231         }
2232
2233         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2234                            flags | RTCF_LOCAL, res->type,
2235                            IN_DEV_ORCONF(in_dev, NOPOLICY), false);
2236         if (!rth)
2237                 goto e_nobufs;
2238
2239         rth->dst.output = ip_rt_bug;
2240 #ifdef CONFIG_IP_ROUTE_CLASSID
2241         rth->dst.tclassid = itag;
2242 #endif
2243         rth->rt_is_input = 1;
2244
2245         RT_CACHE_STAT_INC(in_slow_tot);
2246         if (res->type == RTN_UNREACHABLE) {
2247                 rth->dst.input = ip_error;
2248                 rth->dst.error = -err;
2249                 rth->rt_flags   &= ~RTCF_LOCAL;
2250         }
2251
2252         if (do_cache) {
2253                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2254
2255                 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2256                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2257                         WARN_ON(rth->dst.input == lwtunnel_input);
2258                         rth->dst.lwtstate->orig_input = rth->dst.input;
2259                         rth->dst.input = lwtunnel_input;
2260                 }
2261
2262                 if (unlikely(!rt_cache_route(nhc, rth)))
2263                         rt_add_uncached_list(rth);
2264         }
2265         skb_dst_set(skb, &rth->dst);
2266         err = 0;
2267         goto out;
2268
2269 no_route:
2270         RT_CACHE_STAT_INC(in_no_route);
2271         res->type = RTN_UNREACHABLE;
2272         res->fi = NULL;
2273         res->table = NULL;
2274         goto local_input;
2275
2276         /*
2277          *      Do not cache martian addresses: they should be logged (RFC1812)
2278          */
2279 martian_destination:
2280         RT_CACHE_STAT_INC(in_martian_dst);
2281 #ifdef CONFIG_IP_ROUTE_VERBOSE
2282         if (IN_DEV_LOG_MARTIANS(in_dev))
2283                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2284                                      &daddr, &saddr, dev->name);
2285 #endif
2286
2287 e_inval:
2288         err = -EINVAL;
2289         goto out;
2290
2291 e_nobufs:
2292         err = -ENOBUFS;
2293         goto out;
2294
2295 martian_source:
2296         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2297         goto out;
2298 }
2299
2300 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2301                          u8 tos, struct net_device *dev)
2302 {
2303         struct fib_result res;
2304         int err;
2305
2306         tos &= IPTOS_RT_MASK;
2307         rcu_read_lock();
2308         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2309         rcu_read_unlock();
2310
2311         return err;
2312 }
2313 EXPORT_SYMBOL(ip_route_input_noref);
2314
2315 /* called with rcu_read_lock held */
2316 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2317                        u8 tos, struct net_device *dev, struct fib_result *res)
2318 {
2319         /* Multicast recognition logic is moved from the route cache to here.
2320            The problem was that too many Ethernet cards have broken/missing
2321            hardware multicast filters :-( As a result, a host on a multicast
2322            network acquires a lot of useless route cache entries, e.g. for
2323            SDR messages from all over the world. Now we try to get rid of them.
2324            Really, provided the software IP multicast filter is organized
2325            reasonably (at least, hashed), it does not result in a slowdown
2326            compared with route cache reject entries.
2327            Note that multicast routers are not affected, because a
2328            route cache entry is created eventually.
2329          */
2330         if (ipv4_is_multicast(daddr)) {
2331                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2332                 int our = 0;
2333                 int err = -EINVAL;
2334
2335                 if (!in_dev)
2336                         return err;
2337                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2338                                       ip_hdr(skb)->protocol);
2339
2340                 /* check l3 master if no match yet */
2341                 if (!our && netif_is_l3_slave(dev)) {
2342                         struct in_device *l3_in_dev;
2343
2344                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2345                         if (l3_in_dev)
2346                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2347                                                       ip_hdr(skb)->protocol);
2348                 }
2349
2350                 if (our
2351 #ifdef CONFIG_IP_MROUTE
2352                         ||
2353                     (!ipv4_is_local_multicast(daddr) &&
2354                      IN_DEV_MFORWARD(in_dev))
2355 #endif
2356                    ) {
2357                         err = ip_route_input_mc(skb, daddr, saddr,
2358                                                 tos, dev, our);
2359                 }
2360                 return err;
2361         }
2362
2363         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2364 }
2365
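/* Build an output rtable for the fib result @res, or return a valid
 * entry already cached in the nexthop. Broadcast and multicast
 * destinations are special-cased, and caching is skipped where it
 * would misbehave, e.g. for local routes bound to a specific output
 * interface.
 */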
2366 /* called with rcu_read_lock() */
2367 static struct rtable *__mkroute_output(const struct fib_result *res,
2368                                        const struct flowi4 *fl4, int orig_oif,
2369                                        struct net_device *dev_out,
2370                                        unsigned int flags)
2371 {
2372         struct fib_info *fi = res->fi;
2373         struct fib_nh_exception *fnhe;
2374         struct in_device *in_dev;
2375         u16 type = res->type;
2376         struct rtable *rth;
2377         bool do_cache;
2378
2379         in_dev = __in_dev_get_rcu(dev_out);
2380         if (!in_dev)
2381                 return ERR_PTR(-EINVAL);
2382
2383         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2384                 if (ipv4_is_loopback(fl4->saddr) &&
2385                     !(dev_out->flags & IFF_LOOPBACK) &&
2386                     !netif_is_l3_master(dev_out))
2387                         return ERR_PTR(-EINVAL);
2388
2389         if (ipv4_is_lbcast(fl4->daddr))
2390                 type = RTN_BROADCAST;
2391         else if (ipv4_is_multicast(fl4->daddr))
2392                 type = RTN_MULTICAST;
2393         else if (ipv4_is_zeronet(fl4->daddr))
2394                 return ERR_PTR(-EINVAL);
2395
2396         if (dev_out->flags & IFF_LOOPBACK)
2397                 flags |= RTCF_LOCAL;
2398
2399         do_cache = true;
2400         if (type == RTN_BROADCAST) {
2401                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2402                 fi = NULL;
2403         } else if (type == RTN_MULTICAST) {
2404                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2405                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2406                                      fl4->flowi4_proto))
2407                         flags &= ~RTCF_LOCAL;
2408                 else
2409                         do_cache = false;
2410                 /* If a multicast route does not exist, use the
2411                  * default one, but do not gateway in this case.
2412                  * Yes, it is a hack.
2413                  */
2414                 if (fi && res->prefixlen < 4)
2415                         fi = NULL;
2416         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2417                    (orig_oif != dev_out->ifindex)) {
2418                 /* For local routes that require a particular output interface
2419                  * we do not want to cache the result.  Caching the result
2420                  * causes incorrect behaviour when there are multiple source
2421                  * addresses on the interface, the end result being that if the
2422                  * intended recipient is waiting on that interface for the
2423                  * packet he won't receive it because it will be delivered on
2424                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2425                  * be set to the loopback interface as well.
2426                  */
2427                 do_cache = false;
2428         }
2429
2430         fnhe = NULL;
2431         do_cache &= fi != NULL;
2432         if (fi) {
2433                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2434                 struct rtable __rcu **prth;
2435
2436                 fnhe = find_exception(nhc, fl4->daddr);
2437                 if (!do_cache)
2438                         goto add;
2439                 if (fnhe) {
2440                         prth = &fnhe->fnhe_rth_output;
2441                 } else {
2442                         if (unlikely(fl4->flowi4_flags &
2443                                      FLOWI_FLAG_KNOWN_NH &&
2444                                      !(nhc->nhc_gw_family &&
2445                                        nhc->nhc_scope == RT_SCOPE_LINK))) {
2446                                 do_cache = false;
2447                                 goto add;
2448                         }
2449                         prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2450                 }
2451                 rth = rcu_dereference(*prth);
2452                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2453                         return rth;
2454         }
2455
2456 add:
2457         rth = rt_dst_alloc(dev_out, flags, type,
2458                            IN_DEV_ORCONF(in_dev, NOPOLICY),
2459                            IN_DEV_ORCONF(in_dev, NOXFRM));
2460         if (!rth)
2461                 return ERR_PTR(-ENOBUFS);
2462
2463         rth->rt_iif = orig_oif;
2464
2465         RT_CACHE_STAT_INC(out_slow_tot);
2466
2467         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2468                 if (flags & RTCF_LOCAL &&
2469                     !(dev_out->flags & IFF_LOOPBACK)) {
2470                         rth->dst.output = ip_mc_output;
2471                         RT_CACHE_STAT_INC(out_slow_mc);
2472                 }
2473 #ifdef CONFIG_IP_MROUTE
2474                 if (type == RTN_MULTICAST) {
2475                         if (IN_DEV_MFORWARD(in_dev) &&
2476                             !ipv4_is_local_multicast(fl4->daddr)) {
2477                                 rth->dst.input = ip_mr_input;
2478                                 rth->dst.output = ip_mc_output;
2479                         }
2480                 }
2481 #endif
2482         }
2483
2484         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2485         lwtunnel_set_redirect(&rth->dst);
2486
2487         return rth;
2488 }
2489
2490 /*
2491  * Major route resolver routine.
2492  */
2493
2494 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2495                                         const struct sk_buff *skb)
2496 {
2497         __u8 tos = RT_FL_TOS(fl4);
2498         struct fib_result res = {
2499                 .type           = RTN_UNSPEC,
2500                 .fi             = NULL,
2501                 .table          = NULL,
2502                 .tclassid       = 0,
2503         };
2504         struct rtable *rth;
2505
2506         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2507         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2508         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2509                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2510
2511         rcu_read_lock();
2512         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2513         rcu_read_unlock();
2514
2515         return rth;
2516 }
2517 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2518
2519 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2520                                             struct fib_result *res,
2521                                             const struct sk_buff *skb)
2522 {
2523         struct net_device *dev_out = NULL;
2524         int orig_oif = fl4->flowi4_oif;
2525         unsigned int flags = 0;
2526         struct rtable *rth;
2527         int err;
2528
2529         if (fl4->saddr) {
2530                 if (ipv4_is_multicast(fl4->saddr) ||
2531                     ipv4_is_lbcast(fl4->saddr) ||
2532                     ipv4_is_zeronet(fl4->saddr)) {
2533                         rth = ERR_PTR(-EINVAL);
2534                         goto out;
2535                 }
2536
2537                 rth = ERR_PTR(-ENETUNREACH);
2538
2539                 /* I removed check for oif == dev_out->oif here.
2540                    It was wrong for two reasons:
2541                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2542                       is assigned to multiple interfaces.
2543                    2. Moreover, we are allowed to send packets with saddr
2544                       of another iface. --ANK
2545                  */
2546
2547                 if (fl4->flowi4_oif == 0 &&
2548                     (ipv4_is_multicast(fl4->daddr) ||
2549                      ipv4_is_lbcast(fl4->daddr))) {
2550                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2551                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2552                         if (!dev_out)
2553                                 goto out;
2554
2555                         /* Special hack: the user can direct multicasts
2556                            and limited broadcast via the necessary interface
2557                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2558                            This hack is not just for fun, it allows
2559                            vic, vat and friends to work.
2560                            They bind a socket to loopback, set ttl to zero
2561                            and expect that it will work.
2562                            From the viewpoint of the routing cache they are broken,
2563                            because we are not allowed to build a multicast path
2564                            with a loopback source addr (look, the routing cache
2565                            cannot know that ttl is zero, so the packet
2566                            will not leave this host and the route is valid).
2567                            Luckily, this hack is a good workaround.
2568                          */
2569
2570                         fl4->flowi4_oif = dev_out->ifindex;
2571                         goto make_route;
2572                 }
2573
2574                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2575                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2576                         if (!__ip_dev_find(net, fl4->saddr, false))
2577                                 goto out;
2578                 }
2579         }
2580
2581
2582         if (fl4->flowi4_oif) {
2583                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2584                 rth = ERR_PTR(-ENODEV);
2585                 if (!dev_out)
2586                         goto out;
2587
2588                 /* RACE: Check return value of inet_select_addr instead. */
2589                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2590                         rth = ERR_PTR(-ENETUNREACH);
2591                         goto out;
2592                 }
2593                 if (ipv4_is_local_multicast(fl4->daddr) ||
2594                     ipv4_is_lbcast(fl4->daddr) ||
2595                     fl4->flowi4_proto == IPPROTO_IGMP) {
2596                         if (!fl4->saddr)
2597                                 fl4->saddr = inet_select_addr(dev_out, 0,
2598                                                               RT_SCOPE_LINK);
2599                         goto make_route;
2600                 }
2601                 if (!fl4->saddr) {
2602                         if (ipv4_is_multicast(fl4->daddr))
2603                                 fl4->saddr = inet_select_addr(dev_out, 0,
2604                                                               fl4->flowi4_scope);
2605                         else if (!fl4->daddr)
2606                                 fl4->saddr = inet_select_addr(dev_out, 0,
2607                                                               RT_SCOPE_HOST);
2608                 }
2609         }
2610
2611         if (!fl4->daddr) {
2612                 fl4->daddr = fl4->saddr;
2613                 if (!fl4->daddr)
2614                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2615                 dev_out = net->loopback_dev;
2616                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2617                 res->type = RTN_LOCAL;
2618                 flags |= RTCF_LOCAL;
2619                 goto make_route;
2620         }
2621
2622         err = fib_lookup(net, fl4, res, 0);
2623         if (err) {
2624                 res->fi = NULL;
2625                 res->table = NULL;
2626                 if (fl4->flowi4_oif &&
2627                     (ipv4_is_multicast(fl4->daddr) ||
2628                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2629                         /* Apparently, the routing tables are wrong. Assume
2630                            that the destination is on-link.
2631
2632                            WHY? DW.
2633                            Because we are allowed to send to an iface
2634                            even if it has NO routes and NO assigned
2635                            addresses. When oif is specified, routing
2636                            tables are looked up with only one purpose:
2637                            to check whether the destination is gatewayed,
2638                            rather than direct. Moreover, if MSG_DONTROUTE is
2639                            set, we send the packet, ignoring both routing
2640                            tables and ifaddr state. --ANK
2641
2642
2643                            We could do this even when oif is unknown,
2644                            as IPv6 likely does, but we do not.
2645                          */
2646
2647                         if (fl4->saddr == 0)
2648                                 fl4->saddr = inet_select_addr(dev_out, 0,
2649                                                               RT_SCOPE_LINK);
2650                         res->type = RTN_UNICAST;
2651                         goto make_route;
2652                 }
2653                 rth = ERR_PTR(err);
2654                 goto out;
2655         }
2656
2657         if (res->type == RTN_LOCAL) {
2658                 if (!fl4->saddr) {
2659                         if (res->fi->fib_prefsrc)
2660                                 fl4->saddr = res->fi->fib_prefsrc;
2661                         else
2662                                 fl4->saddr = fl4->daddr;
2663                 }
2664
2665                 /* L3 master device is the loopback for that domain */
2666                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2667                         net->loopback_dev;
2668
2669                 /* make sure orig_oif points to fib result device even
2670                  * though packet rx/tx happens over loopback or l3mdev
2671                  */
2672                 orig_oif = FIB_RES_OIF(*res);
2673
2674                 fl4->flowi4_oif = dev_out->ifindex;
2675                 flags |= RTCF_LOCAL;
2676                 goto make_route;
2677         }
2678
2679         fib_select_path(net, res, fl4, skb);
2680
2681         dev_out = FIB_RES_DEV(*res);
2682
2683 make_route:
2684         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2685
2686 out:
2687         return rth;
2688 }
2689
2690 static struct dst_ops ipv4_dst_blackhole_ops = {
2691         .family                 = AF_INET,
2692         .default_advmss         = ipv4_default_advmss,
2693         .neigh_lookup           = ipv4_neigh_lookup,
2694         .check                  = dst_blackhole_check,
2695         .cow_metrics            = dst_blackhole_cow_metrics,
2696         .update_pmtu            = dst_blackhole_update_pmtu,
2697         .redirect               = dst_blackhole_redirect,
2698         .mtu                    = dst_blackhole_mtu,
2699 };
2700
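/* Turn @dst_orig into a blackhole dst: the route's identity fields are
 * preserved, but every packet sent through it is silently discarded.
 * Used (e.g. by the xfrm layer) when a route must be neutralized
 * without invalidating existing references. Consumes the reference on
 * @dst_orig.
 */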
2701 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2702 {
2703         struct rtable *ort = (struct rtable *) dst_orig;
2704         struct rtable *rt;
2705
2706         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2707         if (rt) {
2708                 struct dst_entry *new = &rt->dst;
2709
2710                 new->__use = 1;
2711                 new->input = dst_discard;
2712                 new->output = dst_discard_out;
2713
2714                 new->dev = net->loopback_dev;
2715                 if (new->dev)
2716                         dev_hold(new->dev);
2717
2718                 rt->rt_is_input = ort->rt_is_input;
2719                 rt->rt_iif = ort->rt_iif;
2720                 rt->rt_pmtu = ort->rt_pmtu;
2721                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2722
2723                 rt->rt_genid = rt_genid_ipv4(net);
2724                 rt->rt_flags = ort->rt_flags;
2725                 rt->rt_type = ort->rt_type;
2726                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2727                 rt->rt_gw_family = ort->rt_gw_family;
2728                 if (rt->rt_gw_family == AF_INET)
2729                         rt->rt_gw4 = ort->rt_gw4;
2730                 else if (rt->rt_gw_family == AF_INET6)
2731                         rt->rt_gw6 = ort->rt_gw6;
2732
2733                 INIT_LIST_HEAD(&rt->rt_uncached);
2734         }
2735
2736         dst_release(dst_orig);
2737
2738         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2739 }
2740
2741 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2742                                     const struct sock *sk)
2743 {
2744         struct rtable *rt = __ip_route_output_key(net, flp4);
2745
2746         if (IS_ERR(rt))
2747                 return rt;
2748
2749         if (flp4->flowi4_proto) {
2750                 flp4->flowi4_oif = rt->dst.dev->ifindex;
2751                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2752                                                         flowi4_to_flowi(flp4),
2753                                                         sk, 0);
2754         }
2755
2756         return rt;
2757 }
2758 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2759
2760 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2761                                       struct net_device *dev,
2762                                       struct net *net, __be32 *saddr,
2763                                       const struct ip_tunnel_info *info,
2764                                       u8 protocol, bool use_cache)
2765 {
2766 #ifdef CONFIG_DST_CACHE
2767         struct dst_cache *dst_cache;
2768 #endif
2769         struct rtable *rt = NULL;
2770         struct flowi4 fl4;
2771         __u8 tos;
2772
2773 #ifdef CONFIG_DST_CACHE
2774         dst_cache = (struct dst_cache *)&info->dst_cache;
2775         if (use_cache) {
2776                 rt = dst_cache_get_ip4(dst_cache, saddr);
2777                 if (rt)
2778                         return rt;
2779         }
2780 #endif
2781         memset(&fl4, 0, sizeof(fl4));
2782         fl4.flowi4_mark = skb->mark;
2783         fl4.flowi4_proto = protocol;
2784         fl4.daddr = info->key.u.ipv4.dst;
2785         fl4.saddr = info->key.u.ipv4.src;
2786         tos = info->key.tos;
2787         fl4.flowi4_tos = RT_TOS(tos);
2788
2789         rt = ip_route_output_key(net, &fl4);
2790         if (IS_ERR(rt)) {
2791                 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2792                 return ERR_PTR(-ENETUNREACH);
2793         }
2794         if (rt->dst.dev == dev) { /* is this necessary? */
2795                 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2796                 ip_rt_put(rt);
2797                 return ERR_PTR(-ELOOP);
2798         }
2799 #ifdef CONFIG_DST_CACHE
2800         if (use_cache)
2801                 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2802 #endif
2803         *saddr = fl4.saddr;
2804         return rt;
2805 }
2806 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2807
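/* Translate @rt into an RTM_NEWROUTE netlink message on @skb:
 * destination, table, gateway (RTA_GATEWAY, or RTA_VIA for an IPv6
 * nexthop), metrics including any learned PMTU, and cache info such as
 * the remaining expiry time.
 */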
2808 /* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
			struct sk_buff *skb, u32 portid, u32 seq,
			unsigned int flags)
{
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table_id))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		r->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
	if (rt->dst.lwtstate &&
	    lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (fl4 && !rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway) {
		if (rt->rt_gw_family == AF_INET &&
		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
			goto nla_put_failure;
		} else if (rt->rt_gw_family == AF_INET6) {
			int alen = sizeof(struct in6_addr);
			struct nlattr *nla;
			struct rtvia *via;

			nla = nla_reserve(skb, RTA_VIA, alen + 2);
			if (!nla)
				goto nla_put_failure;

			via = nla_data(nla);
			via->rtvia_family = AF_INET6;
			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
		}
	}

	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rt->rt_mtu_locked && expires)
		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4) {
		if (fl4->flowi4_mark &&
		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
			goto nla_put_failure;

		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
		    nla_put_u32(skb, RTA_UID,
				from_kuid_munged(current_user_ns(),
						 fl4->flowi4_uid)))
			goto nla_put_failure;

		if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
			if (ipv4_is_multicast(dst) &&
			    !ipv4_is_local_multicast(dst) &&
			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
				int err = ipmr_get_route(net, skb,
							 fl4->saddr, fl4->daddr,
							 r, portid);

				if (err <= 0) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				}
			} else
#endif
				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
					goto nla_put_failure;
		}
	}

	error = rt->dst.error;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

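/*
 * Illustrative sketch (not kernel code): a userspace consumer of the
 * RTM_NEWROUTE reply assembled above would typically walk the rtattr
 * stream with the standard <linux/rtnetlink.h> macros.  This assumes
 * a hypothetical buffer `nlh` already holding one well-formed reply:
 *
 *	struct rtmsg *r = NLMSG_DATA(nlh);
 *	struct rtattr *rta = RTM_RTA(r);
 *	int len = RTM_PAYLOAD(nlh);
 *
 *	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
 *		if (rta->rta_type == RTA_GATEWAY)
 *			;	// 4-byte IPv4 gateway at RTA_DATA(rta)
 *		else if (rta->rta_type == RTA_TABLE)
 *			;	// full 32-bit table id; rtm_table itself
 *				// saturates at RT_TABLE_COMPAT for ids >= 256
 *	}
 */
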
static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
			    struct netlink_callback *cb, u32 table_id,
			    struct fnhe_hash_bucket *bucket, int genid,
			    int *fa_index, int fa_start, unsigned int flags)
{
	int i;

	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
			struct rtable *rt;
			int err;

			if (*fa_index < fa_start)
				goto next;

			if (fnhe->fnhe_genid != genid)
				goto next;

			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires))
				goto next;

			rt = rcu_dereference(fnhe->fnhe_rth_input);
			if (!rt)
				rt = rcu_dereference(fnhe->fnhe_rth_output);
			if (!rt)
				goto next;

			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
					   table_id, NULL, skb,
					   NETLINK_CB(cb->skb).portid,
					   cb->nlh->nlmsg_seq, flags);
			if (err)
				return err;
next:
			(*fa_index)++;
		}
	}

	return 0;
}

int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
		       u32 table_id, struct fib_info *fi,
		       int *fa_index, int fa_start, unsigned int flags)
{
	struct net *net = sock_net(cb->skb->sk);
	int nhsel, genid = fnhe_genid(net);

	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
		struct fnhe_hash_bucket *bucket;
		int err;

		if (nhc->nhc_flags & RTNH_F_DEAD)
			continue;

		rcu_read_lock();
		bucket = rcu_dereference(nhc->nhc_exceptions);
		err = 0;
		if (bucket)
			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
					       genid, fa_index, fa_start,
					       flags);
		rcu_read_unlock();
		if (err)
			return err;
	}

	return 0;
}

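/*
 * This is the path behind exception dumps requested with RTM_F_CLONED.
 * As a hedged usage sketch, something like the following should list
 * cached PMTU/redirect exceptions on kernels with this code (assuming
 * a reasonably recent iproute2; addresses are RFC 5737 documentation
 * prefixes, not output captured from a real system):
 *
 *	$ ip route show cache
 *	192.0.2.1 via 198.51.100.1 dev eth0
 *	    cache expires 563sec mtu 1400
 */
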
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
						   u8 ip_proto, __be16 sport,
						   __be16 dport)
{
	struct sk_buff *skb;
	struct iphdr *iph;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return NULL;

	/* Reserve room for dummy headers; this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IP);
	iph = skb_put(skb, sizeof(struct iphdr));
	iph->protocol = ip_proto;
	iph->saddr = src;
	iph->daddr = dst;
	iph->version = 0x4;
	iph->frag_off = 0;
	iph->ihl = 0x5;
	skb_set_transport_header(skb, skb->len);

	switch (iph->protocol) {
	case IPPROTO_UDP: {
		struct udphdr *udph;

		udph = skb_put_zero(skb, sizeof(struct udphdr));
		udph->source = sport;
		udph->dest = dport;
		/* udph->len is __be16, so convert to network byte order */
		udph->len = htons(sizeof(struct udphdr));
		udph->check = 0;
		break;
	}
	case IPPROTO_TCP: {
		struct tcphdr *tcph;

		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
		tcph->source	= sport;
		tcph->dest	= dport;
		tcph->doff	= sizeof(struct tcphdr) / 4;
		tcph->rst = 1;
		/* seed the checksum with the pseudo-header only;
		 * the segment carries no payload
		 */
		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
					    src, dst, 0);
		break;
	}
	case IPPROTO_ICMP: {
		struct icmphdr *icmph;

		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
		icmph->type = ICMP_ECHO;
		icmph->code = 0;
	}
	}

	return skb;
}

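/*
 * The dummy L4 headers built above matter mainly for port-aware
 * lookups (e.g. multipath hashing on the 4-tuple).  A hedged usage
 * sketch exercising this path via RTA_IP_PROTO/RTA_SPORT/RTA_DPORT,
 * assuming an iproute2 recent enough to support these keywords:
 *
 *	$ ip route get 198.51.100.7 ipproto tcp sport 12345 dport 443
 */
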
static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
				       const struct nlmsghdr *nlh,
				       struct nlattr **tb,
				       struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG(extack,
			       "ipv4: Invalid header for route get request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv4_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
	    rtm->rtm_table || rtm->rtm_protocol ||
	    rtm->rtm_scope || rtm->rtm_type) {
		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
		return -EINVAL;
	}

	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
			       RTM_F_LOOKUP_TABLE |
			       RTM_F_FIB_MATCH)) {
		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv4_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_IIF:
		case RTA_OIF:
		case RTA_SRC:
		case RTA_DST:
		case RTA_IP_PROTO:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_MARK:
		case RTA_UID:
			break;
		default:
			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
			return -EINVAL;
		}
	}

	return 0;
}

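/*
 * The strict checks above only apply when the requesting socket has
 * opted in.  A hedged userspace sketch of opting in (assumes the
 * <linux/netlink.h> definitions; the option value is a plain int):
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK,
 *		   &one, sizeof(one));
 *
 * With this set, malformed get requests are rejected up front with an
 * extack message instead of being silently tolerated by the
 * deprecated, lenient parser.
 */
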
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi4 fl4 = {};
	__be32 dst = 0;
	__be32 src = 0;
	kuid_t uid;
	u32 iif;
	int err;
	int mark;

	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		return err;

	rtm = nlmsg_data(nlh);
	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, AF_INET, extack);
		if (err)
			return err;
	}

	if (tb[RTA_SPORT])
		sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		dport = nla_get_be16(tb[RTA_DPORT]);

	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
	if (!skb)
		return -ENOBUFS;

	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	if (sport)
		fl4.fl4_sport = sport;
	if (dport)
		fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

	rcu_read_lock();

	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev	= dev;
		skb->mark	= mark;
		err = ip_route_input_rcu(skb, dst, src,
					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
					 &res);

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		struct fib_rt_info fri;

		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		fri.fi = res.fi;
		fri.tb_id = table_id;
		fri.dst = res.prefix;
		fri.dst_len = res.prefixlen;
		fri.tos = fl4.flowi4_tos;
		fri.type = rt->rt_type;
		fri.offload = 0;
		fri.trap = 0;
		fri.offload_failed = 0;
		if (res.fa_head) {
			struct fib_alias *fa;

			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
				u8 slen = 32 - fri.dst_len;

				if (fa->fa_slen == slen &&
				    fa->tb_id == fri.tb_id &&
				    fa->fa_tos == fri.tos &&
				    fa->fa_info == res.fi &&
				    fa->fa_type == fri.type) {
					fri.offload = fa->offload;
					fri.trap = fa->trap;
					break;
				}
			}
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid,
				   nlh->nlmsg_seq, 0);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;
errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}

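/*
 * Userspace view of the two reply flavours above (sample output is a
 * sketch, not captured from a real system):
 *
 *	$ ip route get 198.51.100.7
 *	198.51.100.7 via 192.0.2.1 dev eth0 src 192.0.2.10 uid 0
 *
 *	$ ip route get fibmatch 198.51.100.7
 *	default via 192.0.2.1 dev eth0
 *
 * "fibmatch" sets RTM_F_FIB_MATCH, so the kernel answers with the
 * matching FIB entry (fib_dump_info()) rather than the resolved
 * destination (rt_fill_info()).
 */
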
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;

static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}

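/*
 * Write-only knob: writing anything flushes the routing cache and
 * invalidates all next-hop exceptions by bumping the fnhe genid;
 * reads fail with EINVAL.  For example:
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 */
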
static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

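/*
 * These knobs surface under /proc/sys/net/ipv4/route/.  A hedged usage
 * sketch (values are illustrative, not captured from a real system;
 * the jiffies-backed entries convert to and from seconds via their
 * proc handlers):
 *
 *	$ cat /proc/sys/net/ipv4/route/min_pmtu
 *	552
 *	# echo 600 > /proc/sys/net/ipv4/route/mtu_expires
 */
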
static const char ipv4_route_flush_procname[] = "flush";

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= ipv4_route_flush_procname,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export non-whitelisted sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns) {
			if (tbl[0].procname != ipv4_route_flush_procname)
				tbl[0].procname = NULL;
		}
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

int __init ip_rt_init(void)
{
	int cpu;

	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
				  GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif