// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

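/* Defaults for the route tunables below; most are exposed as sysctls
 * under /proc/sys/net/ipv4/route/.  For example, ip_rt_min_pmtu's
 * default of 512 + 20 + 20 is 512 bytes of payload plus the IPv4 and
 * TCP headers.
 */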
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;

/*
 *      Interface to generic destination cache.
 */

INDIRECT_CALLABLE_SCOPE
struct dst_entry        *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
INDIRECT_CALLABLE_SCOPE
unsigned int            ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu,
                                           bool confirm_neigh);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
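/* The IPv4 routing cache was removed in Linux 3.6, but /proc/net/rt_cache
 * is kept for compatibility: the seq_file below emits only the historical
 * header line and no entries.
 */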
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct proc_ops rt_cache_proc_ops = {
        .proc_open      = rt_cache_seq_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
};

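/* /proc/net/stat/rt_cache prints one line of counters per possible CPU.
 * The seq_file position encodes "cpu + 1", with position 0 reserved for
 * the header token; counters that belonged to the removed cache are
 * printed as 0.
 */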
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        (*pos)++;
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct proc_ops rt_cpu_proc_ops = {
        .proc_open      = rt_cpu_seq_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", 0444, net->proc_net,
                          &rt_cache_proc_ops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", 0444,
                          net->proc_net_stat, &rt_cpu_proc_ops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create_single("rt_acct", 0, net->proc_net,
                        rt_acct_proc_show);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

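/* Resolve the neighbour entry for this dst: prefer the cached IPv4 or
 * IPv6 gateway, otherwise key on the packet's (or the caller's)
 * destination address.  Returns a referenced neighbour, an ERR_PTR, or
 * NULL if the entry is already being torn down.
 */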
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        struct neighbour *n;

        rcu_read_lock_bh();

        if (likely(rt->rt_gw_family == AF_INET)) {
                n = ip_neigh_gw4(dev, rt->rt_gw4);
        } else if (rt->rt_gw_family == AF_INET6) {
                n = ip_neigh_gw6(dev, &rt->rt_gw6);
        } else {
                __be32 pkey;

                pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
                n = ip_neigh_gw4(dev, pkey);
        }

        if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
                n = NULL;

        rcu_read_unlock_bh();

        return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;

        if (rt->rt_gw_family == AF_INET) {
                pkey = (const __be32 *)&rt->rt_gw4;
        } else if (rt->rt_gw_family == AF_INET6) {
                return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
        } else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
                return;
        }
        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used.  This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.  For
 * example, if a bucket was last used 1000 jiffies ago, the next ID
 * jumps ahead by a random amount in [0, 1000) rather than by the number
 * of packets actually sent.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = READ_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* If UBSAN reports an error here, make sure your compiler
         * supports -fno-strict-overflow before reporting it: that was
         * a bug in UBSAN, and it has been fixed in GCC-8.
         */
        return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash, id;

        /* Note the following code is not safe, but this is okay: a race
         * on the lazy ip_id_key initialisation at worst perturbs which
         * IDs get generated.
         */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4.ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

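/* Build a flow key from a packet's header.  When a socket is supplied,
 * its bound device, mark, TOS and protocol take precedence over the
 * values derived from the packet itself.
 */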
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32)daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_uses_gateway = 1;
                rt->rt_gw_family = AF_INET;
                rt->rt_gw4 = fnhe->fnhe_gw;
        }
}

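/* Create or update the next-hop exception (fnhe) for @daddr on this
 * nexthop.  Learned state such as a redirect gateway (@gw) or a path
 * MTU (@pmtu) lives here, outside the FIB proper, and ages out via
 * @expires; if a bucket chain grows past FNHE_RECLAIM_DEPTH, the
 * oldest entry is recycled.
 */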
static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
                                  __be32 gw, u32 pmtu, bool lock,
                                  unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nhc->nhc_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nhc->nhc_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nhc->nhc_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nhc->nhc_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;

                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

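/* Handle an ICMP redirect for this route: sanity-check the advertised
 * gateway (it must be a plausible unicast address and, on non-shared
 * media, on-link), resolve its neighbour entry, and record it as a
 * next-hop exception so subsequent lookups use the new gateway.
 */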
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh_common *nhc;

                                fib_select_path(net, &res, fl4, skb);
                                nhc = FIB_RES_NHC(res);
                                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them altogether,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->n_redirects == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->n_redirects)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->n_redirects == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

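/* Input-path error handler, called when the route's dst.error is set.
 * Bumps the relevant SNMP counters and, subject to a per-peer token
 * bucket (ip_rt_error_cost/ip_rt_error_burst), answers with the
 * matching ICMP destination-unreachable before dropping the packet.
 */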
static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

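/* Record a reduced path MTU for this flow as a next-hop exception.
 * Values below ip_rt_min_pmtu are clamped and the entry is locked so
 * that it cannot be lowered any further; the exception expires after
 * ip_rt_mtu_expires.
 */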
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct net *net = dev_net(dst->dev);
        struct fib_result res;
        bool lock = false;
        u32 old_mtu;

        if (ip_mtu_locked(dst))
                return;

        old_mtu = ipv4_mtu(dst);
        if (old_mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(net, fl4, &res, 0) == 0) {
                struct fib_nh_common *nhc;

                fib_select_path(net, &res, fl4, NULL);
                nhc = FIB_RES_NHC(res);
                update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu,
                              bool confirm_neigh)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);

        /* Don't make lookup fail for bridged encapsulations */
        if (skb && netif_is_any_bridge_port(skb->dev))
                fl4.flowi4_oif = 0;

        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        u32 mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
                                                         u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD.
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}
EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        int res;

        /* Recompile ip options since IPCB may not be valid anymore.
         * Also check we have a reasonable ipv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        if (ip_hdr(skb)->ihl > 5) {
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct iphdr *iph = ip_hdr(skb);
                struct flowi4 fl4 = {
                        .daddr = iph->daddr,
                        .saddr = iph->saddr,
                        .flowi4_tos = RT_TOS(iph->tos),
                        .flowi4_oif = rt->dst.dev->ifindex,
                        .flowi4_iif = skb->dev->ifindex,
                        .flowi4_mark = skb->mark,
                };

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

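/* Effective MTU of a dst: a still-valid learned PMTU wins, then any
 * RTAX_MTU metric, then the device MTU, clamped to 576 for MTU-locked
 * gateway routes and reduced by any lwtunnel encapsulation headroom.
 */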
INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *)dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
EXPORT_INDIRECT_CALLABLE(ipv4_mtu);

static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nhc->nhc_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        hash += hval;

        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
                                               __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr) {
                        if (fnhe->fnhe_expires &&
                            time_after(jiffies, fnhe->fnhe_expires)) {
                                ip_del_fnhe(nhc, daddr);
                                break;
                        }
                        return fnhe;
                }
        }
        return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
        struct fib_nh_common *nhc = res->nhc;
        struct net_device *dev = nhc->nhc_dev;
        struct fib_info *fi = res->fi;
        u32 mtu = 0;

        if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
            fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
                mtu = fi->fib_mtu;

        if (likely(!mtu)) {
                struct fib_nh_exception *fnhe;

                fnhe = find_exception(nhc, daddr);
                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
                        mtu = fnhe->fnhe_pmtu;
        }

        if (likely(!mtu))
                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

        return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

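/* Bind @rt to a matching next-hop exception: copy the learned PMTU and
 * gateway into the route and, if @do_cache, store the route in the
 * exception's input or output slot so later lookups can reuse it.
 */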
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe->fnhe_mtu_locked = false;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gw4) {
                        rt->rt_gw4 = daddr;
                        rt->rt_gw_family = AF_INET;
                }

                if (do_cache) {
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

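/* Cache @rt in its nexthop: input routes go in nhc_rth_input, output
 * routes in the per-CPU output slot.  cmpxchg() guards against a
 * concurrent update; the displaced route is moved to the uncached list
 * until its refcount drains.
 */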
static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nhc->nhc_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
        }
        orig = *p;

        /* hold dst before doing cmpxchg() to avoid race condition
         * on this dst
         */
        dst_hold(&rt->dst);
        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig) {
                        rt_add_uncached_list(orig);
                        dst_release(&orig->dst);
                }
        } else {
                dst_release(&rt->dst);
                ret = false;
        }

        return ret;
}

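/* Routes that could not be cached in a nexthop keep a per-CPU list
 * membership, so rt_flush_dev() can still find them and re-point their
 * device at blackhole_netdev when the original device is unregistered.
 */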
struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

void rt_add_uncached_list(struct rtable *rt)
{
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

        rt->rt_uncached_list = ul;

        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->rt_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);
}

void rt_del_uncached_list(struct rtable *rt)
{
        if (!list_empty(&rt->rt_uncached)) {
                struct uncached_list *ul = rt->rt_uncached_list;

                spin_lock_bh(&ul->lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&ul->lock);
        }
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;

        ip_dst_metrics_put(dst);
        rt_del_uncached_list(rt);
}

void rt_flush_dev(struct net_device *dev)
{
        struct rtable *rt;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                spin_lock_bh(&ul->lock);
                list_for_each_entry(rt, &ul->head, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = blackhole_netdev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&ul->lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag,
                           const bool do_cache)
{
        bool cached = false;

        if (fi) {
                struct fib_nh_common *nhc = FIB_RES_NHC(*res);

                if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
                        rt->rt_uses_gateway = 1;
                        rt->rt_gw_family = nhc->nhc_gw_family;
                        /* only INET and INET6 are supported */
                        if (likely(nhc->nhc_gw_family == AF_INET))
                                rt->rt_gw4 = nhc->nhc_gw.ipv4;
                        else
                                rt->rt_gw6 = nhc->nhc_gw.ipv6;
                }

                ip_dst_init_metrics(&rt->dst, fi->fib_metrics);

#ifdef CONFIG_IP_ROUTE_CLASSID
                if (nhc->nhc_family == AF_INET) {
                        struct fib_nh *nh;

                        nh = container_of(nhc, struct fib_nh, nh_common);
                        rt->dst.tclassid = nh->nh_tclassid;
                }
#endif
                rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
                else if (do_cache)
                        cached = rt_cache_route(nhc, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
1617                          * route into the cache we really need to set it.
1618                          */
1619                         if (!rt->rt_gw4) {
1620                                 rt->rt_gw_family = AF_INET;
1621                                 rt->rt_gw4 = daddr;
1622                         }
1623                         rt_add_uncached_list(rt);
1624                 }
1625         } else
1626                 rt_add_uncached_list(rt);
1627
1628 #ifdef CONFIG_IP_ROUTE_CLASSID
1629 #ifdef CONFIG_IP_MULTIPLE_TABLES
1630         set_class_tag(rt, res->tclassid);
1631 #endif
1632         set_class_tag(rt, itag);
1633 #endif
1634 }
1635
1636 struct rtable *rt_dst_alloc(struct net_device *dev,
1637                             unsigned int flags, u16 type,
1638                             bool nopolicy, bool noxfrm)
1639 {
1640         struct rtable *rt;
1641
1642         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1643                        (nopolicy ? DST_NOPOLICY : 0) |
1644                        (noxfrm ? DST_NOXFRM : 0));
1645
1646         if (rt) {
1647                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1648                 rt->rt_flags = flags;
1649                 rt->rt_type = type;
1650                 rt->rt_is_input = 0;
1651                 rt->rt_iif = 0;
1652                 rt->rt_pmtu = 0;
1653                 rt->rt_mtu_locked = 0;
1654                 rt->rt_uses_gateway = 0;
1655                 rt->rt_gw_family = 0;
1656                 rt->rt_gw4 = 0;
1657                 INIT_LIST_HEAD(&rt->rt_uncached);
1658
1659                 rt->dst.output = ip_output;
1660                 if (flags & RTCF_LOCAL)
1661                         rt->dst.input = ip_local_deliver;
1662         }
1663
1664         return rt;
1665 }
1666 EXPORT_SYMBOL(rt_dst_alloc);
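
/* Usage sketch (illustrative only): a typical caller allocates the dst and
 * then overrides the input/output handlers as needed, as the multicast
 * input path below does:
 *
 *	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
 *			   IN_DEV_ORCONF(in_dev, NOPOLICY), false);
 *	if (!rth)
 *		return -ENOBUFS;
 *	rth->dst.output = ip_rt_bug;
 *	rth->rt_is_input = 1;
 */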
1667
1668 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1669 {
1670         struct rtable *new_rt;
1671
1672         new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1673                            rt->dst.flags);
1674
1675         if (new_rt) {
1676                 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1677                 new_rt->rt_flags = rt->rt_flags;
1678                 new_rt->rt_type = rt->rt_type;
1679                 new_rt->rt_is_input = rt->rt_is_input;
1680                 new_rt->rt_iif = rt->rt_iif;
1681                 new_rt->rt_pmtu = rt->rt_pmtu;
1682                 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1683                 new_rt->rt_gw_family = rt->rt_gw_family;
1684                 if (rt->rt_gw_family == AF_INET)
1685                         new_rt->rt_gw4 = rt->rt_gw4;
1686                 else if (rt->rt_gw_family == AF_INET6)
1687                         new_rt->rt_gw6 = rt->rt_gw6;
1688                 INIT_LIST_HEAD(&new_rt->rt_uncached);
1689
1690                 new_rt->dst.input = rt->dst.input;
1691                 new_rt->dst.output = rt->dst.output;
1692                 new_rt->dst.error = rt->dst.error;
1693                 new_rt->dst.lastuse = jiffies;
1694                 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1695         }
1696         return new_rt;
1697 }
1698 EXPORT_SYMBOL(rt_dst_clone);
1699
1700 /* called in rcu_read_lock() section */
1701 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1702                           u8 tos, struct net_device *dev,
1703                           struct in_device *in_dev, u32 *itag)
1704 {
1705         int err;
1706
1707         /* Primary sanity checks. */
1708         if (!in_dev)
1709                 return -EINVAL;
1710
1711         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1712             skb->protocol != htons(ETH_P_IP))
1713                 return -EINVAL;
1714
1715         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1716                 return -EINVAL;
1717
1718         if (ipv4_is_zeronet(saddr)) {
1719                 if (!ipv4_is_local_multicast(daddr) &&
1720                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1721                         return -EINVAL;
1722         } else {
1723                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1724                                           in_dev, itag);
1725                 if (err < 0)
1726                         return err;
1727         }
1728         return 0;
1729 }
1730
1731 /* called in rcu_read_lock() section */
1732 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1733                              u8 tos, struct net_device *dev, int our)
1734 {
1735         struct in_device *in_dev = __in_dev_get_rcu(dev);
1736         unsigned int flags = RTCF_MULTICAST;
1737         struct rtable *rth;
1738         u32 itag = 0;
1739         int err;
1740
1741         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1742         if (err)
1743                 return err;
1744
1745         if (our)
1746                 flags |= RTCF_LOCAL;
1747
1748         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1749                            IN_DEV_ORCONF(in_dev, NOPOLICY), false);
1750         if (!rth)
1751                 return -ENOBUFS;
1752
1753 #ifdef CONFIG_IP_ROUTE_CLASSID
1754         rth->dst.tclassid = itag;
1755 #endif
1756         rth->dst.output = ip_rt_bug;
1757         rth->rt_is_input = 1;
1758
1759 #ifdef CONFIG_IP_MROUTE
1760         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1761                 rth->dst.input = ip_mr_input;
1762 #endif
1763         RT_CACHE_STAT_INC(in_slow_mc);
1764
1765         skb_dst_set(skb, &rth->dst);
1766         return 0;
1767 }
1768
1769
1770 static void ip_handle_martian_source(struct net_device *dev,
1771                                      struct in_device *in_dev,
1772                                      struct sk_buff *skb,
1773                                      __be32 daddr,
1774                                      __be32 saddr)
1775 {
1776         RT_CACHE_STAT_INC(in_martian_src);
1777 #ifdef CONFIG_IP_ROUTE_VERBOSE
1778         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1779                 /*
1780                  *      RFC1812 recommendation: if the source is martian,
1781                  *      the only hint is the MAC header.
1782                  */
1783                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1784                         &daddr, &saddr, dev->name);
1785                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1786                         print_hex_dump(KERN_WARNING, "ll header: ",
1787                                        DUMP_PREFIX_OFFSET, 16, 1,
1788                                        skb_mac_header(skb),
1789                                        dev->hard_header_len, false);
1790                 }
1791         }
1792 #endif
1793 }
1794
1795 /* called in rcu_read_lock() section */
1796 static int __mkroute_input(struct sk_buff *skb,
1797                            const struct fib_result *res,
1798                            struct in_device *in_dev,
1799                            __be32 daddr, __be32 saddr, u32 tos)
1800 {
1801         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1802         struct net_device *dev = nhc->nhc_dev;
1803         struct fib_nh_exception *fnhe;
1804         struct rtable *rth;
1805         int err;
1806         struct in_device *out_dev;
1807         bool do_cache;
1808         u32 itag = 0;
1809
1810         /* get a working reference to the output device */
1811         out_dev = __in_dev_get_rcu(dev);
1812         if (!out_dev) {
1813                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1814                 return -EINVAL;
1815         }
1816
1817         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1818                                   in_dev->dev, in_dev, &itag);
1819         if (err < 0) {
1820                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1821                                          saddr);
1822
1823                 goto cleanup;
1824         }
1825
1826         do_cache = res->fi && !itag;
1827         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1828             skb->protocol == htons(ETH_P_IP)) {
1829                 __be32 gw;
1830
1831                 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1832                 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1833                     inet_addr_onlink(out_dev, saddr, gw))
1834                         IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1835         }
1836
1837         if (skb->protocol != htons(ETH_P_IP)) {
1838                 /* Not IP (i.e. ARP). Do not create a route if it is
1839                  * invalid for proxy arp. DNAT routes are always valid.
1840                  *
1841                  * The proxy arp feature has been extended to allow ARP
1842                  * replies back to the same interface, to support
1843                  * Private VLAN switch technologies. See arp.c.
1844                  */
1845                 if (out_dev == in_dev &&
1846                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1847                         err = -EINVAL;
1848                         goto cleanup;
1849                 }
1850         }
1851
1852         fnhe = find_exception(nhc, daddr);
1853         if (do_cache) {
1854                 if (fnhe)
1855                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1856                 else
1857                         rth = rcu_dereference(nhc->nhc_rth_input);
1858                 if (rt_cache_valid(rth)) {
1859                         skb_dst_set_noref(skb, &rth->dst);
1860                         goto out;
1861                 }
1862         }
1863
1864         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1865                            IN_DEV_ORCONF(in_dev, NOPOLICY),
1866                            IN_DEV_ORCONF(out_dev, NOXFRM));
1867         if (!rth) {
1868                 err = -ENOBUFS;
1869                 goto cleanup;
1870         }
1871
1872         rth->rt_is_input = 1;
1873         RT_CACHE_STAT_INC(in_slow_tot);
1874
1875         rth->dst.input = ip_forward;
1876
1877         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1878                        do_cache);
1879         lwtunnel_set_redirect(&rth->dst);
1880         skb_dst_set(skb, &rth->dst);
1881 out:
1882         err = 0;
1883  cleanup:
1884         return err;
1885 }
1886
1887 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1888 /* To make ICMP packets follow the right flow, the multipath hash is
1889  * calculated from the inner IP addresses.
1890  */
1891 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1892                                  struct flow_keys *hash_keys)
1893 {
1894         const struct iphdr *outer_iph = ip_hdr(skb);
1895         const struct iphdr *key_iph = outer_iph;
1896         const struct iphdr *inner_iph;
1897         const struct icmphdr *icmph;
1898         struct iphdr _inner_iph;
1899         struct icmphdr _icmph;
1900
1901         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1902                 goto out;
1903
1904         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1905                 goto out;
1906
1907         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1908                                    &_icmph);
1909         if (!icmph)
1910                 goto out;
1911
1912         if (!icmp_is_err(icmph->type))
1913                 goto out;
1914
1915         inner_iph = skb_header_pointer(skb,
1916                                        outer_iph->ihl * 4 + sizeof(_icmph),
1917                                        sizeof(_inner_iph), &_inner_iph);
1918         if (!inner_iph)
1919                 goto out;
1920
1921         key_iph = inner_iph;
1922 out:
1923         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1924         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1925 }
1926
1927 /* if skb is set it will be used and fl4 can be NULL */
1928 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1929                        const struct sk_buff *skb, struct flow_keys *flkeys)
1930 {
1931         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1932         struct flow_keys hash_keys;
1933         u32 mhash;
1934
1935         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1936         case 0:
1937                 memset(&hash_keys, 0, sizeof(hash_keys));
1938                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1939                 if (skb) {
1940                         ip_multipath_l3_keys(skb, &hash_keys);
1941                 } else {
1942                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1943                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1944                 }
1945                 break;
1946         case 1:
1947                 /* skb is currently provided only when forwarding */
1948                 if (skb) {
1949                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1950                         struct flow_keys keys;
1951
1952                         /* short-circuit if we already have L4 hash present */
1953                         if (skb->l4_hash)
1954                                 return skb_get_hash_raw(skb) >> 1;
1955
1956                         memset(&hash_keys, 0, sizeof(hash_keys));
1957
1958                         if (!flkeys) {
1959                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1960                                 flkeys = &keys;
1961                         }
1962
1963                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1964                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1965                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1966                         hash_keys.ports.src = flkeys->ports.src;
1967                         hash_keys.ports.dst = flkeys->ports.dst;
1968                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1969                 } else {
1970                         memset(&hash_keys, 0, sizeof(hash_keys));
1971                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1972                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1973                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1974                         hash_keys.ports.src = fl4->fl4_sport;
1975                         hash_keys.ports.dst = fl4->fl4_dport;
1976                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1977                 }
1978                 break;
1979         case 2:
1980                 memset(&hash_keys, 0, sizeof(hash_keys));
1981                 /* skb is currently provided only when forwarding */
1982                 if (skb) {
1983                         struct flow_keys keys;
1984
1985                         skb_flow_dissect_flow_keys(skb, &keys, 0);
1986                         /* Inner can be v4 or v6 */
1987                         if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1988                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1989                                 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1990                                 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1991                         } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1992                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1993                                 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1994                                 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1995                                 hash_keys.tags.flow_label = keys.tags.flow_label;
1996                                 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1997                         } else {
1998                                 /* Same as case 0 */
1999                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2000                                 ip_multipath_l3_keys(skb, &hash_keys);
2001                         }
2002                 } else {
2003                         /* Same as case 0 */
2004                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2005                         hash_keys.addrs.v4addrs.src = fl4->saddr;
2006                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
2007                 }
2008                 break;
2009         }
2010         mhash = flow_hash_from_keys(&hash_keys);
2011
2012         if (multipath_hash)
2013                 mhash = jhash_2words(mhash, multipath_hash, 0);
2014
2015         return mhash >> 1;
2016 }
2017 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
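
/* Usage sketch (illustrative only): the returned hash feeds nexthop
 * selection, as ip_mkroute_input() does below:
 *
 *	int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
 *
 *	fib_select_multipath(res, h);
 */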
2018
2019 static int ip_mkroute_input(struct sk_buff *skb,
2020                             struct fib_result *res,
2021                             struct in_device *in_dev,
2022                             __be32 daddr, __be32 saddr, u32 tos,
2023                             struct flow_keys *hkeys)
2024 {
2025 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2026         if (res->fi && fib_info_num_path(res->fi) > 1) {
2027                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2028
2029                 fib_select_multipath(res, h);
2030         }
2031 #endif
2032
2033         /* create a routing cache entry */
2034         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2035 }
2036
2037 /* Implements the same saddr-related checks as ip_route_input_slow(),
2038  * assuming daddr is valid and the destination is not a local broadcast
2039  * address. Uses the provided hint instead of performing a route lookup.
2040  */
2041 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2042                       u8 tos, struct net_device *dev,
2043                       const struct sk_buff *hint)
2044 {
2045         struct in_device *in_dev = __in_dev_get_rcu(dev);
2046         struct rtable *rt = skb_rtable(hint);
2047         struct net *net = dev_net(dev);
2048         int err = -EINVAL;
2049         u32 tag = 0;
2050
2051         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2052                 goto martian_source;
2053
2054         if (ipv4_is_zeronet(saddr))
2055                 goto martian_source;
2056
2057         if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2058                 goto martian_source;
2059
2060         if (rt->rt_type != RTN_LOCAL)
2061                 goto skip_validate_source;
2062
2063         tos &= IPTOS_RT_MASK;
2064         err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2065         if (err < 0)
2066                 goto martian_source;
2067
2068 skip_validate_source:
2069         skb_dst_copy(skb, hint);
2070         return 0;
2071
2072 martian_source:
2073         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2074         return err;
2075 }
2076
2077 /*
2078  *      NOTE. We drop all packets that have a local source
2079  *      address, because every properly looped back packet
2080  *      must already have the correct destination attached by the
2081  *      output routine. Changes in the enforced policies must also
2082  *      be applied to ip_route_use_hint().
2083  *
2084  *      This approach solves two big problems:
2085  *      1. Non-simplex devices are handled properly.
2086  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2087  *      called with rcu_read_lock()
2088  */
2089
2090 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2091                                u8 tos, struct net_device *dev,
2092                                struct fib_result *res)
2093 {
2094         struct in_device *in_dev = __in_dev_get_rcu(dev);
2095         struct flow_keys *flkeys = NULL, _flkeys;
2096         struct net    *net = dev_net(dev);
2097         struct ip_tunnel_info *tun_info;
2098         int             err = -EINVAL;
2099         unsigned int    flags = 0;
2100         u32             itag = 0;
2101         struct rtable   *rth;
2102         struct flowi4   fl4;
2103         bool do_cache = true;
2104
2105         /* IP on this device is disabled. */
2106
2107         if (!in_dev)
2108                 goto out;
2109
2110         /* Check for the weirdest martians, which cannot be detected
2111            by fib_lookup.
2112          */
2113
2114         tun_info = skb_tunnel_info(skb);
2115         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2116                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2117         else
2118                 fl4.flowi4_tun_key.tun_id = 0;
2119         skb_dst_drop(skb);
2120
2121         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2122                 goto martian_source;
2123
2124         res->fi = NULL;
2125         res->table = NULL;
2126         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2127                 goto brd_input;
2128
2129         /* Accept zero addresses only for limited broadcast;
2130          * I do not even know whether to fix this or not. Waiting for complaints :-)
2131          */
2132         if (ipv4_is_zeronet(saddr))
2133                 goto martian_source;
2134
2135         if (ipv4_is_zeronet(daddr))
2136                 goto martian_destination;
2137
2138         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
2139          * more than once when daddr and/or saddr is a loopback address.
2140          */
2141         if (ipv4_is_loopback(daddr)) {
2142                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2143                         goto martian_destination;
2144         } else if (ipv4_is_loopback(saddr)) {
2145                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2146                         goto martian_source;
2147         }
2148
2149         /*
2150          *      Now we are ready to route packet.
2151          */
2152         fl4.flowi4_oif = 0;
2153         fl4.flowi4_iif = dev->ifindex;
2154         fl4.flowi4_mark = skb->mark;
2155         fl4.flowi4_tos = tos;
2156         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2157         fl4.flowi4_flags = 0;
2158         fl4.daddr = daddr;
2159         fl4.saddr = saddr;
2160         fl4.flowi4_uid = sock_net_uid(net, NULL);
2161         fl4.flowi4_multipath_hash = 0;
2162
2163         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2164                 flkeys = &_flkeys;
2165         } else {
2166                 fl4.flowi4_proto = 0;
2167                 fl4.fl4_sport = 0;
2168                 fl4.fl4_dport = 0;
2169         }
2170
2171         err = fib_lookup(net, &fl4, res, 0);
2172         if (err != 0) {
2173                 if (!IN_DEV_FORWARD(in_dev))
2174                         err = -EHOSTUNREACH;
2175                 goto no_route;
2176         }
2177
2178         if (res->type == RTN_BROADCAST) {
2179                 if (IN_DEV_BFORWARD(in_dev))
2180                         goto make_route;
2181                 /* do not cache if bc_forwarding is enabled */
2182                 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2183                         do_cache = false;
2184                 goto brd_input;
2185         }
2186
2187         if (res->type == RTN_LOCAL) {
2188                 err = fib_validate_source(skb, saddr, daddr, tos,
2189                                           0, dev, in_dev, &itag);
2190                 if (err < 0)
2191                         goto martian_source;
2192                 goto local_input;
2193         }
2194
2195         if (!IN_DEV_FORWARD(in_dev)) {
2196                 err = -EHOSTUNREACH;
2197                 goto no_route;
2198         }
2199         if (res->type != RTN_UNICAST)
2200                 goto martian_destination;
2201
2202 make_route:
2203         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2204 out:    return err;
2205
2206 brd_input:
2207         if (skb->protocol != htons(ETH_P_IP))
2208                 goto e_inval;
2209
2210         if (!ipv4_is_zeronet(saddr)) {
2211                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2212                                           in_dev, &itag);
2213                 if (err < 0)
2214                         goto martian_source;
2215         }
2216         flags |= RTCF_BROADCAST;
2217         res->type = RTN_BROADCAST;
2218         RT_CACHE_STAT_INC(in_brd);
2219
2220 local_input:
2221         do_cache &= res->fi && !itag;
2222         if (do_cache) {
2223                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2224
2225                 rth = rcu_dereference(nhc->nhc_rth_input);
2226                 if (rt_cache_valid(rth)) {
2227                         skb_dst_set_noref(skb, &rth->dst);
2228                         err = 0;
2229                         goto out;
2230                 }
2231         }
2232
2233         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2234                            flags | RTCF_LOCAL, res->type,
2235                            IN_DEV_ORCONF(in_dev, NOPOLICY), false);
2236         if (!rth)
2237                 goto e_nobufs;
2238
2239         rth->dst.output = ip_rt_bug;
2240 #ifdef CONFIG_IP_ROUTE_CLASSID
2241         rth->dst.tclassid = itag;
2242 #endif
2243         rth->rt_is_input = 1;
2244
2245         RT_CACHE_STAT_INC(in_slow_tot);
2246         if (res->type == RTN_UNREACHABLE) {
2247                 rth->dst.input = ip_error;
2248                 rth->dst.error = -err;
2249                 rth->rt_flags   &= ~RTCF_LOCAL;
2250         }
2251
2252         if (do_cache) {
2253                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2254
2255                 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2256                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2257                         WARN_ON(rth->dst.input == lwtunnel_input);
2258                         rth->dst.lwtstate->orig_input = rth->dst.input;
2259                         rth->dst.input = lwtunnel_input;
2260                 }
2261
2262                 if (unlikely(!rt_cache_route(nhc, rth)))
2263                         rt_add_uncached_list(rth);
2264         }
2265         skb_dst_set(skb, &rth->dst);
2266         err = 0;
2267         goto out;
2268
2269 no_route:
2270         RT_CACHE_STAT_INC(in_no_route);
2271         res->type = RTN_UNREACHABLE;
2272         res->fi = NULL;
2273         res->table = NULL;
2274         goto local_input;
2275
2276         /*
2277          *      Do not cache martian addresses: they should be logged (RFC1812)
2278          */
2279 martian_destination:
2280         RT_CACHE_STAT_INC(in_martian_dst);
2281 #ifdef CONFIG_IP_ROUTE_VERBOSE
2282         if (IN_DEV_LOG_MARTIANS(in_dev))
2283                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2284                                      &daddr, &saddr, dev->name);
2285 #endif
2286
2287 e_inval:
2288         err = -EINVAL;
2289         goto out;
2290
2291 e_nobufs:
2292         err = -ENOBUFS;
2293         goto out;
2294
2295 martian_source:
2296         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2297         goto out;
2298 }
2299
2300 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2301                          u8 tos, struct net_device *dev)
2302 {
2303         struct fib_result res;
2304         int err;
2305
2306         tos &= IPTOS_RT_MASK;
2307         rcu_read_lock();
2308         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2309         rcu_read_unlock();
2310
2311         return err;
2312 }
2313 EXPORT_SYMBOL(ip_route_input_noref);
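
/* Usage sketch (illustrative, mirroring the usual receive path): the caller
 * routes an incoming skb using the addresses from its IP header, and the
 * result is attached as the skb's dst:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err;
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 *	if (err)
 *		goto drop;
 */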
2314
2315 /* called with rcu_read_lock held */
2316 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2317                        u8 tos, struct net_device *dev, struct fib_result *res)
2318 {
2319         /* Multicast recognition logic is moved from the route cache to here.
2320            The problem was that too many Ethernet cards have broken/missing
2321            hardware multicast filters :-( As a result, a host on a multicast
2322            network acquires a lot of useless route cache entries, e.g. for
2323            SDR messages from all over the world. Now we try to get rid of them.
2324            Really, provided the software IP multicast filter is organized
2325            reasonably (at least, hashed), it does not result in a slowdown
2326            compared with route cache reject entries.
2327            Note that multicast routers are not affected, because a
2328            route cache entry is created eventually.
2329          */
2330         if (ipv4_is_multicast(daddr)) {
2331                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2332                 int our = 0;
2333                 int err = -EINVAL;
2334
2335                 if (!in_dev)
2336                         return err;
2337                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2338                                       ip_hdr(skb)->protocol);
2339
2340                 /* check l3 master if no match yet */
2341                 if (!our && netif_is_l3_slave(dev)) {
2342                         struct in_device *l3_in_dev;
2343
2344                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2345                         if (l3_in_dev)
2346                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2347                                                       ip_hdr(skb)->protocol);
2348                 }
2349
2350                 if (our
2351 #ifdef CONFIG_IP_MROUTE
2352                         ||
2353                     (!ipv4_is_local_multicast(daddr) &&
2354                      IN_DEV_MFORWARD(in_dev))
2355 #endif
2356                    ) {
2357                         err = ip_route_input_mc(skb, daddr, saddr,
2358                                                 tos, dev, our);
2359                 }
2360                 return err;
2361         }
2362
2363         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2364 }
2365
2366 /* called with rcu_read_lock() */
2367 static struct rtable *__mkroute_output(const struct fib_result *res,
2368                                        const struct flowi4 *fl4, int orig_oif,
2369                                        struct net_device *dev_out,
2370                                        unsigned int flags)
2371 {
2372         struct fib_info *fi = res->fi;
2373         struct fib_nh_exception *fnhe;
2374         struct in_device *in_dev;
2375         u16 type = res->type;
2376         struct rtable *rth;
2377         bool do_cache;
2378
2379         in_dev = __in_dev_get_rcu(dev_out);
2380         if (!in_dev)
2381                 return ERR_PTR(-EINVAL);
2382
2383         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2384                 if (ipv4_is_loopback(fl4->saddr) &&
2385                     !(dev_out->flags & IFF_LOOPBACK) &&
2386                     !netif_is_l3_master(dev_out))
2387                         return ERR_PTR(-EINVAL);
2388
2389         if (ipv4_is_lbcast(fl4->daddr))
2390                 type = RTN_BROADCAST;
2391         else if (ipv4_is_multicast(fl4->daddr))
2392                 type = RTN_MULTICAST;
2393         else if (ipv4_is_zeronet(fl4->daddr))
2394                 return ERR_PTR(-EINVAL);
2395
2396         if (dev_out->flags & IFF_LOOPBACK)
2397                 flags |= RTCF_LOCAL;
2398
2399         do_cache = true;
2400         if (type == RTN_BROADCAST) {
2401                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2402                 fi = NULL;
2403         } else if (type == RTN_MULTICAST) {
2404                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2405                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2406                                      fl4->flowi4_proto))
2407                         flags &= ~RTCF_LOCAL;
2408                 else
2409                         do_cache = false;
2410                 /* If a multicast route does not exist, use the
2411                  * default one, but do not use a gateway in this case.
2412                  * Yes, it is a hack.
2413                  */
2414                 if (fi && res->prefixlen < 4)
2415                         fi = NULL;
2416         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2417                    (orig_oif != dev_out->ifindex)) {
2418                 /* For local routes that require a particular output interface
2419                  * we do not want to cache the result.  Caching the result
2420                  * causes incorrect behaviour when there are multiple source
2421                  * addresses on the interface, the end result being that if the
2422                  * intended recipient is waiting on that interface for the
2423                  * packet, he won't receive it because it will be delivered on
2424                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2425                  * be set to the loopback interface as well.
2426                  */
2427                 do_cache = false;
2428         }
2429
2430         fnhe = NULL;
2431         do_cache &= fi != NULL;
2432         if (fi) {
2433                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2434                 struct rtable __rcu **prth;
2435
2436                 fnhe = find_exception(nhc, fl4->daddr);
2437                 if (!do_cache)
2438                         goto add;
2439                 if (fnhe) {
2440                         prth = &fnhe->fnhe_rth_output;
2441                 } else {
2442                         if (unlikely(fl4->flowi4_flags &
2443                                      FLOWI_FLAG_KNOWN_NH &&
2444                                      !(nhc->nhc_gw_family &&
2445                                        nhc->nhc_scope == RT_SCOPE_LINK))) {
2446                                 do_cache = false;
2447                                 goto add;
2448                         }
2449                         prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2450                 }
2451                 rth = rcu_dereference(*prth);
2452                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2453                         return rth;
2454         }
2455
2456 add:
2457         rth = rt_dst_alloc(dev_out, flags, type,
2458                            IN_DEV_ORCONF(in_dev, NOPOLICY),
2459                            IN_DEV_ORCONF(in_dev, NOXFRM));
2460         if (!rth)
2461                 return ERR_PTR(-ENOBUFS);
2462
2463         rth->rt_iif = orig_oif;
2464
2465         RT_CACHE_STAT_INC(out_slow_tot);
2466
2467         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2468                 if (flags & RTCF_LOCAL &&
2469                     !(dev_out->flags & IFF_LOOPBACK)) {
2470                         rth->dst.output = ip_mc_output;
2471                         RT_CACHE_STAT_INC(out_slow_mc);
2472                 }
2473 #ifdef CONFIG_IP_MROUTE
2474                 if (type == RTN_MULTICAST) {
2475                         if (IN_DEV_MFORWARD(in_dev) &&
2476                             !ipv4_is_local_multicast(fl4->daddr)) {
2477                                 rth->dst.input = ip_mr_input;
2478                                 rth->dst.output = ip_mc_output;
2479                         }
2480                 }
2481 #endif
2482         }
2483
2484         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2485         lwtunnel_set_redirect(&rth->dst);
2486
2487         return rth;
2488 }
2489
2490 /*
2491  * Major route resolver routine.
2492  */
2493
2494 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2495                                         const struct sk_buff *skb)
2496 {
2497         __u8 tos = RT_FL_TOS(fl4);
2498         struct fib_result res = {
2499                 .type           = RTN_UNSPEC,
2500                 .fi             = NULL,
2501                 .table          = NULL,
2502                 .tclassid       = 0,
2503         };
2504         struct rtable *rth;
2505
2506         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2507         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2508         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2509                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2510
2511         rcu_read_lock();
2512         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2513         rcu_read_unlock();
2514
2515         return rth;
2516 }
2517 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
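
/* Usage sketch (illustrative only): output lookups normally reach this
 * resolver via ip_route_output_key(); the caller fills a flowi4 key and
 * must release the route with ip_rt_put() when done (daddr, saddr and tos
 * here stand for the caller's own values):
 *
 *	struct flowi4 fl4 = {
 *		.daddr = daddr,
 *		.saddr = saddr,
 *		.flowi4_tos = RT_TOS(tos),
 *	};
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */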
2518
2519 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2520                                             struct fib_result *res,
2521                                             const struct sk_buff *skb)
2522 {
2523         struct net_device *dev_out = NULL;
2524         int orig_oif = fl4->flowi4_oif;
2525         unsigned int flags = 0;
2526         struct rtable *rth;
2527         int err;
2528
2529         if (fl4->saddr) {
2530                 if (ipv4_is_multicast(fl4->saddr) ||
2531                     ipv4_is_lbcast(fl4->saddr) ||
2532                     ipv4_is_zeronet(fl4->saddr)) {
2533                         rth = ERR_PTR(-EINVAL);
2534                         goto out;
2535                 }
2536
2537                 rth = ERR_PTR(-ENETUNREACH);
2538
2539                 /* I removed the check for oif == dev_out->oif here.
2540                    It was wrong for two reasons:
2541                    1. ip_dev_find(net, saddr) can return the wrong iface if
2542                       saddr is assigned to multiple interfaces.
2543                    2. Moreover, we are allowed to send packets with the saddr
2544                       of another iface. --ANK
2545                  */
2546
2547                 if (fl4->flowi4_oif == 0 &&
2548                     (ipv4_is_multicast(fl4->daddr) ||
2549                      ipv4_is_lbcast(fl4->daddr))) {
2550                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2551                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2552                         if (!dev_out)
2553                                 goto out;
2554
2555                         /* Special hack: the user can direct multicasts
2556                            and limited broadcast via the necessary interface
2557                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2558                            This hack is not just for fun, it allows
2559                            vic, vat and friends to work.
2560                            They bind the socket to loopback, set ttl to zero
2561                            and expect that it will work.
2562                            From the viewpoint of the routing cache they are broken,
2563                            because we are not allowed to build a multicast path
2564                            with a loopback source addr (look, the routing cache
2565                            cannot know that ttl is zero, so that the packet
2566                            will not leave this host and the route is valid).
2567                            Luckily, this hack is a good workaround.
2568                          */
2569
2570                         fl4->flowi4_oif = dev_out->ifindex;
2571                         goto make_route;
2572                 }
2573
2574                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2575                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2576                         if (!__ip_dev_find(net, fl4->saddr, false))
2577                                 goto out;
2578                 }
2579         }
2580
2581
2582         if (fl4->flowi4_oif) {
2583                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2584                 rth = ERR_PTR(-ENODEV);
2585                 if (!dev_out)
2586                         goto out;
2587
2588                 /* RACE: Check return value of inet_select_addr instead. */
2589                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2590                         rth = ERR_PTR(-ENETUNREACH);
2591                         goto out;
2592                 }
2593                 if (ipv4_is_local_multicast(fl4->daddr) ||
2594                     ipv4_is_lbcast(fl4->daddr) ||
2595                     fl4->flowi4_proto == IPPROTO_IGMP) {
2596                         if (!fl4->saddr)
2597                                 fl4->saddr = inet_select_addr(dev_out, 0,
2598                                                               RT_SCOPE_LINK);
2599                         goto make_route;
2600                 }
2601                 if (!fl4->saddr) {
2602                         if (ipv4_is_multicast(fl4->daddr))
2603                                 fl4->saddr = inet_select_addr(dev_out, 0,
2604                                                               fl4->flowi4_scope);
2605                         else if (!fl4->daddr)
2606                                 fl4->saddr = inet_select_addr(dev_out, 0,
2607                                                               RT_SCOPE_HOST);
2608                 }
2609         }
2610
2611         if (!fl4->daddr) {
2612                 fl4->daddr = fl4->saddr;
2613                 if (!fl4->daddr)
2614                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2615                 dev_out = net->loopback_dev;
2616                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2617                 res->type = RTN_LOCAL;
2618                 flags |= RTCF_LOCAL;
2619                 goto make_route;
2620         }
2621
2622         err = fib_lookup(net, fl4, res, 0);
2623         if (err) {
2624                 res->fi = NULL;
2625                 res->table = NULL;
2626                 if (fl4->flowi4_oif &&
2627                     (ipv4_is_multicast(fl4->daddr) ||
2628                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2629                         /* Apparently, the routing tables are wrong. Assume
2630                            that the destination is on-link.
2631
2632                            WHY? DW.
2633                            Because we are allowed to send to an iface
2634                            even if it has NO routes and NO assigned
2635                            addresses. When oif is specified, routing
2636                            tables are looked up with only one purpose:
2637                            to catch whether the destination is gatewayed,
2638                            rather than direct. Moreover, if MSG_DONTROUTE is
2639                            set, we send the packet, ignoring both routing
2640                            tables and ifaddr state. --ANK
2641
2642
2643                            We could do this even if oif is unknown,
2644                            likely as IPv6 does, but we do not.
2645                          */
2646
2647                         if (fl4->saddr == 0)
2648                                 fl4->saddr = inet_select_addr(dev_out, 0,
2649                                                               RT_SCOPE_LINK);
2650                         res->type = RTN_UNICAST;
2651                         goto make_route;
2652                 }
2653                 rth = ERR_PTR(err);
2654                 goto out;
2655         }
2656
2657         if (res->type == RTN_LOCAL) {
2658                 if (!fl4->saddr) {
2659                         if (res->fi->fib_prefsrc)
2660                                 fl4->saddr = res->fi->fib_prefsrc;
2661                         else
2662                                 fl4->saddr = fl4->daddr;
2663                 }
2664
2665                 /* L3 master device is the loopback for that domain */
2666                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2667                         net->loopback_dev;
2668
2669                 /* make sure orig_oif points to fib result device even
2670                  * though packet rx/tx happens over loopback or l3mdev
2671                  */
2672                 orig_oif = FIB_RES_OIF(*res);
2673
2674                 fl4->flowi4_oif = dev_out->ifindex;
2675                 flags |= RTCF_LOCAL;
2676                 goto make_route;
2677         }
2678
2679         fib_select_path(net, res, fl4, skb);
2680
2681         dev_out = FIB_RES_DEV(*res);
2682
2683 make_route:
2684         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2685
2686 out:
2687         return rth;
2688 }
2689
2690 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2691 {
2692         return NULL;
2693 }
2694
2695 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2696 {
2697         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2698
2699         return mtu ? : dst->dev->mtu;
2700 }
2701
2702 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2703                                           struct sk_buff *skb, u32 mtu,
2704                                           bool confirm_neigh)
2705 {
2706 }
2707
2708 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2709                                        struct sk_buff *skb)
2710 {
2711 }
2712
2713 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2714                                           unsigned long old)
2715 {
2716         return NULL;
2717 }
2718
2719 static struct dst_ops ipv4_dst_blackhole_ops = {
2720         .family                 =       AF_INET,
2721         .check                  =       ipv4_blackhole_dst_check,
2722         .mtu                    =       ipv4_blackhole_mtu,
2723         .default_advmss         =       ipv4_default_advmss,
2724         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2725         .redirect               =       ipv4_rt_blackhole_redirect,
2726         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2727         .neigh_lookup           =       ipv4_neigh_lookup,
2728 };
2729
2730 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2731 {
2732         struct rtable *ort = (struct rtable *) dst_orig;
2733         struct rtable *rt;
2734
2735         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2736         if (rt) {
2737                 struct dst_entry *new = &rt->dst;
2738
2739                 new->__use = 1;
2740                 new->input = dst_discard;
2741                 new->output = dst_discard_out;
2742
2743                 new->dev = net->loopback_dev;
2744                 if (new->dev)
2745                         dev_hold(new->dev);
2746
2747                 rt->rt_is_input = ort->rt_is_input;
2748                 rt->rt_iif = ort->rt_iif;
2749                 rt->rt_pmtu = ort->rt_pmtu;
2750                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2751
2752                 rt->rt_genid = rt_genid_ipv4(net);
2753                 rt->rt_flags = ort->rt_flags;
2754                 rt->rt_type = ort->rt_type;
2755                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2756                 rt->rt_gw_family = ort->rt_gw_family;
2757                 if (rt->rt_gw_family == AF_INET)
2758                         rt->rt_gw4 = ort->rt_gw4;
2759                 else if (rt->rt_gw_family == AF_INET6)
2760                         rt->rt_gw6 = ort->rt_gw6;
2761
2762                 INIT_LIST_HEAD(&rt->rt_uncached);
2763         }
2764
2765         dst_release(dst_orig);
2766
2767         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2768 }
2769
2770 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2771                                     const struct sock *sk)
2772 {
2773         struct rtable *rt = __ip_route_output_key(net, flp4);
2774
2775         if (IS_ERR(rt))
2776                 return rt;
2777
2778         if (flp4->flowi4_proto) {
2779                 flp4->flowi4_oif = rt->dst.dev->ifindex;
2780                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2781                                                         flowi4_to_flowi(flp4),
2782                                                         sk, 0);
2783         }
2784
2785         return rt;
2786 }
2787 EXPORT_SYMBOL_GPL(ip_route_output_flow);
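
/* Usage sketch (illustrative only): connected-socket output paths pass
 * their sk so the xfrm lookup above can apply per-socket policy:
 *
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */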
2788
2789 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2790                                       struct net_device *dev,
2791                                       struct net *net, __be32 *saddr,
2792                                       const struct ip_tunnel_info *info,
2793                                       u8 protocol, bool use_cache)
2794 {
2795 #ifdef CONFIG_DST_CACHE
2796         struct dst_cache *dst_cache;
2797 #endif
2798         struct rtable *rt = NULL;
2799         struct flowi4 fl4;
2800         __u8 tos;
2801
2802 #ifdef CONFIG_DST_CACHE
2803         dst_cache = (struct dst_cache *)&info->dst_cache;
2804         if (use_cache) {
2805                 rt = dst_cache_get_ip4(dst_cache, saddr);
2806                 if (rt)
2807                         return rt;
2808         }
2809 #endif
2810         memset(&fl4, 0, sizeof(fl4));
2811         fl4.flowi4_mark = skb->mark;
2812         fl4.flowi4_proto = protocol;
2813         fl4.daddr = info->key.u.ipv4.dst;
2814         fl4.saddr = info->key.u.ipv4.src;
2815         tos = info->key.tos;
2816         fl4.flowi4_tos = RT_TOS(tos);
2817
2818         rt = ip_route_output_key(net, &fl4);
2819         if (IS_ERR(rt)) {
2820                 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2821                 return ERR_PTR(-ENETUNREACH);
2822         }
2823         if (rt->dst.dev == dev) { /* is this necessary? */
2824                 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2825                 ip_rt_put(rt);
2826                 return ERR_PTR(-ELOOP);
2827         }
2828 #ifdef CONFIG_DST_CACHE
2829         if (use_cache)
2830                 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2831 #endif
2832         *saddr = fl4.saddr;
2833         return rt;
2834 }
2835 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
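
/* Usage sketch (illustrative only; skb, dev, net, info and use_cache are
 * the caller's own tunnel state): a tunnel transmit path resolves the
 * outer route from the tunnel key and learns the source address to use:
 *
 *	__be32 saddr;
 *	struct rtable *rt;
 *
 *	rt = ip_route_output_tunnel(skb, dev, net, &saddr, info,
 *				    IPPROTO_UDP, use_cache);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */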
2836
2837 /* called with rcu_read_lock held */
2838 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2839                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2840                         struct sk_buff *skb, u32 portid, u32 seq,
2841                         unsigned int flags)
2842 {
2843         struct rtmsg *r;
2844         struct nlmsghdr *nlh;
2845         unsigned long expires = 0;
2846         u32 error;
2847         u32 metrics[RTAX_MAX];
2848
2849         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2850         if (!nlh)
2851                 return -EMSGSIZE;
2852
2853         r = nlmsg_data(nlh);
2854         r->rtm_family    = AF_INET;
2855         r->rtm_dst_len  = 32;
2856         r->rtm_src_len  = 0;
2857         r->rtm_tos      = fl4 ? fl4->flowi4_tos : 0;
2858         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2859         if (nla_put_u32(skb, RTA_TABLE, table_id))
2860                 goto nla_put_failure;
2861         r->rtm_type     = rt->rt_type;
2862         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2863         r->rtm_protocol = RTPROT_UNSPEC;
2864         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2865         if (rt->rt_flags & RTCF_NOTIFY)
2866                 r->rtm_flags |= RTM_F_NOTIFY;
2867         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2868                 r->rtm_flags |= RTCF_DOREDIRECT;
2869
2870         if (nla_put_in_addr(skb, RTA_DST, dst))
2871                 goto nla_put_failure;
2872         if (src) {
2873                 r->rtm_src_len = 32;
2874                 if (nla_put_in_addr(skb, RTA_SRC, src))
2875                         goto nla_put_failure;
2876         }
2877         if (rt->dst.dev &&
2878             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2879                 goto nla_put_failure;
2880         if (rt->dst.lwtstate &&
2881             lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
2882                 goto nla_put_failure;
2883 #ifdef CONFIG_IP_ROUTE_CLASSID
2884         if (rt->dst.tclassid &&
2885             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2886                 goto nla_put_failure;
2887 #endif
2888         if (fl4 && !rt_is_input_route(rt) &&
2889             fl4->saddr != src) {
2890                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2891                         goto nla_put_failure;
2892         }
2893         if (rt->rt_uses_gateway) {
2894                 if (rt->rt_gw_family == AF_INET &&
2895                     nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2896                         goto nla_put_failure;
2897                 } else if (rt->rt_gw_family == AF_INET6) {
2898                         int alen = sizeof(struct in6_addr);
2899                         struct nlattr *nla;
2900                         struct rtvia *via;
2901
2902                         nla = nla_reserve(skb, RTA_VIA, alen + 2);
2903                         if (!nla)
2904                                 goto nla_put_failure;
2905
2906                         via = nla_data(nla);
2907                         via->rtvia_family = AF_INET6;
2908                         memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2909                 }
2910         }
2911
2912         expires = rt->dst.expires;
2913         if (expires) {
2914                 unsigned long now = jiffies;
2915
2916                 if (time_before(now, expires))
2917                         expires -= now;
2918                 else
2919                         expires = 0;
2920         }
2921
2922         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2923         if (rt->rt_pmtu && expires)
2924                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2925         if (rt->rt_mtu_locked && expires)
2926                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2927         if (rtnetlink_put_metrics(skb, metrics) < 0)
2928                 goto nla_put_failure;
2929
2930         if (fl4) {
2931                 if (fl4->flowi4_mark &&
2932                     nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2933                         goto nla_put_failure;
2934
2935                 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2936                     nla_put_u32(skb, RTA_UID,
2937                                 from_kuid_munged(current_user_ns(),
2938                                                  fl4->flowi4_uid)))
2939                         goto nla_put_failure;
2940
2941                 if (rt_is_input_route(rt)) {
2942 #ifdef CONFIG_IP_MROUTE
2943                         if (ipv4_is_multicast(dst) &&
2944                             !ipv4_is_local_multicast(dst) &&
2945                             IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2946                                 int err = ipmr_get_route(net, skb,
2947                                                          fl4->saddr, fl4->daddr,
2948                                                          r, portid);
2949
2950                                 if (err <= 0) {
2951                                         if (err == 0)
2952                                                 return 0;
2953                                         goto nla_put_failure;
2954                                 }
2955                         } else
2956 #endif
2957                                 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2958                                         goto nla_put_failure;
2959                 }
2960         }
2961
2962         error = rt->dst.error;
2963
2964         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2965                 goto nla_put_failure;
2966
2967         nlmsg_end(skb, nlh);
2968         return 0;
2969
2970 nla_put_failure:
2971         nlmsg_cancel(skb, nlh);
2972         return -EMSGSIZE;
2973 }
2974
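/* Dump one nexthop-exception hash table: emit an RTM_NEWROUTE message for
 * every live, non-expired exception.  Entries below @fa_start are skipped
 * so an interrupted netlink dump can resume where it left off.  Runs under
 * rcu_read_lock, taken by the caller.
 */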
2975 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2976                             struct netlink_callback *cb, u32 table_id,
2977                             struct fnhe_hash_bucket *bucket, int genid,
2978                             int *fa_index, int fa_start, unsigned int flags)
2979 {
2980         int i;
2981
2982         for (i = 0; i < FNHE_HASH_SIZE; i++) {
2983                 struct fib_nh_exception *fnhe;
2984
2985                 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2986                      fnhe = rcu_dereference(fnhe->fnhe_next)) {
2987                         struct rtable *rt;
2988                         int err;
2989
2990                         if (*fa_index < fa_start)
2991                                 goto next;
2992
2993                         if (fnhe->fnhe_genid != genid)
2994                                 goto next;
2995
2996                         if (fnhe->fnhe_expires &&
2997                             time_after(jiffies, fnhe->fnhe_expires))
2998                                 goto next;
2999
3000                         rt = rcu_dereference(fnhe->fnhe_rth_input);
3001                         if (!rt)
3002                                 rt = rcu_dereference(fnhe->fnhe_rth_output);
3003                         if (!rt)
3004                                 goto next;
3005
3006                         err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3007                                            table_id, NULL, skb,
3008                                            NETLINK_CB(cb->skb).portid,
3009                                            cb->nlh->nlmsg_seq, flags);
3010                         if (err)
3011                                 return err;
3012 next:
3013                         (*fa_index)++;
3014                 }
3015         }
3016
3017         return 0;
3018 }
3019
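/* Dump the PMTU/redirect exception entries hanging off each nexthop of
 * @fi, skipping dead nexthops.  Exceptions carrying a stale genid are
 * filtered out by fnhe_dump_bucket().
 */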
3020 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3021                        u32 table_id, struct fib_info *fi,
3022                        int *fa_index, int fa_start, unsigned int flags)
3023 {
3024         struct net *net = sock_net(cb->skb->sk);
3025         int nhsel, genid = fnhe_genid(net);
3026
3027         for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3028                 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3029                 struct fnhe_hash_bucket *bucket;
3030                 int err;
3031
3032                 if (nhc->nhc_flags & RTNH_F_DEAD)
3033                         continue;
3034
3035                 rcu_read_lock();
3036                 bucket = rcu_dereference(nhc->nhc_exceptions);
3037                 err = 0;
3038                 if (bucket)
3039                         err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3040                                                genid, fa_index, fa_start,
3041                                                flags);
3042                 rcu_read_unlock();
3043                 if (err)
3044                         return err;
3045         }
3046
3047         return 0;
3048 }
3049
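/* Build a dummy packet carrying the requested addresses, protocol and
 * ports so a route-get request can exercise the real input path,
 * including any multipath hashing that keys off the L4 header.
 */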
3050 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3051                                                    u8 ip_proto, __be16 sport,
3052                                                    __be16 dport)
3053 {
3054         struct sk_buff *skb;
3055         struct iphdr *iph;
3056
3057         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3058         if (!skb)
3059                 return NULL;
3060
3061         /* Reserve room for dummy headers; this skb may pass
3062          * through a good chunk of the routing engine.
3063          */
3064         skb_reset_mac_header(skb);
3065         skb_reset_network_header(skb);
3066         skb->protocol = htons(ETH_P_IP);
3067         iph = skb_put(skb, sizeof(struct iphdr));
3068         iph->protocol = ip_proto;
3069         iph->saddr = src;
3070         iph->daddr = dst;
3071         iph->version = 0x4;
3072         iph->frag_off = 0;
3073         iph->ihl = 0x5;
3074         skb_set_transport_header(skb, skb->len);
3075
3076         switch (iph->protocol) {
3077         case IPPROTO_UDP: {
3078                 struct udphdr *udph;
3079
3080                 udph = skb_put_zero(skb, sizeof(struct udphdr));
3081                 udph->source = sport;
3082                 udph->dest = dport;
3083                 udph->len = sizeof(struct udphdr);
3084                 udph->check = 0;
3085                 break;
3086         }
3087         case IPPROTO_TCP: {
3088                 struct tcphdr *tcph;
3089
3090                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3091                 tcph->source    = sport;
3092                 tcph->dest      = dport;
3093                 tcph->doff      = sizeof(struct tcphdr) / 4;
3094                 tcph->rst = 1;
3095                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3096                                             src, dst, 0);
3097                 break;
3098         }
3099         case IPPROTO_ICMP: {
3100                 struct icmphdr *icmph;
3101
3102                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3103                 icmph->type = ICMP_ECHO;
3104                 icmph->code = 0;
3105         }
3106         }
3107
3108         return skb;
3109 }
3110
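/* Validate an RTM_GETROUTE request.  For strict-checking sockets only the
 * header fields and attributes this handler actually uses are accepted;
 * legacy userspace falls through to the relaxed deprecated parser.
 */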
3111 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3112                                        const struct nlmsghdr *nlh,
3113                                        struct nlattr **tb,
3114                                        struct netlink_ext_ack *extack)
3115 {
3116         struct rtmsg *rtm;
3117         int i, err;
3118
3119         if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3120                 NL_SET_ERR_MSG(extack,
3121                                "ipv4: Invalid header for route get request");
3122                 return -EINVAL;
3123         }
3124
3125         if (!netlink_strict_get_check(skb))
3126                 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3127                                               rtm_ipv4_policy, extack);
3128
3129         rtm = nlmsg_data(nlh);
3130         if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3131             (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3132             rtm->rtm_table || rtm->rtm_protocol ||
3133             rtm->rtm_scope || rtm->rtm_type) {
3134                 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3135                 return -EINVAL;
3136         }
3137
3138         if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3139                                RTM_F_LOOKUP_TABLE |
3140                                RTM_F_FIB_MATCH)) {
3141                 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3142                 return -EINVAL;
3143         }
3144
3145         err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3146                                             rtm_ipv4_policy, extack);
3147         if (err)
3148                 return err;
3149
3150         if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3151             (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3152                 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3153                 return -EINVAL;
3154         }
3155
3156         for (i = 0; i <= RTA_MAX; i++) {
3157                 if (!tb[i])
3158                         continue;
3159
3160                 switch (i) {
3161                 case RTA_IIF:
3162                 case RTA_OIF:
3163                 case RTA_SRC:
3164                 case RTA_DST:
3165                 case RTA_IP_PROTO:
3166                 case RTA_SPORT:
3167                 case RTA_DPORT:
3168                 case RTA_MARK:
3169                 case RTA_UID:
3170                         break;
3171                 default:
3172                         NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3173                         return -EINVAL;
3174                 }
3175         }
3176
3177         return 0;
3178 }
3179
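/* Doit handler for RTM_GETROUTE: resolve one route and unicast the answer
 * back to the requester.  Reached from userspace by, for example (iproute2
 * syntax quoted from memory, so treat as illustrative):
 *
 *	ip route get 192.0.2.1 from 198.51.100.1 iif eth0 mark 7
 *
 * With RTA_IIF present the input path (ip_route_input_rcu) is exercised,
 * otherwise the output path (ip_route_output_key_hash_rcu).
 */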
3180 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3181                              struct netlink_ext_ack *extack)
3182 {
3183         struct net *net = sock_net(in_skb->sk);
3184         struct nlattr *tb[RTA_MAX+1];
3185         u32 table_id = RT_TABLE_MAIN;
3186         __be16 sport = 0, dport = 0;
3187         struct fib_result res = {};
3188         u8 ip_proto = IPPROTO_UDP;
3189         struct rtable *rt = NULL;
3190         struct sk_buff *skb;
3191         struct rtmsg *rtm;
3192         struct flowi4 fl4 = {};
3193         __be32 dst = 0;
3194         __be32 src = 0;
3195         kuid_t uid;
3196         u32 iif;
3197         int err;
3198         int mark;
3199
3200         err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3201         if (err < 0)
3202                 return err;
3203
3204         rtm = nlmsg_data(nlh);
3205         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3206         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3207         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3208         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3209         if (tb[RTA_UID])
3210                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3211         else
3212                 uid = (iif ? INVALID_UID : current_uid());
3213
3214         if (tb[RTA_IP_PROTO]) {
3215                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3216                                                   &ip_proto, AF_INET, extack);
3217                 if (err)
3218                         return err;
3219         }
3220
3221         if (tb[RTA_SPORT])
3222                 sport = nla_get_be16(tb[RTA_SPORT]);
3223
3224         if (tb[RTA_DPORT])
3225                 dport = nla_get_be16(tb[RTA_DPORT]);
3226
3227         skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3228         if (!skb)
3229                 return -ENOBUFS;
3230
3231         fl4.daddr = dst;
3232         fl4.saddr = src;
3233         fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3234         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3235         fl4.flowi4_mark = mark;
3236         fl4.flowi4_uid = uid;
3237         if (sport)
3238                 fl4.fl4_sport = sport;
3239         if (dport)
3240                 fl4.fl4_dport = dport;
3241         fl4.flowi4_proto = ip_proto;
3242
3243         rcu_read_lock();
3244
3245         if (iif) {
3246                 struct net_device *dev;
3247
3248                 dev = dev_get_by_index_rcu(net, iif);
3249                 if (!dev) {
3250                         err = -ENODEV;
3251                         goto errout_rcu;
3252                 }
3253
3254                 fl4.flowi4_iif = iif; /* for rt_fill_info */
3255                 skb->dev        = dev;
3256                 skb->mark       = mark;
3257                 err = ip_route_input_rcu(skb, dst, src,
3258                                          rtm->rtm_tos & IPTOS_RT_MASK, dev,
3259                                          &res);
3260
3261                 rt = skb_rtable(skb);
3262                 if (err == 0 && rt->dst.error)
3263                         err = -rt->dst.error;
3264         } else {
3265                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3266                 skb->dev = net->loopback_dev;
3267                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3268                 err = 0;
3269                 if (IS_ERR(rt))
3270                         err = PTR_ERR(rt);
3271                 else
3272                         skb_dst_set(skb, &rt->dst);
3273         }
3274
3275         if (err)
3276                 goto errout_rcu;
3277
3278         if (rtm->rtm_flags & RTM_F_NOTIFY)
3279                 rt->rt_flags |= RTCF_NOTIFY;
3280
3281         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3282                 table_id = res.table ? res.table->tb_id : 0;
3283
3284         /* reset skb for netlink reply msg */
3285         skb_trim(skb, 0);
3286         skb_reset_network_header(skb);
3287         skb_reset_transport_header(skb);
3288         skb_reset_mac_header(skb);
3289
3290         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3291                 struct fib_rt_info fri;
3292
3293                 if (!res.fi) {
3294                         err = fib_props[res.type].error;
3295                         if (!err)
3296                                 err = -EHOSTUNREACH;
3297                         goto errout_rcu;
3298                 }
3299                 fri.fi = res.fi;
3300                 fri.tb_id = table_id;
3301                 fri.dst = res.prefix;
3302                 fri.dst_len = res.prefixlen;
3303                 fri.tos = fl4.flowi4_tos;
3304                 fri.type = rt->rt_type;
3305                 fri.offload = 0;
3306                 fri.trap = 0;
3307                 fri.offload_failed = 0;
3308                 if (res.fa_head) {
3309                         struct fib_alias *fa;
3310
3311                         hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3312                                 u8 slen = 32 - fri.dst_len;
3313
3314                                 if (fa->fa_slen == slen &&
3315                                     fa->tb_id == fri.tb_id &&
3316                                     fa->fa_tos == fri.tos &&
3317                                     fa->fa_info == res.fi &&
3318                                     fa->fa_type == fri.type) {
3319                                         fri.offload = fa->offload;
3320                                         fri.trap = fa->trap;
3321                                         break;
3322                                 }
3323                         }
3324                 }
3325                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3326                                     nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3327         } else {
3328                 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3329                                    NETLINK_CB(in_skb).portid,
3330                                    nlh->nlmsg_seq, 0);
3331         }
3332         if (err < 0)
3333                 goto errout_rcu;
3334
3335         rcu_read_unlock();
3336
3337         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3338
3339 errout_free:
3340         return err;
3341 errout_rcu:
3342         rcu_read_unlock();
3343         kfree_skb(skb);
3344         goto errout_free;
3345 }
3346
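/* A multicast device event invalidates the whole route cache for the
 * device's netns.
 */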
3347 void ip_rt_multicast_event(struct in_device *in_dev)
3348 {
3349         rt_cache_flush(dev_net(in_dev->dev));
3350 }
3351
3352 #ifdef CONFIG_SYSCTL
3353 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3354 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
3355 static int ip_rt_gc_elasticity __read_mostly    = 8;
3356 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
3357
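/* Any write to the (write-only) "flush" sysctl invalidates the routing
 * cache and bumps the nexthop-exception genid for this netns, e.g.:
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 */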
3358 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3359                 void *buffer, size_t *lenp, loff_t *ppos)
3360 {
3361         struct net *net = (struct net *)__ctl->extra1;
3362
3363         if (write) {
3364                 rt_cache_flush(net);
3365                 fnhe_genid_bump(net);
3366                 return 0;
3367         }
3368
3369         return -EINVAL;
3370 }
3371
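/* Global tunables exported under /proc/sys/net/ipv4/route/. */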
3372 static struct ctl_table ipv4_route_table[] = {
3373         {
3374                 .procname       = "gc_thresh",
3375                 .data           = &ipv4_dst_ops.gc_thresh,
3376                 .maxlen         = sizeof(int),
3377                 .mode           = 0644,
3378                 .proc_handler   = proc_dointvec,
3379         },
3380         {
3381                 .procname       = "max_size",
3382                 .data           = &ip_rt_max_size,
3383                 .maxlen         = sizeof(int),
3384                 .mode           = 0644,
3385                 .proc_handler   = proc_dointvec,
3386         },
3387         {
3388                 /* Deprecated. Use gc_min_interval_ms */
3390                 .procname       = "gc_min_interval",
3391                 .data           = &ip_rt_gc_min_interval,
3392                 .maxlen         = sizeof(int),
3393                 .mode           = 0644,
3394                 .proc_handler   = proc_dointvec_jiffies,
3395         },
3396         {
3397                 .procname       = "gc_min_interval_ms",
3398                 .data           = &ip_rt_gc_min_interval,
3399                 .maxlen         = sizeof(int),
3400                 .mode           = 0644,
3401                 .proc_handler   = proc_dointvec_ms_jiffies,
3402         },
3403         {
3404                 .procname       = "gc_timeout",
3405                 .data           = &ip_rt_gc_timeout,
3406                 .maxlen         = sizeof(int),
3407                 .mode           = 0644,
3408                 .proc_handler   = proc_dointvec_jiffies,
3409         },
3410         {
3411                 .procname       = "gc_interval",
3412                 .data           = &ip_rt_gc_interval,
3413                 .maxlen         = sizeof(int),
3414                 .mode           = 0644,
3415                 .proc_handler   = proc_dointvec_jiffies,
3416         },
3417         {
3418                 .procname       = "redirect_load",
3419                 .data           = &ip_rt_redirect_load,
3420                 .maxlen         = sizeof(int),
3421                 .mode           = 0644,
3422                 .proc_handler   = proc_dointvec,
3423         },
3424         {
3425                 .procname       = "redirect_number",
3426                 .data           = &ip_rt_redirect_number,
3427                 .maxlen         = sizeof(int),
3428                 .mode           = 0644,
3429                 .proc_handler   = proc_dointvec,
3430         },
3431         {
3432                 .procname       = "redirect_silence",
3433                 .data           = &ip_rt_redirect_silence,
3434                 .maxlen         = sizeof(int),
3435                 .mode           = 0644,
3436                 .proc_handler   = proc_dointvec,
3437         },
3438         {
3439                 .procname       = "error_cost",
3440                 .data           = &ip_rt_error_cost,
3441                 .maxlen         = sizeof(int),
3442                 .mode           = 0644,
3443                 .proc_handler   = proc_dointvec,
3444         },
3445         {
3446                 .procname       = "error_burst",
3447                 .data           = &ip_rt_error_burst,
3448                 .maxlen         = sizeof(int),
3449                 .mode           = 0644,
3450                 .proc_handler   = proc_dointvec,
3451         },
3452         {
3453                 .procname       = "gc_elasticity",
3454                 .data           = &ip_rt_gc_elasticity,
3455                 .maxlen         = sizeof(int),
3456                 .mode           = 0644,
3457                 .proc_handler   = proc_dointvec,
3458         },
3459         {
3460                 .procname       = "mtu_expires",
3461                 .data           = &ip_rt_mtu_expires,
3462                 .maxlen         = sizeof(int),
3463                 .mode           = 0644,
3464                 .proc_handler   = proc_dointvec_jiffies,
3465         },
3466         {
3467                 .procname       = "min_pmtu",
3468                 .data           = &ip_rt_min_pmtu,
3469                 .maxlen         = sizeof(int),
3470                 .mode           = 0644,
3471                 .proc_handler   = proc_dointvec_minmax,
3472                 .extra1         = &ip_min_valid_pmtu,
3473         },
3474         {
3475                 .procname       = "min_adv_mss",
3476                 .data           = &ip_rt_min_advmss,
3477                 .maxlen         = sizeof(int),
3478                 .mode           = 0644,
3479                 .proc_handler   = proc_dointvec,
3480         },
3481         { }
3482 };
3483
3484 static const char ipv4_route_flush_procname[] = "flush";
3485
3486 static struct ctl_table ipv4_route_flush_table[] = {
3487         {
3488                 .procname       = ipv4_route_flush_procname,
3489                 .maxlen         = sizeof(int),
3490                 .mode           = 0200,
3491                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3492         },
3493         { },
3494 };
3495
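/* Register the per-netns "flush" sysctl.  Every netns except init_net
 * gets its own copy of the table so that ->extra1 can point the handler
 * back at the owning netns.
 */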
3496 static __net_init int sysctl_route_net_init(struct net *net)
3497 {
3498         struct ctl_table *tbl;
3499
3500         tbl = ipv4_route_flush_table;
3501         if (!net_eq(net, &init_net)) {
3502                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3503                 if (!tbl)
3504                         goto err_dup;
3505
3506                 /* Don't export non-whitelisted sysctls to unprivileged users */
3507                 if (net->user_ns != &init_user_ns) {
3508                         if (tbl[0].procname != ipv4_route_flush_procname)
3509                                 tbl[0].procname = NULL;
3510                 }
3511         }
3512         tbl[0].extra1 = net;
3513
3514         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3515         if (!net->ipv4.route_hdr)
3516                 goto err_reg;
3517         return 0;
3518
3519 err_reg:
3520         if (tbl != ipv4_route_flush_table)
3521                 kfree(tbl);
3522 err_dup:
3523         return -ENOMEM;
3524 }
3525
3526 static __net_exit void sysctl_route_net_exit(struct net *net)
3527 {
3528         struct ctl_table *tbl;
3529
3530         tbl = net->ipv4.route_hdr->ctl_table_arg;
3531         unregister_net_sysctl_table(net->ipv4.route_hdr);
3532         BUG_ON(tbl == ipv4_route_flush_table);
3533         kfree(tbl);
3534 }
3535
3536 static __net_initdata struct pernet_operations sysctl_route_ops = {
3537         .init = sysctl_route_net_init,
3538         .exit = sysctl_route_net_exit,
3539 };
3540 #endif
3541
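/* Seed the per-netns generation counters; bumping rt_genid later
 * invalidates every cached dst in the namespace at once.
 */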
3542 static __net_init int rt_genid_init(struct net *net)
3543 {
3544         atomic_set(&net->ipv4.rt_genid, 0);
3545         atomic_set(&net->fnhe_genid, 0);
3546         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3547         return 0;
3548 }
3549
3550 static __net_initdata struct pernet_operations rt_genid_ops = {
3551         .init = rt_genid_init,
3552 };
3553
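/* Per-netns inetpeer base, used among other things for ICMP rate
 * limiting state.
 */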
3554 static int __net_init ipv4_inetpeer_init(struct net *net)
3555 {
3556         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3557
3558         if (!bp)
3559                 return -ENOMEM;
3560         inet_peer_base_init(bp);
3561         net->ipv4.peers = bp;
3562         return 0;
3563 }
3564
3565 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3566 {
3567         struct inet_peer_base *bp = net->ipv4.peers;
3568
3569         net->ipv4.peers = NULL;
3570         inetpeer_invalidate_tree(bp);
3571         kfree(bp);
3572 }
3573
3574 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3575         .init   =       ipv4_inetpeer_init,
3576         .exit   =       ipv4_inetpeer_exit,
3577 };
3578
3579 #ifdef CONFIG_IP_ROUTE_CLASSID
3580 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3581 #endif /* CONFIG_IP_ROUTE_CLASSID */
3582
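/* Boot-time setup: allocate the IP ID arrays and the dst slab caches,
 * register the RTM_GETROUTE handler and the per-netns sysctl, genid and
 * inetpeer subsystems.  Runs once from the inet init path.
 */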
3583 int __init ip_rt_init(void)
3584 {
3585         int cpu;
3586
3587         ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3588                                   GFP_KERNEL);
3589         if (!ip_idents)
3590                 panic("IP: failed to allocate ip_idents\n");
3591
3592         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3593
3594         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3595         if (!ip_tstamps)
3596                 panic("IP: failed to allocate ip_tstamps\n");
3597
3598         for_each_possible_cpu(cpu) {
3599                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3600
3601                 INIT_LIST_HEAD(&ul->head);
3602                 spin_lock_init(&ul->lock);
3603         }
3604 #ifdef CONFIG_IP_ROUTE_CLASSID
3605         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3606         if (!ip_rt_acct)
3607                 panic("IP: failed to allocate ip_rt_acct\n");
3608 #endif
3609
3610         ipv4_dst_ops.kmem_cachep =
3611                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3612                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3613
3614         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3615
3616         if (dst_entries_init(&ipv4_dst_ops) < 0)
3617                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3618
3619         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3620                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3621
3622         ipv4_dst_ops.gc_thresh = ~0;
3623         ip_rt_max_size = INT_MAX;
3624
3625         devinet_init();
3626         ip_fib_init();
3627
3628         if (ip_rt_proc_init())
3629                 pr_err("Unable to create route proc files\n");
3630 #ifdef CONFIG_XFRM
3631         xfrm_init();
3632         xfrm4_init();
3633 #endif
3634         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3635                       RTNL_FLAG_DOIT_UNLOCKED);
3636
3637 #ifdef CONFIG_SYSCTL
3638         register_pernet_subsys(&sysctl_route_ops);
3639 #endif
3640         register_pernet_subsys(&rt_genid_ops);
3641         register_pernet_subsys(&ipv4_inetpeer_ops);
3642         return 0;
3643 }
3644
3645 #ifdef CONFIG_SYSCTL
3646 /*
3647  * We really need to sanitize the damn ipv4 init order, then all
3648  * this nonsense will go away.
3649  */
3650 void __init ip_static_sysctl_init(void)
3651 {
3652         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3653 }
3654 #endif