// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD,
 *                                      though our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

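/* Defaults for the tunables exported via sysctl (/proc/sys/net/ipv4/route/):
 * rate limits for sending ICMP redirects and errors, the lifetime of
 * learned PMTU values, and bounds for PMTU and advertised MSS.
 */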
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu,
                                           bool confirm_neigh);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);

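/* IPv4 routes keep their metrics in the FIB and never copy them on
 * write, so the generic dst code must never ask us to COW metrics.
 */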
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

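/* Map the RFC 1349 TOS nibble to a packet scheduler priority band; the
 * table is indexed with tos >> 1 (see rt_tos2priority()), and
 * ECN_OR_COST() covers the low bit that may be ECN or "minimize cost".
 */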
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
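/* /proc/net/rt_cache prints only its header line these days: the IPv4
 * routing cache is gone, but the file is kept so legacy tools keep working.
 */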
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct proc_ops rt_cache_proc_ops = {
        .proc_open      = rt_cache_seq_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
};


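/* /proc/net/stat/rt_cache: one line of counters per possible CPU;
 * *pos is kept as (cpu index + 1) so that 0 can serve as the header token.
 */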
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        (*pos)++;
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct proc_ops rt_cpu_proc_ops = {
        .proc_open      = rt_cpu_seq_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
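/* /proc/net/rt_acct: fold the per-CPU, per-realm (256 entries) byte and
 * packet counters into a single binary snapshot for userspace.
 */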
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", 0444, net->proc_net,
                          &rt_cache_proc_ops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", 0444,
                          net->proc_net_stat, &rt_cpu_proc_ops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create_single("rt_acct", 0, net->proc_net,
                        rt_acct_proc_show);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        struct neighbour *n;

        rcu_read_lock_bh();

        if (likely(rt->rt_gw_family == AF_INET)) {
                n = ip_neigh_gw4(dev, rt->rt_gw4);
        } else if (rt->rt_gw_family == AF_INET6) {
                n = ip_neigh_gw6(dev, &rt->rt_gw6);
        } else {
                __be32 pkey;

                pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
                n = ip_neigh_gw4(dev, pkey);
        }

        if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
                n = NULL;

        rcu_read_unlock_bh();

        return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;

        if (rt->rt_gw_family == AF_INET) {
                pkey = (const __be32 *)&rt->rt_gw4;
        } else if (rt->rt_gw_family == AF_INET6) {
                return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
        } else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
                return;
        }
        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = READ_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 new, delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
        do {
                old = (u32)atomic_read(p_id);
                new = old + delta + segs;
        } while (atomic_cmpxchg(p_id, old, new) != old);

        return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash, id;

        /* The ip_id_key initialization below is racy, but that is okay:
         * concurrent initializers each write a random key and whichever
         * store wins is as good as any other.
         */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4.ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

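/* Build a flow key from an IP header; when a socket is given, its bound
 * device, mark, TOS and protocol override the per-packet values.
 */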
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_uses_gateway = 1;
                rt->rt_gw_family = AF_INET;
                rt->rt_gw4 = fnhe->fnhe_gw;
        }
}

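/* Record or refresh a per-destination nexthop exception (learned from an
 * ICMP redirect or a PMTU update) and kick any routes cached on the
 * nexthop so they re-check whether the exception applies to them.
 */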
static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
                                  __be32 gw, u32 pmtu, bool lock,
                                  unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nhc->nhc_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nhc->nhc_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nhc->nhc_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nhc->nhc_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

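/* Handle an ICMP redirect: sanity-check the advertised gateway, make sure
 * it is reachable on-link, and record it as a nexthop exception so later
 * lookups use the new gateway.
 */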
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh_common *nhc = FIB_RES_NHC(res);

                                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;       /* empty statement keeps the label valid when CONFIG_IP_ROUTE_VERBOSE is off */
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and we can start sending
 *         redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
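/* Worked example with HZ=1000 and the defaults above: after the n-th
 * redirect is sent, the next one is allowed only once
 * ip_rt_redirect_load << n has elapsed (40ms, 80ms, 160ms, ...).
 * Once ip_rt_redirect_number (9) redirects have been ignored we stay
 * silent until ip_rt_redirect_silence (20ms << 10, roughly 20s) passes
 * without a triggering packet, which resets the counters.
 */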

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->n_redirects)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->n_redirects == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

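/* Turn a dst error on an input route into the matching ICMP destination
 * unreachable message, token-bucket rate limited per source via inet_peer.
 */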
static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

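/* Learn a reduced PMTU for fl4->daddr as a nexthop exception; values
 * below ip_rt_min_pmtu are clamped and the entry is marked locked so the
 * MTU is not lowered any further.
 */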
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        u32 old_mtu = ipv4_mtu(dst);
        struct fib_result res;
        bool lock = false;

        if (ip_mtu_locked(dst))
                return;

        if (old_mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh_common *nhc = FIB_RES_NHC(res);

                update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu,
                              bool confirm_neigh)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        u32 mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK, which forces all validation calls
         * down into this function.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD.
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        int res;

        /* Recompile ip options since IPCB may not be valid anymore.
         * Also check we have a reasonable ipv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        if (ip_hdr(skb)->ihl > 5) {
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct iphdr *iph = ip_hdr(skb);
                struct flowi4 fl4 = {
                        .daddr = iph->daddr,
                        .saddr = iph->saddr,
                        .flowi4_tos = RT_TOS(iph->tos),
                        .flowi4_oif = rt->dst.dev->ifindex,
                        .flowi4_iif = skb->dev->ifindex,
                        .flowi4_mark = skb->mark,
                };

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

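/* Effective MTU for this route: a still-valid learned PMTU if any, else
 * the RTAX_MTU metric, else the device MTU; any lwtunnel encap headroom
 * is subtracted at the end.
 */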
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

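/* Unlink an exception from its bucket and free it (RCU-deferred),
 * dropping any routes still cached on it.
 */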
static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nhc->nhc_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        hash += hval;

        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
                                               __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr) {
                        if (fnhe->fnhe_expires &&
                            time_after(jiffies, fnhe->fnhe_expires)) {
                                ip_del_fnhe(nhc, daddr);
                                break;
                        }
                        return fnhe;
                }
        }
        return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
        struct fib_nh_common *nhc = res->nhc;
        struct net_device *dev = nhc->nhc_dev;
        struct fib_info *fi = res->fi;
        u32 mtu = 0;

        if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
            fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
                mtu = fi->fib_mtu;

        if (likely(!mtu)) {
                struct fib_nh_exception *fnhe;

                fnhe = find_exception(nhc, daddr);
                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
                        mtu = fnhe->fnhe_pmtu;
        }

        if (likely(!mtu))
                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

        return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

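/* Copy the exception's PMTU/gateway state into rt and, if caching is
 * requested, park rt in the exception's input or output slot; all under
 * fnhe_lock to serialize with exception updates and removal.
 */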
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe->fnhe_mtu_locked = false;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gw4) {
                        rt->rt_gw4 = daddr;
                        rt->rt_gw_family = AF_INET;
                }

                if (do_cache) {
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

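/* Install rt as the cached route on the nexthop: the per-CPU slot for
 * output routes, the single shared slot for input routes. cmpxchg()
 * resolves races with concurrent cachers; on loss the route stays
 * uncached.
 */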
1469 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1470 {
1471         struct rtable *orig, *prev, **p;
1472         bool ret = true;
1473
1474         if (rt_is_input_route(rt)) {
1475                 p = (struct rtable **)&nhc->nhc_rth_input;
1476         } else {
1477                 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1478         }
1479         orig = *p;
1480
1481         /* hold dst before doing cmpxchg() to avoid race condition
1482          * on this dst
1483          */
1484         dst_hold(&rt->dst);
1485         prev = cmpxchg(p, orig, rt);
1486         if (prev == orig) {
1487                 if (orig) {
1488                         rt_add_uncached_list(orig);
1489                         dst_release(&orig->dst);
1490                 }
1491         } else {
1492                 dst_release(&rt->dst);
1493                 ret = false;
1494         }
1495
1496         return ret;
1497 }
1498
1499 struct uncached_list {
1500         spinlock_t              lock;
1501         struct list_head        head;
1502 };
1503
1504 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1505
1506 void rt_add_uncached_list(struct rtable *rt)
1507 {
1508         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1509
1510         rt->rt_uncached_list = ul;
1511
1512         spin_lock_bh(&ul->lock);
1513         list_add_tail(&rt->rt_uncached, &ul->head);
1514         spin_unlock_bh(&ul->lock);
1515 }
1516
1517 void rt_del_uncached_list(struct rtable *rt)
1518 {
1519         if (!list_empty(&rt->rt_uncached)) {
1520                 struct uncached_list *ul = rt->rt_uncached_list;
1521
1522                 spin_lock_bh(&ul->lock);
1523                 list_del(&rt->rt_uncached);
1524                 spin_unlock_bh(&ul->lock);
1525         }
1526 }
1527
1528 static void ipv4_dst_destroy(struct dst_entry *dst)
1529 {
1530         struct rtable *rt = (struct rtable *)dst;
1531
1532         ip_dst_metrics_put(dst);
1533         rt_del_uncached_list(rt);
1534 }
1535
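/* Called when @dev is going away: re-point every uncached route that
 * still references it at blackhole_netdev, adjusting the device
 * refcounts, so that the device can be unregistered safely.
 */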
1536 void rt_flush_dev(struct net_device *dev)
1537 {
1538         struct rtable *rt;
1539         int cpu;
1540
1541         for_each_possible_cpu(cpu) {
1542                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1543
1544                 spin_lock_bh(&ul->lock);
1545                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1546                         if (rt->dst.dev != dev)
1547                                 continue;
1548                         rt->dst.dev = blackhole_netdev;
1549                         dev_hold(rt->dst.dev);
1550                         dev_put(dev);
1551                 }
1552                 spin_unlock_bh(&ul->lock);
1553         }
1554 }
1555
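/* A cached route is usable only while it carries the
 * DST_OBSOLETE_FORCE_CHK marker and its generation id is still current.
 */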
1556 static bool rt_cache_valid(const struct rtable *rt)
1557 {
1558         return  rt &&
1559                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1560                 !rt_is_expired(rt);
1561 }
1562
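/* Fill in the nexthop-derived fields of @rt (gateway, metrics, classid,
 * lwtunnel state) and, when caching is requested, bind the route to the
 * nexthop exception or the nexthop cache; routes that end up uncached
 * are added to the uncached list instead.
 */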
1563 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1564                            const struct fib_result *res,
1565                            struct fib_nh_exception *fnhe,
1566                            struct fib_info *fi, u16 type, u32 itag,
1567                            const bool do_cache)
1568 {
1569         bool cached = false;
1570
1571         if (fi) {
1572                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1573
1574                 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1575                         rt->rt_uses_gateway = 1;
1576                         rt->rt_gw_family = nhc->nhc_gw_family;
1577                         /* only INET and INET6 are supported */
1578                         if (likely(nhc->nhc_gw_family == AF_INET))
1579                                 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1580                         else
1581                                 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1582                 }
1583
1584                 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1585
1586 #ifdef CONFIG_IP_ROUTE_CLASSID
1587                 if (nhc->nhc_family == AF_INET) {
1588                         struct fib_nh *nh;
1589
1590                         nh = container_of(nhc, struct fib_nh, nh_common);
1591                         rt->dst.tclassid = nh->nh_tclassid;
1592                 }
1593 #endif
1594                 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1595                 if (unlikely(fnhe))
1596                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1597                 else if (do_cache)
1598                         cached = rt_cache_route(nhc, rt);
1599                 if (unlikely(!cached)) {
			/* Routes we intend to cache in the nexthop exception
			 * or FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache, we really need to set it.
			 */
1605                         if (!rt->rt_gw4) {
1606                                 rt->rt_gw_family = AF_INET;
1607                                 rt->rt_gw4 = daddr;
1608                         }
1609                         rt_add_uncached_list(rt);
1610                 }
1611         } else
1612                 rt_add_uncached_list(rt);
1613
1614 #ifdef CONFIG_IP_ROUTE_CLASSID
1615 #ifdef CONFIG_IP_MULTIPLE_TABLES
1616         set_class_tag(rt, res->tclassid);
1617 #endif
1618         set_class_tag(rt, itag);
1619 #endif
1620 }
1621
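/* Allocate and minimally initialize an IPv4 rtable; the caller is
 * expected to fill in the remaining routing-specific fields.
 */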
1622 struct rtable *rt_dst_alloc(struct net_device *dev,
1623                             unsigned int flags, u16 type,
1624                             bool nopolicy, bool noxfrm, bool will_cache)
1625 {
1626         struct rtable *rt;
1627
1628         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1629                        (will_cache ? 0 : DST_HOST) |
1630                        (nopolicy ? DST_NOPOLICY : 0) |
1631                        (noxfrm ? DST_NOXFRM : 0));
1632
1633         if (rt) {
1634                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1635                 rt->rt_flags = flags;
1636                 rt->rt_type = type;
1637                 rt->rt_is_input = 0;
1638                 rt->rt_iif = 0;
1639                 rt->rt_pmtu = 0;
1640                 rt->rt_mtu_locked = 0;
1641                 rt->rt_uses_gateway = 0;
1642                 rt->rt_gw_family = 0;
1643                 rt->rt_gw4 = 0;
1644                 INIT_LIST_HEAD(&rt->rt_uncached);
1645
1646                 rt->dst.output = ip_output;
1647                 if (flags & RTCF_LOCAL)
1648                         rt->dst.input = ip_local_deliver;
1649         }
1650
1651         return rt;
1652 }
1653 EXPORT_SYMBOL(rt_dst_alloc);
1654
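/* Duplicate @rt onto @dev, copying the routing fields and taking a new
 * reference on the lwtunnel state, for callers that need their own copy
 * of an existing route.
 */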
1655 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1656 {
1657         struct rtable *new_rt;
1658
1659         new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1660                            rt->dst.flags);
1661
1662         if (new_rt) {
1663                 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1664                 new_rt->rt_flags = rt->rt_flags;
1665                 new_rt->rt_type = rt->rt_type;
1666                 new_rt->rt_is_input = rt->rt_is_input;
1667                 new_rt->rt_iif = rt->rt_iif;
1668                 new_rt->rt_pmtu = rt->rt_pmtu;
1669                 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1670                 new_rt->rt_gw_family = rt->rt_gw_family;
1671                 if (rt->rt_gw_family == AF_INET)
1672                         new_rt->rt_gw4 = rt->rt_gw4;
1673                 else if (rt->rt_gw_family == AF_INET6)
1674                         new_rt->rt_gw6 = rt->rt_gw6;
1675                 INIT_LIST_HEAD(&new_rt->rt_uncached);
1676
1677                 new_rt->dst.flags |= DST_HOST;
1678                 new_rt->dst.input = rt->dst.input;
1679                 new_rt->dst.output = rt->dst.output;
1680                 new_rt->dst.error = rt->dst.error;
1681                 new_rt->dst.lastuse = jiffies;
1682                 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1683         }
1684         return new_rt;
1685 }
1686 EXPORT_SYMBOL(rt_dst_clone);
1687
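/* Sanity-check the source address of a multicast packet received on
 * @dev; returns 0 if it is acceptable, a negative errno otherwise.
 */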
1688 /* called in rcu_read_lock() section */
1689 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1690                           u8 tos, struct net_device *dev,
1691                           struct in_device *in_dev, u32 *itag)
1692 {
1693         int err;
1694
1695         /* Primary sanity checks. */
1696         if (!in_dev)
1697                 return -EINVAL;
1698
1699         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1700             skb->protocol != htons(ETH_P_IP))
1701                 return -EINVAL;
1702
1703         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1704                 return -EINVAL;
1705
1706         if (ipv4_is_zeronet(saddr)) {
1707                 if (!ipv4_is_local_multicast(daddr) &&
1708                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1709                         return -EINVAL;
1710         } else {
1711                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1712                                           in_dev, itag);
1713                 if (err < 0)
1714                         return err;
1715         }
1716         return 0;
1717 }
1718
1719 /* called in rcu_read_lock() section */
1720 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1721                              u8 tos, struct net_device *dev, int our)
1722 {
1723         struct in_device *in_dev = __in_dev_get_rcu(dev);
1724         unsigned int flags = RTCF_MULTICAST;
1725         struct rtable *rth;
1726         u32 itag = 0;
1727         int err;
1728
1729         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1730         if (err)
1731                 return err;
1732
1733         if (our)
1734                 flags |= RTCF_LOCAL;
1735
1736         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1737                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1738         if (!rth)
1739                 return -ENOBUFS;
1740
1741 #ifdef CONFIG_IP_ROUTE_CLASSID
1742         rth->dst.tclassid = itag;
1743 #endif
1744         rth->dst.output = ip_rt_bug;
	rth->rt_is_input = 1;
1746
1747 #ifdef CONFIG_IP_MROUTE
1748         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1749                 rth->dst.input = ip_mr_input;
1750 #endif
1751         RT_CACHE_STAT_INC(in_slow_mc);
1752
1753         skb_dst_set(skb, &rth->dst);
1754         return 0;
1755 }
1756
1758 static void ip_handle_martian_source(struct net_device *dev,
1759                                      struct in_device *in_dev,
1760                                      struct sk_buff *skb,
1761                                      __be32 daddr,
1762                                      __be32 saddr)
1763 {
1764         RT_CACHE_STAT_INC(in_martian_src);
1765 #ifdef CONFIG_IP_ROUTE_VERBOSE
1766         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
1771                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1772                         &daddr, &saddr, dev->name);
1773                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1774                         print_hex_dump(KERN_WARNING, "ll header: ",
1775                                        DUMP_PREFIX_OFFSET, 16, 1,
1776                                        skb_mac_header(skb),
1777                                        dev->hard_header_len, false);
1778                 }
1779         }
1780 #endif
1781 }
1782
1783 /* called in rcu_read_lock() section */
1784 static int __mkroute_input(struct sk_buff *skb,
1785                            const struct fib_result *res,
1786                            struct in_device *in_dev,
1787                            __be32 daddr, __be32 saddr, u32 tos)
1788 {
1789         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1790         struct net_device *dev = nhc->nhc_dev;
1791         struct fib_nh_exception *fnhe;
1792         struct rtable *rth;
1793         int err;
1794         struct in_device *out_dev;
1795         bool do_cache;
1796         u32 itag = 0;
1797
1798         /* get a working reference to the output device */
1799         out_dev = __in_dev_get_rcu(dev);
1800         if (!out_dev) {
1801                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1802                 return -EINVAL;
1803         }
1804
1805         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1806                                   in_dev->dev, in_dev, &itag);
1807         if (err < 0) {
1808                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1809                                          saddr);
1810
1811                 goto cleanup;
1812         }
1813
1814         do_cache = res->fi && !itag;
1815         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1816             skb->protocol == htons(ETH_P_IP)) {
1817                 __be32 gw;
1818
1819                 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1820                 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1821                     inet_addr_onlink(out_dev, saddr, gw))
1822                         IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1823         }
1824
1825         if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
1833                 if (out_dev == in_dev &&
1834                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1835                         err = -EINVAL;
1836                         goto cleanup;
1837                 }
1838         }
1839
1840         fnhe = find_exception(nhc, daddr);
1841         if (do_cache) {
1842                 if (fnhe)
1843                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1844                 else
1845                         rth = rcu_dereference(nhc->nhc_rth_input);
1846                 if (rt_cache_valid(rth)) {
1847                         skb_dst_set_noref(skb, &rth->dst);
1848                         goto out;
1849                 }
1850         }
1851
1852         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1853                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1854                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1855         if (!rth) {
1856                 err = -ENOBUFS;
1857                 goto cleanup;
1858         }
1859
1860         rth->rt_is_input = 1;
1861         RT_CACHE_STAT_INC(in_slow_tot);
1862
1863         rth->dst.input = ip_forward;
1864
1865         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1866                        do_cache);
1867         lwtunnel_set_redirect(&rth->dst);
1868         skb_dst_set(skb, &rth->dst);
1869 out:
1870         err = 0;
1871  cleanup:
1872         return err;
1873 }
1874
1875 #ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP error packets follow the same path as the flow that
 * triggered them, the multipath hash is calculated from the inner IP
 * addresses.
 */
1879 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1880                                  struct flow_keys *hash_keys)
1881 {
1882         const struct iphdr *outer_iph = ip_hdr(skb);
1883         const struct iphdr *key_iph = outer_iph;
1884         const struct iphdr *inner_iph;
1885         const struct icmphdr *icmph;
1886         struct iphdr _inner_iph;
1887         struct icmphdr _icmph;
1888
1889         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1890                 goto out;
1891
1892         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1893                 goto out;
1894
1895         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1896                                    &_icmph);
1897         if (!icmph)
1898                 goto out;
1899
1900         if (!icmp_is_err(icmph->type))
1901                 goto out;
1902
1903         inner_iph = skb_header_pointer(skb,
1904                                        outer_iph->ihl * 4 + sizeof(_icmph),
1905                                        sizeof(_inner_iph), &_inner_iph);
1906         if (!inner_iph)
1907                 goto out;
1908
1909         key_iph = inner_iph;
1910 out:
1911         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1912         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1913 }
1914
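/* A short summary of the fib_multipath_hash_policy sysctl values handled
 * by the switch below (derived from its cases): 0 hashes on the L3 source
 * and destination addresses, 1 hashes on the L4 five-tuple, and 2 hashes
 * on the inner L3 addresses of encapsulated packets, falling back to the
 * outer header when dissection finds none.
 */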
/* If skb is set it will be used and fl4 can be NULL. */
1916 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1917                        const struct sk_buff *skb, struct flow_keys *flkeys)
1918 {
1919         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1920         struct flow_keys hash_keys;
1921         u32 mhash;
1922
1923         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1924         case 0:
1925                 memset(&hash_keys, 0, sizeof(hash_keys));
1926                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1927                 if (skb) {
1928                         ip_multipath_l3_keys(skb, &hash_keys);
1929                 } else {
1930                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1931                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1932                 }
1933                 break;
1934         case 1:
1935                 /* skb is currently provided only when forwarding */
1936                 if (skb) {
1937                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1938                         struct flow_keys keys;
1939
1940                         /* short-circuit if we already have L4 hash present */
1941                         if (skb->l4_hash)
1942                                 return skb_get_hash_raw(skb) >> 1;
1943
1944                         memset(&hash_keys, 0, sizeof(hash_keys));
1945
1946                         if (!flkeys) {
1947                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1948                                 flkeys = &keys;
1949                         }
1950
1951                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1952                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1953                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1954                         hash_keys.ports.src = flkeys->ports.src;
1955                         hash_keys.ports.dst = flkeys->ports.dst;
1956                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1957                 } else {
1958                         memset(&hash_keys, 0, sizeof(hash_keys));
1959                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1960                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1961                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1962                         hash_keys.ports.src = fl4->fl4_sport;
1963                         hash_keys.ports.dst = fl4->fl4_dport;
1964                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1965                 }
1966                 break;
1967         case 2:
1968                 memset(&hash_keys, 0, sizeof(hash_keys));
1969                 /* skb is currently provided only when forwarding */
1970                 if (skb) {
1971                         struct flow_keys keys;
1972
1973                         skb_flow_dissect_flow_keys(skb, &keys, 0);
1974                         /* Inner can be v4 or v6 */
1975                         if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1976                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1977                                 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1978                                 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1979                         } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1980                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1981                                 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1982                                 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1983                                 hash_keys.tags.flow_label = keys.tags.flow_label;
1984                                 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1985                         } else {
1986                                 /* Same as case 0 */
1987                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1988                                 ip_multipath_l3_keys(skb, &hash_keys);
1989                         }
1990                 } else {
1991                         /* Same as case 0 */
1992                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1993                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1994                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1995                 }
1996                 break;
1997         }
1998         mhash = flow_hash_from_keys(&hash_keys);
1999
2000         if (multipath_hash)
2001                 mhash = jhash_2words(mhash, multipath_hash, 0);
2002
2003         return mhash >> 1;
2004 }
2005 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2006
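/* Select the nexthop (hashing across paths if multipath is configured)
 * and create the input route for this packet.
 */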
2007 static int ip_mkroute_input(struct sk_buff *skb,
2008                             struct fib_result *res,
2009                             struct in_device *in_dev,
2010                             __be32 daddr, __be32 saddr, u32 tos,
2011                             struct flow_keys *hkeys)
2012 {
2013 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2014         if (res->fi && fib_info_num_path(res->fi) > 1) {
2015                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2016
2017                 fib_select_multipath(res, h);
2018         }
2019 #endif
2020
2021         /* create a routing cache entry */
2022         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2023 }
2024
/* Implements the same saddr-related checks as ip_route_input_slow(),
 * assuming daddr is valid and the destination is not a local broadcast
 * address. Uses the provided hint instead of performing a route lookup.
 */
2029 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2030                       u8 tos, struct net_device *dev,
2031                       const struct sk_buff *hint)
2032 {
2033         struct in_device *in_dev = __in_dev_get_rcu(dev);
2034         struct rtable *rt = (struct rtable *)hint;
2035         struct net *net = dev_net(dev);
2036         int err = -EINVAL;
2037         u32 tag = 0;
2038
2039         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2040                 goto martian_source;
2041
2042         if (ipv4_is_zeronet(saddr))
2043                 goto martian_source;
2044
2045         if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2046                 goto martian_source;
2047
2048         if (rt->rt_type != RTN_LOCAL)
2049                 goto skip_validate_source;
2050
2051         tos &= IPTOS_RT_MASK;
2052         err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2053         if (err < 0)
2054                 goto martian_source;
2055
2056 skip_validate_source:
2057         skb_dst_copy(skb, hint);
2058         return 0;
2059
2060 martian_source:
2061         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2062         return err;
2063 }
2064
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the
 *	output routine. Changes in the enforced policies must also
 *	be applied to ip_route_use_hint().
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *	called with rcu_read_lock()
 */
2077
2078 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2079                                u8 tos, struct net_device *dev,
2080                                struct fib_result *res)
2081 {
2082         struct in_device *in_dev = __in_dev_get_rcu(dev);
2083         struct flow_keys *flkeys = NULL, _flkeys;
2084         struct net    *net = dev_net(dev);
2085         struct ip_tunnel_info *tun_info;
2086         int             err = -EINVAL;
2087         unsigned int    flags = 0;
2088         u32             itag = 0;
2089         struct rtable   *rth;
2090         struct flowi4   fl4;
2091         bool do_cache = true;
2092
2093         /* IP on this device is disabled. */
2094
2095         if (!in_dev)
2096                 goto out;
2097
	/* Check for the most weird martians, which cannot be detected
	 * by fib_lookup.
	 */
2101
2102         tun_info = skb_tunnel_info(skb);
2103         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2104                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2105         else
2106                 fl4.flowi4_tun_key.tun_id = 0;
2107         skb_dst_drop(skb);
2108
2109         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2110                 goto martian_source;
2111
2112         res->fi = NULL;
2113         res->table = NULL;
2114         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2115                 goto brd_input;
2116
	/* Accept zero addresses only for limited broadcast;
	 * I do not even know whether to fix this or not. Waiting for
	 * complaints :-)
	 */
2120         if (ipv4_is_zeronet(saddr))
2121                 goto martian_source;
2122
2123         if (ipv4_is_zeronet(daddr))
2124                 goto martian_destination;
2125
	/* The following code tries to avoid calling
	 * IN_DEV_NET_ROUTE_LOCALNET(), calling it at most once and only
	 * if daddr and/or saddr is a loopback address.
	 */
2129         if (ipv4_is_loopback(daddr)) {
2130                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2131                         goto martian_destination;
2132         } else if (ipv4_is_loopback(saddr)) {
2133                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2134                         goto martian_source;
2135         }
2136
	/*
	 *	Now we are ready to route the packet.
	 */
2140         fl4.flowi4_oif = 0;
2141         fl4.flowi4_iif = dev->ifindex;
2142         fl4.flowi4_mark = skb->mark;
2143         fl4.flowi4_tos = tos;
2144         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2145         fl4.flowi4_flags = 0;
2146         fl4.daddr = daddr;
2147         fl4.saddr = saddr;
2148         fl4.flowi4_uid = sock_net_uid(net, NULL);
2149
2150         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2151                 flkeys = &_flkeys;
2152         } else {
2153                 fl4.flowi4_proto = 0;
2154                 fl4.fl4_sport = 0;
2155                 fl4.fl4_dport = 0;
2156         }
2157
2158         err = fib_lookup(net, &fl4, res, 0);
2159         if (err != 0) {
2160                 if (!IN_DEV_FORWARD(in_dev))
2161                         err = -EHOSTUNREACH;
2162                 goto no_route;
2163         }
2164
2165         if (res->type == RTN_BROADCAST) {
2166                 if (IN_DEV_BFORWARD(in_dev))
2167                         goto make_route;
		/* do not cache if bc_forwarding is enabled */
2169                 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2170                         do_cache = false;
2171                 goto brd_input;
2172         }
2173
2174         if (res->type == RTN_LOCAL) {
2175                 err = fib_validate_source(skb, saddr, daddr, tos,
2176                                           0, dev, in_dev, &itag);
2177                 if (err < 0)
2178                         goto martian_source;
2179                 goto local_input;
2180         }
2181
2182         if (!IN_DEV_FORWARD(in_dev)) {
2183                 err = -EHOSTUNREACH;
2184                 goto no_route;
2185         }
2186         if (res->type != RTN_UNICAST)
2187                 goto martian_destination;
2188
2189 make_route:
2190         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2191 out:    return err;
2192
2193 brd_input:
2194         if (skb->protocol != htons(ETH_P_IP))
2195                 goto e_inval;
2196
2197         if (!ipv4_is_zeronet(saddr)) {
2198                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2199                                           in_dev, &itag);
2200                 if (err < 0)
2201                         goto martian_source;
2202         }
2203         flags |= RTCF_BROADCAST;
2204         res->type = RTN_BROADCAST;
2205         RT_CACHE_STAT_INC(in_brd);
2206
2207 local_input:
2208         do_cache &= res->fi && !itag;
2209         if (do_cache) {
2210                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2211
2212                 rth = rcu_dereference(nhc->nhc_rth_input);
2213                 if (rt_cache_valid(rth)) {
2214                         skb_dst_set_noref(skb, &rth->dst);
2215                         err = 0;
2216                         goto out;
2217                 }
2218         }
2219
2220         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2221                            flags | RTCF_LOCAL, res->type,
2222                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2223         if (!rth)
2224                 goto e_nobufs;
2225
	rth->dst.output = ip_rt_bug;
2227 #ifdef CONFIG_IP_ROUTE_CLASSID
2228         rth->dst.tclassid = itag;
2229 #endif
2230         rth->rt_is_input = 1;
2231
2232         RT_CACHE_STAT_INC(in_slow_tot);
2233         if (res->type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
2237         }
2238
2239         if (do_cache) {
2240                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2241
2242                 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2243                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2244                         WARN_ON(rth->dst.input == lwtunnel_input);
2245                         rth->dst.lwtstate->orig_input = rth->dst.input;
2246                         rth->dst.input = lwtunnel_input;
2247                 }
2248
2249                 if (unlikely(!rt_cache_route(nhc, rth)))
2250                         rt_add_uncached_list(rth);
2251         }
2252         skb_dst_set(skb, &rth->dst);
2253         err = 0;
2254         goto out;
2255
2256 no_route:
2257         RT_CACHE_STAT_INC(in_no_route);
2258         res->type = RTN_UNREACHABLE;
2259         res->fi = NULL;
2260         res->table = NULL;
2261         goto local_input;
2262
2263         /*
2264          *      Do not cache martian addresses: they should be logged (RFC1812)
2265          */
2266 martian_destination:
2267         RT_CACHE_STAT_INC(in_martian_dst);
2268 #ifdef CONFIG_IP_ROUTE_VERBOSE
2269         if (IN_DEV_LOG_MARTIANS(in_dev))
2270                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2271                                      &daddr, &saddr, dev->name);
2272 #endif
2273
2274 e_inval:
2275         err = -EINVAL;
2276         goto out;
2277
2278 e_nobufs:
2279         err = -ENOBUFS;
2280         goto out;
2281
2282 martian_source:
2283         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2284         goto out;
2285 }
2286
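/* Entry point for input route lookup on received packets; takes
 * rcu_read_lock() around ip_route_input_rcu() and may attach the
 * resulting dst to the skb without holding a reference (hence _noref).
 */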
2287 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2288                          u8 tos, struct net_device *dev)
2289 {
2290         struct fib_result res;
2291         int err;
2292
2293         tos &= IPTOS_RT_MASK;
2294         rcu_read_lock();
2295         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2296         rcu_read_unlock();
2297
2298         return err;
2299 }
2300 EXPORT_SYMBOL(ip_route_input_noref);
2301
2302 /* called with rcu_read_lock held */
2303 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2304                        u8 tos, struct net_device *dev, struct fib_result *res)
2305 {
	/* Multicast recognition logic is moved from the route cache to here.
	 * The problem was that too many Ethernet cards have broken/missing
	 * hardware multicast filters :-( As a result, a host on a multicast
	 * network acquires a lot of useless route cache entries, sort of
	 * SDR messages from all over the world. Now we try to get rid of
	 * them. Really, provided the software IP multicast filter is
	 * organized reasonably (at least, hashed), it does not result in a
	 * slowdown compared with route cache reject entries.
	 * Note that multicast routers are not affected, because a route
	 * cache entry is created eventually.
	 */
2317         if (ipv4_is_multicast(daddr)) {
2318                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2319                 int our = 0;
2320                 int err = -EINVAL;
2321
2322                 if (!in_dev)
2323                         return err;
2324                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2325                                       ip_hdr(skb)->protocol);
2326
2327                 /* check l3 master if no match yet */
2328                 if (!our && netif_is_l3_slave(dev)) {
2329                         struct in_device *l3_in_dev;
2330
2331                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2332                         if (l3_in_dev)
2333                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2334                                                       ip_hdr(skb)->protocol);
2335                 }
2336
2337                 if (our
2338 #ifdef CONFIG_IP_MROUTE
2339                         ||
2340                     (!ipv4_is_local_multicast(daddr) &&
2341                      IN_DEV_MFORWARD(in_dev))
2342 #endif
2343                    ) {
2344                         err = ip_route_input_mc(skb, daddr, saddr,
2345                                                 tos, dev, our);
2346                 }
2347                 return err;
2348         }
2349
2350         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2351 }
2352
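/* Build (or fetch from the nexthop cache) the output route for @fl4 on
 * @dev_out, classifying the destination as broadcast, multicast or local
 * as needed.
 */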
2353 /* called with rcu_read_lock() */
2354 static struct rtable *__mkroute_output(const struct fib_result *res,
2355                                        const struct flowi4 *fl4, int orig_oif,
2356                                        struct net_device *dev_out,
2357                                        unsigned int flags)
2358 {
2359         struct fib_info *fi = res->fi;
2360         struct fib_nh_exception *fnhe;
2361         struct in_device *in_dev;
2362         u16 type = res->type;
2363         struct rtable *rth;
2364         bool do_cache;
2365
2366         in_dev = __in_dev_get_rcu(dev_out);
2367         if (!in_dev)
2368                 return ERR_PTR(-EINVAL);
2369
2370         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2371                 if (ipv4_is_loopback(fl4->saddr) &&
2372                     !(dev_out->flags & IFF_LOOPBACK) &&
2373                     !netif_is_l3_master(dev_out))
2374                         return ERR_PTR(-EINVAL);
2375
2376         if (ipv4_is_lbcast(fl4->daddr))
2377                 type = RTN_BROADCAST;
2378         else if (ipv4_is_multicast(fl4->daddr))
2379                 type = RTN_MULTICAST;
2380         else if (ipv4_is_zeronet(fl4->daddr))
2381                 return ERR_PTR(-EINVAL);
2382
2383         if (dev_out->flags & IFF_LOOPBACK)
2384                 flags |= RTCF_LOCAL;
2385
2386         do_cache = true;
2387         if (type == RTN_BROADCAST) {
2388                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2389                 fi = NULL;
2390         } else if (type == RTN_MULTICAST) {
2391                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2392                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2393                                      fl4->flowi4_proto))
2394                         flags &= ~RTCF_LOCAL;
2395                 else
2396                         do_cache = false;
		/* If a multicast route does not exist, use
		 * the default one, but do not use a gateway in this case.
		 * Yes, it is a hack.
		 */
2401                 if (fi && res->prefixlen < 4)
2402                         fi = NULL;
2403         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2404                    (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface: if the intended recipient is
		 * waiting on that interface for the packet, it won't be
		 * received, because the packet will be delivered on the
		 * loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
2414                 do_cache = false;
2415         }
2416
2417         fnhe = NULL;
2418         do_cache &= fi != NULL;
2419         if (fi) {
2420                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2421                 struct rtable __rcu **prth;
2422
2423                 fnhe = find_exception(nhc, fl4->daddr);
2424                 if (!do_cache)
2425                         goto add;
2426                 if (fnhe) {
2427                         prth = &fnhe->fnhe_rth_output;
2428                 } else {
2429                         if (unlikely(fl4->flowi4_flags &
2430                                      FLOWI_FLAG_KNOWN_NH &&
2431                                      !(nhc->nhc_gw_family &&
2432                                        nhc->nhc_scope == RT_SCOPE_LINK))) {
2433                                 do_cache = false;
2434                                 goto add;
2435                         }
2436                         prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2437                 }
2438                 rth = rcu_dereference(*prth);
2439                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2440                         return rth;
2441         }
2442
2443 add:
2444         rth = rt_dst_alloc(dev_out, flags, type,
2445                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2446                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2447                            do_cache);
2448         if (!rth)
2449                 return ERR_PTR(-ENOBUFS);
2450
2451         rth->rt_iif = orig_oif;
2452
2453         RT_CACHE_STAT_INC(out_slow_tot);
2454
2455         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2456                 if (flags & RTCF_LOCAL &&
2457                     !(dev_out->flags & IFF_LOOPBACK)) {
2458                         rth->dst.output = ip_mc_output;
2459                         RT_CACHE_STAT_INC(out_slow_mc);
2460                 }
2461 #ifdef CONFIG_IP_MROUTE
2462                 if (type == RTN_MULTICAST) {
2463                         if (IN_DEV_MFORWARD(in_dev) &&
2464                             !ipv4_is_local_multicast(fl4->daddr)) {
2465                                 rth->dst.input = ip_mr_input;
2466                                 rth->dst.output = ip_mc_output;
2467                         }
2468                 }
2469 #endif
2470         }
2471
2472         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2473         lwtunnel_set_redirect(&rth->dst);
2474
2475         return rth;
2476 }
2477
2478 /*
2479  * Major route resolver routine.
2480  */
2481
2482 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2483                                         const struct sk_buff *skb)
2484 {
2485         __u8 tos = RT_FL_TOS(fl4);
2486         struct fib_result res = {
2487                 .type           = RTN_UNSPEC,
2488                 .fi             = NULL,
2489                 .table          = NULL,
2490                 .tclassid       = 0,
2491         };
2492         struct rtable *rth;
2493
2494         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2495         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2496         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2497                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2498
2499         rcu_read_lock();
2500         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2501         rcu_read_unlock();
2502
2503         return rth;
2504 }
2505 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2506
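/* Core of the output route resolver; must be called with rcu_read_lock()
 * held, as done by the ip_route_output_key_hash() wrapper above.
 */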
2507 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2508                                             struct fib_result *res,
2509                                             const struct sk_buff *skb)
2510 {
2511         struct net_device *dev_out = NULL;
2512         int orig_oif = fl4->flowi4_oif;
2513         unsigned int flags = 0;
2514         struct rtable *rth;
2515         int err;
2516
2517         if (fl4->saddr) {
2518                 if (ipv4_is_multicast(fl4->saddr) ||
2519                     ipv4_is_lbcast(fl4->saddr) ||
2520                     ipv4_is_zeronet(fl4->saddr)) {
2521                         rth = ERR_PTR(-EINVAL);
2522                         goto out;
2523                 }
2524
2525                 rth = ERR_PTR(-ENETUNREACH);
2526
		/* I removed the check for oif == dev_out->oif here.
		 * It was wrong for two reasons:
		 * 1. ip_dev_find(net, saddr) can return the wrong iface, if
		 *    saddr is assigned to multiple interfaces.
		 * 2. Moreover, we are allowed to send packets with the saddr
		 *    of another iface. --ANK
		 */
2534
2535                 if (fl4->flowi4_oif == 0 &&
2536                     (ipv4_is_multicast(fl4->daddr) ||
2537                      ipv4_is_lbcast(fl4->daddr))) {
2538                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2539                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2540                         if (!dev_out)
2541                                 goto out;
2542
			/* Special hack: the user can direct multicasts
			 * and limited broadcast via the necessary interface
			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			 * This hack is not just for fun, it allows
			 * vic, vat and friends to work.
			 * They bind a socket to loopback, set ttl to zero
			 * and expect that it will work.
			 * From the viewpoint of the routing cache they are
			 * broken, because we are not allowed to build a
			 * multicast path with a loopback source addr (look,
			 * the routing cache cannot know that ttl is zero, so
			 * the packet will not leave this host and the route
			 * is valid). Luckily, this hack is a good workaround.
			 */
2557
2558                         fl4->flowi4_oif = dev_out->ifindex;
2559                         goto make_route;
2560                 }
2561
2562                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2563                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2564                         if (!__ip_dev_find(net, fl4->saddr, false))
2565                                 goto out;
2566                 }
2567         }
2568
2570         if (fl4->flowi4_oif) {
2571                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2572                 rth = ERR_PTR(-ENODEV);
2573                 if (!dev_out)
2574                         goto out;
2575
2576                 /* RACE: Check return value of inet_select_addr instead. */
2577                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2578                         rth = ERR_PTR(-ENETUNREACH);
2579                         goto out;
2580                 }
2581                 if (ipv4_is_local_multicast(fl4->daddr) ||
2582                     ipv4_is_lbcast(fl4->daddr) ||
2583                     fl4->flowi4_proto == IPPROTO_IGMP) {
2584                         if (!fl4->saddr)
2585                                 fl4->saddr = inet_select_addr(dev_out, 0,
2586                                                               RT_SCOPE_LINK);
2587                         goto make_route;
2588                 }
2589                 if (!fl4->saddr) {
2590                         if (ipv4_is_multicast(fl4->daddr))
2591                                 fl4->saddr = inet_select_addr(dev_out, 0,
2592                                                               fl4->flowi4_scope);
2593                         else if (!fl4->daddr)
2594                                 fl4->saddr = inet_select_addr(dev_out, 0,
2595                                                               RT_SCOPE_HOST);
2596                 }
2597         }
2598
2599         if (!fl4->daddr) {
2600                 fl4->daddr = fl4->saddr;
2601                 if (!fl4->daddr)
2602                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2603                 dev_out = net->loopback_dev;
2604                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2605                 res->type = RTN_LOCAL;
2606                 flags |= RTCF_LOCAL;
2607                 goto make_route;
2608         }
2609
2610         err = fib_lookup(net, fl4, res, 0);
2611         if (err) {
2612                 res->fi = NULL;
2613                 res->table = NULL;
2614                 if (fl4->flowi4_oif &&
2615                     (ipv4_is_multicast(fl4->daddr) ||
2616                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
			/* Apparently, the routing tables are wrong. Assume
			 * that the destination is on-link.
			 *
			 * WHY? DW.
			 * Because we are allowed to send to an iface
			 * even if it has NO routes and NO assigned
			 * addresses. When oif is specified, routing
			 * tables are looked up with only one purpose:
			 * to catch if the destination is gatewayed, rather
			 * than direct. Moreover, if MSG_DONTROUTE is set,
			 * we send the packet, ignoring both routing tables
			 * and ifaddr state. --ANK
			 *
			 * We could do it even if oif is unknown,
			 * as IPv6 likely does, but we do not.
			 */
2634
2635                         if (fl4->saddr == 0)
2636                                 fl4->saddr = inet_select_addr(dev_out, 0,
2637                                                               RT_SCOPE_LINK);
2638                         res->type = RTN_UNICAST;
2639                         goto make_route;
2640                 }
2641                 rth = ERR_PTR(err);
2642                 goto out;
2643         }
2644
2645         if (res->type == RTN_LOCAL) {
2646                 if (!fl4->saddr) {
2647                         if (res->fi->fib_prefsrc)
2648                                 fl4->saddr = res->fi->fib_prefsrc;
2649                         else
2650                                 fl4->saddr = fl4->daddr;
2651                 }
2652
2653                 /* L3 master device is the loopback for that domain */
2654                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2655                         net->loopback_dev;
2656
2657                 /* make sure orig_oif points to fib result device even
2658                  * though packet rx/tx happens over loopback or l3mdev
2659                  */
2660                 orig_oif = FIB_RES_OIF(*res);
2661
2662                 fl4->flowi4_oif = dev_out->ifindex;
2663                 flags |= RTCF_LOCAL;
2664                 goto make_route;
2665         }
2666
2667         fib_select_path(net, res, fl4, skb);
2668
2669         dev_out = FIB_RES_DEV(*res);
2670         fl4->flowi4_oif = dev_out->ifindex;
2671
2673 make_route:
2674         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2675
2676 out:
2677         return rth;
2678 }
2679
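/* dst_ops for blackhole routes: the check always fails, PMTU and
 * redirect updates are ignored, and metrics are never COWed.
 */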
2680 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2681 {
2682         return NULL;
2683 }
2684
2685 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2686 {
2687         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2688
2689         return mtu ? : dst->dev->mtu;
2690 }
2691
2692 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2693                                           struct sk_buff *skb, u32 mtu,
2694                                           bool confirm_neigh)
2695 {
2696 }
2697
2698 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2699                                        struct sk_buff *skb)
2700 {
2701 }
2702
2703 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2704                                           unsigned long old)
2705 {
2706         return NULL;
2707 }
2708
2709 static struct dst_ops ipv4_dst_blackhole_ops = {
2710         .family                 =       AF_INET,
2711         .check                  =       ipv4_blackhole_dst_check,
2712         .mtu                    =       ipv4_blackhole_mtu,
2713         .default_advmss         =       ipv4_default_advmss,
2714         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2715         .redirect               =       ipv4_rt_blackhole_redirect,
2716         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2717         .neigh_lookup           =       ipv4_neigh_lookup,
2718 };
2719
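/* Turn @dst_orig into a blackhole route: a copy whose input and output
 * handlers discard traffic, typically used when a route must temporarily
 * drop packets (e.g. during xfrm resolution); the reference on the
 * original dst is released.
 */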
2720 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2721 {
2722         struct rtable *ort = (struct rtable *) dst_orig;
2723         struct rtable *rt;
2724
2725         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2726         if (rt) {
2727                 struct dst_entry *new = &rt->dst;
2728
2729                 new->__use = 1;
2730                 new->input = dst_discard;
2731                 new->output = dst_discard_out;
2732
2733                 new->dev = net->loopback_dev;
2734                 if (new->dev)
2735                         dev_hold(new->dev);
2736
2737                 rt->rt_is_input = ort->rt_is_input;
2738                 rt->rt_iif = ort->rt_iif;
2739                 rt->rt_pmtu = ort->rt_pmtu;
2740                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2741
2742                 rt->rt_genid = rt_genid_ipv4(net);
2743                 rt->rt_flags = ort->rt_flags;
2744                 rt->rt_type = ort->rt_type;
2745                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2746                 rt->rt_gw_family = ort->rt_gw_family;
2747                 if (rt->rt_gw_family == AF_INET)
2748                         rt->rt_gw4 = ort->rt_gw4;
2749                 else if (rt->rt_gw_family == AF_INET6)
2750                         rt->rt_gw6 = ort->rt_gw6;
2751
2752                 INIT_LIST_HEAD(&rt->rt_uncached);
2753         }
2754
2755         dst_release(dst_orig);
2756
2757         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2758 }
2759
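/* Resolve an output route for @flp4 and, if a transport protocol is set,
 * pass the result through xfrm_lookup_route() so that IPsec policies can
 * transform or reject it.
 */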
2760 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2761                                     const struct sock *sk)
2762 {
2763         struct rtable *rt = __ip_route_output_key(net, flp4);
2764
2765         if (IS_ERR(rt))
2766                 return rt;
2767
2768         if (flp4->flowi4_proto)
2769                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2770                                                         flowi4_to_flowi(flp4),
2771                                                         sk, 0);
2772
2773         return rt;
2774 }
2775 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2776
2777 /* called with rcu_read_lock held */
2778 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2779                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2780                         struct sk_buff *skb, u32 portid, u32 seq,
2781                         unsigned int flags)
2782 {
2783         struct rtmsg *r;
2784         struct nlmsghdr *nlh;
2785         unsigned long expires = 0;
2786         u32 error;
2787         u32 metrics[RTAX_MAX];
2788
2789         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2790         if (!nlh)
2791                 return -EMSGSIZE;
2792
2793         r = nlmsg_data(nlh);
2794         r->rtm_family    = AF_INET;
2795         r->rtm_dst_len  = 32;
2796         r->rtm_src_len  = 0;
2797         r->rtm_tos      = fl4 ? fl4->flowi4_tos : 0;
2798         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2799         if (nla_put_u32(skb, RTA_TABLE, table_id))
2800                 goto nla_put_failure;
2801         r->rtm_type     = rt->rt_type;
2802         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2803         r->rtm_protocol = RTPROT_UNSPEC;
2804         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2805         if (rt->rt_flags & RTCF_NOTIFY)
2806                 r->rtm_flags |= RTM_F_NOTIFY;
2807         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2808                 r->rtm_flags |= RTCF_DOREDIRECT;
2809
	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (fl4 && !rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway) {
		if (rt->rt_gw_family == AF_INET &&
		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
			goto nla_put_failure;
		} else if (rt->rt_gw_family == AF_INET6) {
			int alen = sizeof(struct in6_addr);
			struct nlattr *nla;
			struct rtvia *via;

			nla = nla_reserve(skb, RTA_VIA, alen + 2);
			if (!nla)
				goto nla_put_failure;

			via = nla_data(nla);
			via->rtvia_family = AF_INET6;
			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
		}
	}

	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rt->rt_mtu_locked && expires)
		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4) {
		if (fl4->flowi4_mark &&
		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
			goto nla_put_failure;

		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
		    nla_put_u32(skb, RTA_UID,
				from_kuid_munged(current_user_ns(),
						 fl4->flowi4_uid)))
			goto nla_put_failure;

		if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
			if (ipv4_is_multicast(dst) &&
			    !ipv4_is_local_multicast(dst) &&
			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
				int err = ipmr_get_route(net, skb,
							 fl4->saddr, fl4->daddr,
							 r, portid);

				if (err <= 0) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				}
			} else
#endif
				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
					goto nla_put_failure;
		}
	}

	error = rt->dst.error;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

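/* Dump the routes cached in one fnhe hash bucket. Entries below
 * fa_start are skipped so an interrupted dump can resume, and stale
 * entries (wrong genid, already expired) are skipped too; both cases
 * still advance *fa_index so resumption stays consistent.
 */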
static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
			    struct netlink_callback *cb, u32 table_id,
			    struct fnhe_hash_bucket *bucket, int genid,
			    int *fa_index, int fa_start, unsigned int flags)
{
	int i;

	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
			struct rtable *rt;
			int err;

			if (*fa_index < fa_start)
				goto next;

			if (fnhe->fnhe_genid != genid)
				goto next;

			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires))
				goto next;

			rt = rcu_dereference(fnhe->fnhe_rth_input);
			if (!rt)
				rt = rcu_dereference(fnhe->fnhe_rth_output);
			if (!rt)
				goto next;

			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
					   table_id, NULL, skb,
					   NETLINK_CB(cb->skb).portid,
					   cb->nlh->nlmsg_seq, flags);
			if (err)
				return err;
next:
			(*fa_index)++;
		}
	}

	return 0;
}

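/* For each usable next hop of @fi, walk its exception cache under RCU
 * and dump the cached routes. A non-zero return (typically -EMSGSIZE
 * once the skb is full) is propagated to the caller so the netlink
 * dump can pick up where it left off.
 */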
int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
		       u32 table_id, struct fib_info *fi,
		       int *fa_index, int fa_start, unsigned int flags)
{
	struct net *net = sock_net(cb->skb->sk);
	int nhsel, genid = fnhe_genid(net);

	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
		struct fnhe_hash_bucket *bucket;
		int err;

		if (nhc->nhc_flags & RTNH_F_DEAD)
			continue;

		rcu_read_lock();
		bucket = rcu_dereference(nhc->nhc_exceptions);
		err = 0;
		if (bucket)
			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
					       genid, fa_index, fa_start,
					       flags);
		rcu_read_unlock();
		if (err)
			return err;
	}

	return 0;
}

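/* Build a dummy packet (an IP header plus a minimal UDP, TCP or ICMP
 * header) so that a RTM_GETROUTE request can be run through the real
 * input path below; only the header fields a route lookup may inspect
 * are filled in.
 */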
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
						   u8 ip_proto, __be16 sport,
						   __be16 dport)
{
	struct sk_buff *skb;
	struct iphdr *iph;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return NULL;

	/* Reserve room for dummy headers; this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IP);
	iph = skb_put(skb, sizeof(struct iphdr));
	iph->protocol = ip_proto;
	iph->saddr = src;
	iph->daddr = dst;
	iph->version = 0x4;
	iph->frag_off = 0;
	iph->ihl = 0x5;
	skb_set_transport_header(skb, skb->len);

	switch (iph->protocol) {
	case IPPROTO_UDP: {
		struct udphdr *udph;

		udph = skb_put_zero(skb, sizeof(struct udphdr));
		udph->source = sport;
		udph->dest = dport;
		udph->len = htons(sizeof(struct udphdr));
		udph->check = 0;
		break;
	}
	case IPPROTO_TCP: {
		struct tcphdr *tcph;

		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
		tcph->source	= sport;
		tcph->dest	= dport;
		tcph->doff	= sizeof(struct tcphdr) / 4;
		tcph->rst = 1;
		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
					    src, dst, 0);
		break;
	}
	case IPPROTO_ICMP: {
		struct icmphdr *icmph;

		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
		icmph->type = ICMP_ECHO;
		icmph->code = 0;
		break;
	}
	}

	return skb;
}

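/* Validate a RTM_GETROUTE request. Sockets that did not opt in to
 * strict checking keep the old lenient parsing; strict requesters must
 * leave the reserved rtmsg fields zero and may only use the flags and
 * attributes this handler actually understands.
 */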
static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
				       const struct nlmsghdr *nlh,
				       struct nlattr **tb,
				       struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG(extack,
			       "ipv4: Invalid header for route get request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv4_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
	    rtm->rtm_table || rtm->rtm_protocol ||
	    rtm->rtm_scope || rtm->rtm_type) {
		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
		return -EINVAL;
	}

	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
			       RTM_F_LOOKUP_TABLE |
			       RTM_F_FIB_MATCH)) {
		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv4_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_IIF:
		case RTA_OIF:
		case RTA_SRC:
		case RTA_DST:
		case RTA_IP_PROTO:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_MARK:
		case RTA_UID:
			break;
		default:
			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
			return -EINVAL;
		}
	}

	return 0;
}

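/* RTM_GETROUTE handler, e.g. what "ip route get 8.8.8.8" ends up
 * calling: resolve the requested flow through the input path when
 * RTA_IIF is given, or the output path otherwise, then unicast the
 * resulting route (or the matching FIB entry, with RTM_F_FIB_MATCH)
 * back to the requester.
 */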
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi4 fl4 = {};
	__be32 dst = 0;
	__be32 src = 0;
	kuid_t uid;
	u32 iif;
	int err;
	int mark;

	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		return err;

	rtm = nlmsg_data(nlh);
	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, AF_INET, extack);
		if (err)
			return err;
	}

	if (tb[RTA_SPORT])
		sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		dport = nla_get_be16(tb[RTA_DPORT]);

	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
	if (!skb)
		return -ENOBUFS;

	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	if (sport)
		fl4.fl4_sport = sport;
	if (dport)
		fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

	rcu_read_lock();

	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev	= dev;
		skb->mark	= mark;
		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
					 dev, &res);

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		struct fib_rt_info fri;

		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		fri.fi = res.fi;
		fri.tb_id = table_id;
		fri.dst = res.prefix;
		fri.dst_len = res.prefixlen;
		fri.tos = fl4.flowi4_tos;
		fri.type = rt->rt_type;
		fri.offload = 0;
		fri.trap = 0;
		if (res.fa_head) {
			struct fib_alias *fa;

			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
				u8 slen = 32 - fri.dst_len;

				if (fa->fa_slen == slen &&
				    fa->tb_id == fri.tb_id &&
				    fa->fa_tos == fri.tos &&
				    fa->fa_info == res.fi &&
				    fa->fa_type == fri.type) {
					fri.offload = fa->offload;
					fri.trap = fa->trap;
					break;
				}
			}
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid,
				   nlh->nlmsg_seq, 0);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;
errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;

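/* Handler for the write-only "flush" sysctl: any write, e.g.
 * "echo 1 > /proc/sys/net/ipv4/route/flush", invalidates this netns's
 * cached routes and exceptions by bumping both generation counters.
 */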
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}

static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static const char ipv4_route_flush_procname[] = "flush";

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= ipv4_route_flush_procname,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

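/* Register the per-netns route sysctls. Namespaces other than init_net
 * get their own copy of the flush table, with extra1 pointing back at
 * the netns so the flush handler knows which caches to invalidate.
 */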
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export non-whitelisted sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns) {
			if (tbl[0].procname != ipv4_route_flush_procname)
				tbl[0].procname = NULL;
		}
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

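/* Start each netns with fresh generation counters; dev_addr_genid is
 * seeded randomly so stale address-derived state is unlikely to match
 * across namespace lifetimes.
 */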
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

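/* Give each netns its own inetpeer tree, torn down again on exit. */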
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

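/* Boot-time setup for the IPv4 routing layer: seed the IP ID state,
 * set up the per-cpu uncached route lists and dst slab caches, create
 * the /proc files, register the RTM_GETROUTE handler and hook up the
 * per-netns subsystems defined above.
 */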
int __init ip_rt_init(void)
{
	int cpu;

	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
				  GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif