Merge tag 'arm-soc-drivers-5.11' of git://git.kernel.org/pub/scm/linux/kernel/git...
[linux-2.6-microblaze.git] / net / ipv4 / route.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              ROUTE - implementation of the IP router.
8  *
9  * Authors:     Ross Biro
10  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
12  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14  *
15  * Fixes:
16  *              Alan Cox        :       Verify area fixes.
17  *              Alan Cox        :       cli() protects routing changes
18  *              Rui Oliveira    :       ICMP routing table updates
19  *              (rco@di.uminho.pt)      Routing table insertion and update
20  *              Linus Torvalds  :       Rewrote bits to be sensible
21  *              Alan Cox        :       Added BSD route gw semantics
22  *              Alan Cox        :       Super /proc >4K
23  *              Alan Cox        :       MTU in route table
24  *              Alan Cox        :       MSS actually. Also added the window
25  *                                      clamper.
26  *              Sam Lantinga    :       Fixed route matching in rt_del()
27  *              Alan Cox        :       Routing cache support.
28  *              Alan Cox        :       Removed compatibility cruft.
29  *              Alan Cox        :       RTF_REJECT support.
30  *              Alan Cox        :       TCP irtt support.
31  *              Jonathan Naylor :       Added Metric support.
32  *      Miquel van Smoorenburg  :       BSD API fixes.
33  *      Miquel van Smoorenburg  :       Metrics.
34  *              Alan Cox        :       Use __u32 properly
35  *              Alan Cox        :       Aligned routing errors more closely with BSD
36  *                                      our system is still very different.
37  *              Alan Cox        :       Faster /proc handling
38  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
39  *                                      routing caches and better behaviour.
40  *
41  *              Olaf Erb        :       irtt wasn't being copied right.
42  *              Bjorn Ekwall    :       Kerneld route support.
43  *              Alan Cox        :       Multicast fixed (I hope)
44  *              Pavel Krauz     :       Limited broadcast fixed
45  *              Mike McLagan    :       Routing by source
46  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
47  *                                      route.c and rewritten from scratch.
48  *              Andi Kleen      :       Load-limit warning messages.
49  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
50  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
51  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
52  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
53  *              Marc Boucher    :       routing by fwmark
54  *      Robert Olsson           :       Added rt_cache statistics
55  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
56  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
57  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
58  *      Ilia Sotnikov           :       Removed TOS from hash calculations
59  */
60
61 #define pr_fmt(fmt) "IPv4: " fmt
62
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/string.h>
70 #include <linux/socket.h>
71 #include <linux/sockios.h>
72 #include <linux/errno.h>
73 #include <linux/in.h>
74 #include <linux/inet.h>
75 #include <linux/netdevice.h>
76 #include <linux/proc_fs.h>
77 #include <linux/init.h>
78 #include <linux/skbuff.h>
79 #include <linux/inetdevice.h>
80 #include <linux/igmp.h>
81 #include <linux/pkt_sched.h>
82 #include <linux/mroute.h>
83 #include <linux/netfilter_ipv4.h>
84 #include <linux/random.h>
85 #include <linux/rcupdate.h>
86 #include <linux/times.h>
87 #include <linux/slab.h>
88 #include <linux/jhash.h>
89 #include <net/dst.h>
90 #include <net/dst_metadata.h>
91 #include <net/net_namespace.h>
92 #include <net/protocol.h>
93 #include <net/ip.h>
94 #include <net/route.h>
95 #include <net/inetpeer.h>
96 #include <net/sock.h>
97 #include <net/ip_fib.h>
98 #include <net/nexthop.h>
99 #include <net/arp.h>
100 #include <net/tcp.h>
101 #include <net/icmp.h>
102 #include <net/xfrm.h>
103 #include <net/lwtunnel.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #endif
109 #include <net/secure_seq.h>
110 #include <net/ip_tunnels.h>
111 #include <net/l3mdev.h>
112
113 #include "fib_lookup.h"
114
115 #define RT_FL_TOS(oldflp4) \
116         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_max_size;
121 static int ip_rt_redirect_number __read_mostly  = 9;
122 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
123 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
124 static int ip_rt_error_cost __read_mostly       = HZ;
125 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
126 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
127 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
128 static int ip_rt_min_advmss __read_mostly       = 256;
129
130 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
131
132 /*
133  *      Interface to generic destination cache.
134  */
135
136 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
137 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
138 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
139 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
140 static void              ipv4_link_failure(struct sk_buff *skb);
141 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
142                                            struct sk_buff *skb, u32 mtu,
143                                            bool confirm_neigh);
144 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145                                         struct sk_buff *skb);
146 static void             ipv4_dst_destroy(struct dst_entry *dst);
147
148 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
149 {
150         WARN_ON(1);
151         return NULL;
152 }
153
154 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
155                                            struct sk_buff *skb,
156                                            const void *daddr);
157 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
158
159 static struct dst_ops ipv4_dst_ops = {
160         .family =               AF_INET,
161         .check =                ipv4_dst_check,
162         .default_advmss =       ipv4_default_advmss,
163         .mtu =                  ipv4_mtu,
164         .cow_metrics =          ipv4_cow_metrics,
165         .destroy =              ipv4_dst_destroy,
166         .negative_advice =      ipv4_negative_advice,
167         .link_failure =         ipv4_link_failure,
168         .update_pmtu =          ip_rt_update_pmtu,
169         .redirect =             ip_do_redirect,
170         .local_out =            __ip_local_out,
171         .neigh_lookup =         ipv4_neigh_lookup,
172         .confirm_neigh =        ipv4_confirm_neigh,
173 };
174
175 #define ECN_OR_COST(class)      TC_PRIO_##class
176
177 const __u8 ip_tos2prio[16] = {
178         TC_PRIO_BESTEFFORT,
179         ECN_OR_COST(BESTEFFORT),
180         TC_PRIO_BESTEFFORT,
181         ECN_OR_COST(BESTEFFORT),
182         TC_PRIO_BULK,
183         ECN_OR_COST(BULK),
184         TC_PRIO_BULK,
185         ECN_OR_COST(BULK),
186         TC_PRIO_INTERACTIVE,
187         ECN_OR_COST(INTERACTIVE),
188         TC_PRIO_INTERACTIVE,
189         ECN_OR_COST(INTERACTIVE),
190         TC_PRIO_INTERACTIVE_BULK,
191         ECN_OR_COST(INTERACTIVE_BULK),
192         TC_PRIO_INTERACTIVE_BULK,
193         ECN_OR_COST(INTERACTIVE_BULK)
194 };
195 EXPORT_SYMBOL(ip_tos2prio);
196
197 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
198 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
199
200 #ifdef CONFIG_PROC_FS
201 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
202 {
203         if (*pos)
204                 return NULL;
205         return SEQ_START_TOKEN;
206 }
207
208 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
209 {
210         ++*pos;
211         return NULL;
212 }
213
/* seq_file ->stop(): ->start() acquires nothing, so there is nothing to undo. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
217
218 static int rt_cache_seq_show(struct seq_file *seq, void *v)
219 {
220         if (v == SEQ_START_TOKEN)
221                 seq_printf(seq, "%-127s\n",
222                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
223                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
224                            "HHUptod\tSpecDst");
225         return 0;
226 }
227
228 static const struct seq_operations rt_cache_seq_ops = {
229         .start  = rt_cache_seq_start,
230         .next   = rt_cache_seq_next,
231         .stop   = rt_cache_seq_stop,
232         .show   = rt_cache_seq_show,
233 };
234
235 static int rt_cache_seq_open(struct inode *inode, struct file *file)
236 {
237         return seq_open(file, &rt_cache_seq_ops);
238 }
239
240 static const struct proc_ops rt_cache_proc_ops = {
241         .proc_open      = rt_cache_seq_open,
242         .proc_read      = seq_read,
243         .proc_lseek     = seq_lseek,
244         .proc_release   = seq_release,
245 };
246
247
248 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
249 {
250         int cpu;
251
252         if (*pos == 0)
253                 return SEQ_START_TOKEN;
254
255         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
256                 if (!cpu_possible(cpu))
257                         continue;
258                 *pos = cpu+1;
259                 return &per_cpu(rt_cache_stat, cpu);
260         }
261         return NULL;
262 }
263
264 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
265 {
266         int cpu;
267
268         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
269                 if (!cpu_possible(cpu))
270                         continue;
271                 *pos = cpu+1;
272                 return &per_cpu(rt_cache_stat, cpu);
273         }
274         (*pos)++;
275         return NULL;
276
277 }
278
/* seq_file ->stop(): the iterator holds no resources, so this is a no-op. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
283
284 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
285 {
286         struct rt_cache_stat *st = v;
287
288         if (v == SEQ_START_TOKEN) {
289                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
290                 return 0;
291         }
292
293         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
294                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
295                    dst_entries_get_slow(&ipv4_dst_ops),
296                    0, /* st->in_hit */
297                    st->in_slow_tot,
298                    st->in_slow_mc,
299                    st->in_no_route,
300                    st->in_brd,
301                    st->in_martian_dst,
302                    st->in_martian_src,
303
304                    0, /* st->out_hit */
305                    st->out_slow_tot,
306                    st->out_slow_mc,
307
308                    0, /* st->gc_total */
309                    0, /* st->gc_ignored */
310                    0, /* st->gc_goal_miss */
311                    0, /* st->gc_dst_overflow */
312                    0, /* st->in_hlist_search */
313                    0  /* st->out_hlist_search */
314                 );
315         return 0;
316 }
317
318 static const struct seq_operations rt_cpu_seq_ops = {
319         .start  = rt_cpu_seq_start,
320         .next   = rt_cpu_seq_next,
321         .stop   = rt_cpu_seq_stop,
322         .show   = rt_cpu_seq_show,
323 };
324
325
326 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
327 {
328         return seq_open(file, &rt_cpu_seq_ops);
329 }
330
331 static const struct proc_ops rt_cpu_proc_ops = {
332         .proc_open      = rt_cpu_seq_open,
333         .proc_read      = seq_read,
334         .proc_lseek     = seq_lseek,
335         .proc_release   = seq_release,
336 };
337
338 #ifdef CONFIG_IP_ROUTE_CLASSID
339 static int rt_acct_proc_show(struct seq_file *m, void *v)
340 {
341         struct ip_rt_acct *dst, *src;
342         unsigned int i, j;
343
344         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
345         if (!dst)
346                 return -ENOMEM;
347
348         for_each_possible_cpu(i) {
349                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
350                 for (j = 0; j < 256; j++) {
351                         dst[j].o_bytes   += src[j].o_bytes;
352                         dst[j].o_packets += src[j].o_packets;
353                         dst[j].i_bytes   += src[j].i_bytes;
354                         dst[j].i_packets += src[j].i_packets;
355                 }
356         }
357
358         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
359         kfree(dst);
360         return 0;
361 }
362 #endif
363
364 static int __net_init ip_rt_do_proc_init(struct net *net)
365 {
366         struct proc_dir_entry *pde;
367
368         pde = proc_create("rt_cache", 0444, net->proc_net,
369                           &rt_cache_proc_ops);
370         if (!pde)
371                 goto err1;
372
373         pde = proc_create("rt_cache", 0444,
374                           net->proc_net_stat, &rt_cpu_proc_ops);
375         if (!pde)
376                 goto err2;
377
378 #ifdef CONFIG_IP_ROUTE_CLASSID
379         pde = proc_create_single("rt_acct", 0, net->proc_net,
380                         rt_acct_proc_show);
381         if (!pde)
382                 goto err3;
383 #endif
384         return 0;
385
386 #ifdef CONFIG_IP_ROUTE_CLASSID
387 err3:
388         remove_proc_entry("rt_cache", net->proc_net_stat);
389 #endif
390 err2:
391         remove_proc_entry("rt_cache", net->proc_net);
392 err1:
393         return -ENOMEM;
394 }
395
396 static void __net_exit ip_rt_do_proc_exit(struct net *net)
397 {
398         remove_proc_entry("rt_cache", net->proc_net_stat);
399         remove_proc_entry("rt_cache", net->proc_net);
400 #ifdef CONFIG_IP_ROUTE_CLASSID
401         remove_proc_entry("rt_acct", net->proc_net);
402 #endif
403 }
404
405 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
406         .init = ip_rt_do_proc_init,
407         .exit = ip_rt_do_proc_exit,
408 };
409
410 static int __init ip_rt_proc_init(void)
411 {
412         return register_pernet_subsys(&ip_rt_proc_ops);
413 }
414
415 #else
/* Without CONFIG_PROC_FS there is nothing to register: report success. */
static inline int ip_rt_proc_init(void)
{
        return 0;
}
420 #endif /* CONFIG_PROC_FS */
421
422 static inline bool rt_is_expired(const struct rtable *rth)
423 {
424         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
425 }
426
/* Bump @net's IPv4 route generation id.  Routes stamped with the previous id
 * then compare as expired in rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}
431
432 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
433                                            struct sk_buff *skb,
434                                            const void *daddr)
435 {
436         const struct rtable *rt = container_of(dst, struct rtable, dst);
437         struct net_device *dev = dst->dev;
438         struct neighbour *n;
439
440         rcu_read_lock_bh();
441
442         if (likely(rt->rt_gw_family == AF_INET)) {
443                 n = ip_neigh_gw4(dev, rt->rt_gw4);
444         } else if (rt->rt_gw_family == AF_INET6) {
445                 n = ip_neigh_gw6(dev, &rt->rt_gw6);
446         } else {
447                 __be32 pkey;
448
449                 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
450                 n = ip_neigh_gw4(dev, pkey);
451         }
452
453         if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
454                 n = NULL;
455
456         rcu_read_unlock_bh();
457
458         return n;
459 }
460
461 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
462 {
463         const struct rtable *rt = container_of(dst, struct rtable, dst);
464         struct net_device *dev = dst->dev;
465         const __be32 *pkey = daddr;
466
467         if (rt->rt_gw_family == AF_INET) {
468                 pkey = (const __be32 *)&rt->rt_gw4;
469         } else if (rt->rt_gw_family == AF_INET6) {
470                 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
471         } else if (!daddr ||
472                  (rt->rt_flags &
473                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
474                 return;
475         }
476         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
477 }
478
479 #define IP_IDENTS_SZ 2048u
480
481 static atomic_t *ip_idents __read_mostly;
482 static u32 *ip_tstamps __read_mostly;
483
484 /* In order to protect privacy, we add a perturbation to identifiers
485  * if one generator is seldom used. This makes hard for an attacker
486  * to infer how many packets were sent between two points in time.
487  */
488 u32 ip_idents_reserve(u32 hash, int segs)
489 {
490         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
491         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
492         u32 old = READ_ONCE(*p_tstamp);
493         u32 now = (u32)jiffies;
494         u32 delta = 0;
495
496         if (old != now && cmpxchg(p_tstamp, old, now) == old)
497                 delta = prandom_u32_max(now - old);
498
499         /* If UBSAN reports an error there, please make sure your compiler
500          * supports -fno-strict-overflow before reporting it that was a bug
501          * in UBSAN, and it has been fixed in GCC-8.
502          */
503         return atomic_add_return(segs + delta, p_id) - segs;
504 }
505 EXPORT_SYMBOL(ip_idents_reserve);
506
507 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
508 {
509         u32 hash, id;
510
511         /* Note the following code is not safe, but this is okay. */
512         if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
513                 get_random_bytes(&net->ipv4.ip_id_key,
514                                  sizeof(net->ipv4.ip_id_key));
515
516         hash = siphash_3u32((__force u32)iph->daddr,
517                             (__force u32)iph->saddr,
518                             iph->protocol,
519                             &net->ipv4.ip_id_key);
520         id = ip_idents_reserve(hash, segs);
521         iph->id = htons(id);
522 }
523 EXPORT_SYMBOL(__ip_select_ident);
524
525 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
526                              const struct sock *sk,
527                              const struct iphdr *iph,
528                              int oif, u8 tos,
529                              u8 prot, u32 mark, int flow_flags)
530 {
531         if (sk) {
532                 const struct inet_sock *inet = inet_sk(sk);
533
534                 oif = sk->sk_bound_dev_if;
535                 mark = sk->sk_mark;
536                 tos = RT_CONN_FLAGS(sk);
537                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
538         }
539         flowi4_init_output(fl4, oif, mark, tos,
540                            RT_SCOPE_UNIVERSE, prot,
541                            flow_flags,
542                            iph->daddr, iph->saddr, 0, 0,
543                            sock_net_uid(net, sk));
544 }
545
546 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
547                                const struct sock *sk)
548 {
549         const struct net *net = dev_net(skb->dev);
550         const struct iphdr *iph = ip_hdr(skb);
551         int oif = skb->dev->ifindex;
552         u8 tos = RT_TOS(iph->tos);
553         u8 prot = iph->protocol;
554         u32 mark = skb->mark;
555
556         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
557 }
558
559 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
560 {
561         const struct inet_sock *inet = inet_sk(sk);
562         const struct ip_options_rcu *inet_opt;
563         __be32 daddr = inet->inet_daddr;
564
565         rcu_read_lock();
566         inet_opt = rcu_dereference(inet->inet_opt);
567         if (inet_opt && inet_opt->opt.srr)
568                 daddr = inet_opt->opt.faddr;
569         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
570                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
571                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
572                            inet_sk_flowi_flags(sk),
573                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
574         rcu_read_unlock();
575 }
576
/* Build @fl4 from @skb when one is supplied, otherwise from @sk alone. */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (!skb) {
                build_sk_flow_key(fl4, sk);
                return;
        }

        build_skb_flow_key(fl4, skb, sk);
}
585
586 static DEFINE_SPINLOCK(fnhe_lock);
587
588 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
589 {
590         struct rtable *rt;
591
592         rt = rcu_dereference(fnhe->fnhe_rth_input);
593         if (rt) {
594                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
595                 dst_dev_put(&rt->dst);
596                 dst_release(&rt->dst);
597         }
598         rt = rcu_dereference(fnhe->fnhe_rth_output);
599         if (rt) {
600                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
601                 dst_dev_put(&rt->dst);
602                 dst_release(&rt->dst);
603         }
604 }
605
606 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
607 {
608         struct fib_nh_exception *fnhe, *oldest;
609
610         oldest = rcu_dereference(hash->chain);
611         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
612              fnhe = rcu_dereference(fnhe->fnhe_next)) {
613                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
614                         oldest = fnhe;
615         }
616         fnhe_flush_routes(oldest);
617         return oldest;
618 }
619
620 static inline u32 fnhe_hashfun(__be32 daddr)
621 {
622         static u32 fnhe_hashrnd __read_mostly;
623         u32 hval;
624
625         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
626         hval = jhash_1word((__force u32)daddr, fnhe_hashrnd);
627         return hash_32(hval, FNHE_HASH_SHIFT);
628 }
629
630 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
631 {
632         rt->rt_pmtu = fnhe->fnhe_pmtu;
633         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
634         rt->dst.expires = fnhe->fnhe_expires;
635
636         if (fnhe->fnhe_gw) {
637                 rt->rt_flags |= RTCF_REDIRECTED;
638                 rt->rt_uses_gateway = 1;
639                 rt->rt_gw_family = AF_INET;
640                 rt->rt_gw4 = fnhe->fnhe_gw;
641         }
642 }
643
644 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
645                                   __be32 gw, u32 pmtu, bool lock,
646                                   unsigned long expires)
647 {
648         struct fnhe_hash_bucket *hash;
649         struct fib_nh_exception *fnhe;
650         struct rtable *rt;
651         u32 genid, hval;
652         unsigned int i;
653         int depth;
654
655         genid = fnhe_genid(dev_net(nhc->nhc_dev));
656         hval = fnhe_hashfun(daddr);
657
658         spin_lock_bh(&fnhe_lock);
659
660         hash = rcu_dereference(nhc->nhc_exceptions);
661         if (!hash) {
662                 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
663                 if (!hash)
664                         goto out_unlock;
665                 rcu_assign_pointer(nhc->nhc_exceptions, hash);
666         }
667
668         hash += hval;
669
670         depth = 0;
671         for (fnhe = rcu_dereference(hash->chain); fnhe;
672              fnhe = rcu_dereference(fnhe->fnhe_next)) {
673                 if (fnhe->fnhe_daddr == daddr)
674                         break;
675                 depth++;
676         }
677
678         if (fnhe) {
679                 if (fnhe->fnhe_genid != genid)
680                         fnhe->fnhe_genid = genid;
681                 if (gw)
682                         fnhe->fnhe_gw = gw;
683                 if (pmtu) {
684                         fnhe->fnhe_pmtu = pmtu;
685                         fnhe->fnhe_mtu_locked = lock;
686                 }
687                 fnhe->fnhe_expires = max(1UL, expires);
688                 /* Update all cached dsts too */
689                 rt = rcu_dereference(fnhe->fnhe_rth_input);
690                 if (rt)
691                         fill_route_from_fnhe(rt, fnhe);
692                 rt = rcu_dereference(fnhe->fnhe_rth_output);
693                 if (rt)
694                         fill_route_from_fnhe(rt, fnhe);
695         } else {
696                 if (depth > FNHE_RECLAIM_DEPTH)
697                         fnhe = fnhe_oldest(hash);
698                 else {
699                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
700                         if (!fnhe)
701                                 goto out_unlock;
702
703                         fnhe->fnhe_next = hash->chain;
704                         rcu_assign_pointer(hash->chain, fnhe);
705                 }
706                 fnhe->fnhe_genid = genid;
707                 fnhe->fnhe_daddr = daddr;
708                 fnhe->fnhe_gw = gw;
709                 fnhe->fnhe_pmtu = pmtu;
710                 fnhe->fnhe_mtu_locked = lock;
711                 fnhe->fnhe_expires = max(1UL, expires);
712
713                 /* Exception created; mark the cached routes for the nexthop
714                  * stale, so anyone caching it rechecks if this exception
715                  * applies to them.
716                  */
717                 rt = rcu_dereference(nhc->nhc_rth_input);
718                 if (rt)
719                         rt->dst.obsolete = DST_OBSOLETE_KILL;
720
721                 for_each_possible_cpu(i) {
722                         struct rtable __rcu **prt;
723                         prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
724                         rt = rcu_dereference(*prt);
725                         if (rt)
726                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
727                 }
728         }
729
730         fnhe->fnhe_stamp = jiffies;
731
732 out_unlock:
733         spin_unlock_bh(&fnhe_lock);
734 }
735
736 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
737                              bool kill_route)
738 {
739         __be32 new_gw = icmp_hdr(skb)->un.gateway;
740         __be32 old_gw = ip_hdr(skb)->saddr;
741         struct net_device *dev = skb->dev;
742         struct in_device *in_dev;
743         struct fib_result res;
744         struct neighbour *n;
745         struct net *net;
746
747         switch (icmp_hdr(skb)->code & 7) {
748         case ICMP_REDIR_NET:
749         case ICMP_REDIR_NETTOS:
750         case ICMP_REDIR_HOST:
751         case ICMP_REDIR_HOSTTOS:
752                 break;
753
754         default:
755                 return;
756         }
757
758         if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
759                 return;
760
761         in_dev = __in_dev_get_rcu(dev);
762         if (!in_dev)
763                 return;
764
765         net = dev_net(dev);
766         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
767             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
768             ipv4_is_zeronet(new_gw))
769                 goto reject_redirect;
770
771         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
772                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
773                         goto reject_redirect;
774                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
775                         goto reject_redirect;
776         } else {
777                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
778                         goto reject_redirect;
779         }
780
781         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
782         if (!n)
783                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
784         if (!IS_ERR(n)) {
785                 if (!(n->nud_state & NUD_VALID)) {
786                         neigh_event_send(n, NULL);
787                 } else {
788                         if (fib_lookup(net, fl4, &res, 0) == 0) {
789                                 struct fib_nh_common *nhc;
790
791                                 fib_select_path(net, &res, fl4, skb);
792                                 nhc = FIB_RES_NHC(res);
793                                 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
794                                                 0, false,
795                                                 jiffies + ip_rt_gc_timeout);
796                         }
797                         if (kill_route)
798                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
799                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
800                 }
801                 neigh_release(n);
802         }
803         return;
804
805 reject_redirect:
806 #ifdef CONFIG_IP_ROUTE_VERBOSE
807         if (IN_DEV_LOG_MARTIANS(in_dev)) {
808                 const struct iphdr *iph = (const struct iphdr *) skb->data;
809                 __be32 daddr = iph->daddr;
810                 __be32 saddr = iph->saddr;
811
812                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
813                                      "  Advised path = %pI4 -> %pI4\n",
814                                      &old_gw, dev->name, &new_gw,
815                                      &saddr, &daddr);
816         }
817 #endif
818         ;
819 }
820
821 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
822 {
823         struct rtable *rt;
824         struct flowi4 fl4;
825         const struct iphdr *iph = (const struct iphdr *) skb->data;
826         struct net *net = dev_net(skb->dev);
827         int oif = skb->dev->ifindex;
828         u8 tos = RT_TOS(iph->tos);
829         u8 prot = iph->protocol;
830         u32 mark = skb->mark;
831
832         rt = (struct rtable *) dst;
833
834         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
835         __ip_do_redirect(rt, skb, &fl4, true);
836 }
837
838 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
839 {
840         struct rtable *rt = (struct rtable *)dst;
841         struct dst_entry *ret = dst;
842
843         if (rt) {
844                 if (dst->obsolete > 0) {
845                         ip_rt_put(rt);
846                         ret = NULL;
847                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
848                            rt->dst.expires) {
849                         ip_rt_put(rt);
850                         ret = NULL;
851                 }
852         }
853         return ret;
854 }
855
856 /*
857  * Algorithm:
858  *      1. The first ip_rt_redirect_number redirects are sent
859  *         with exponential backoff, then we stop sending them at all,
860  *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start sending redirects again.
864  *
865  * This algorithm is much cheaper and more intelligent than dumb load limiting
866  * in icmp.c.
867  *
868  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
869  * and "frag. need" (breaks PMTU discovery) in icmp.c.
870  */
871
872 void ip_rt_send_redirect(struct sk_buff *skb)
873 {
874         struct rtable *rt = skb_rtable(skb);
875         struct in_device *in_dev;
876         struct inet_peer *peer;
877         struct net *net;
878         int log_martians;
879         int vif;
880
881         rcu_read_lock();
882         in_dev = __in_dev_get_rcu(rt->dst.dev);
883         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
884                 rcu_read_unlock();
885                 return;
886         }
887         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
888         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
889         rcu_read_unlock();
890
891         net = dev_net(rt->dst.dev);
892         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
893         if (!peer) {
894                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
895                           rt_nexthop(rt, ip_hdr(skb)->daddr));
896                 return;
897         }
898
899         /* No redirected packets during ip_rt_redirect_silence;
900          * reset the algorithm.
901          */
902         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
903                 peer->rate_tokens = 0;
904                 peer->n_redirects = 0;
905         }
906
907         /* Too many ignored redirects; do not send anything
908          * set dst.rate_last to the last seen redirected packet.
909          */
910         if (peer->n_redirects >= ip_rt_redirect_number) {
911                 peer->rate_last = jiffies;
912                 goto out_put_peer;
913         }
914
915         /* Check for load limit; set rate_last to the latest sent
916          * redirect.
917          */
918         if (peer->n_redirects == 0 ||
919             time_after(jiffies,
920                        (peer->rate_last +
921                         (ip_rt_redirect_load << peer->n_redirects)))) {
922                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
923
924                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
925                 peer->rate_last = jiffies;
926                 ++peer->n_redirects;
927 #ifdef CONFIG_IP_ROUTE_VERBOSE
928                 if (log_martians &&
929                     peer->n_redirects == ip_rt_redirect_number)
930                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
931                                              &ip_hdr(skb)->saddr, inet_iif(skb),
932                                              &ip_hdr(skb)->daddr, &gw);
933 #endif
934         }
935 out_put_peer:
936         inet_putpeer(peer);
937 }
938
939 static int ip_error(struct sk_buff *skb)
940 {
941         struct rtable *rt = skb_rtable(skb);
942         struct net_device *dev = skb->dev;
943         struct in_device *in_dev;
944         struct inet_peer *peer;
945         unsigned long now;
946         struct net *net;
947         bool send;
948         int code;
949
950         if (netif_is_l3_master(skb->dev)) {
951                 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
952                 if (!dev)
953                         goto out;
954         }
955
956         in_dev = __in_dev_get_rcu(dev);
957
958         /* IP on this device is disabled. */
959         if (!in_dev)
960                 goto out;
961
962         net = dev_net(rt->dst.dev);
963         if (!IN_DEV_FORWARD(in_dev)) {
964                 switch (rt->dst.error) {
965                 case EHOSTUNREACH:
966                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
967                         break;
968
969                 case ENETUNREACH:
970                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
971                         break;
972                 }
973                 goto out;
974         }
975
976         switch (rt->dst.error) {
977         case EINVAL:
978         default:
979                 goto out;
980         case EHOSTUNREACH:
981                 code = ICMP_HOST_UNREACH;
982                 break;
983         case ENETUNREACH:
984                 code = ICMP_NET_UNREACH;
985                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
986                 break;
987         case EACCES:
988                 code = ICMP_PKT_FILTERED;
989                 break;
990         }
991
992         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
993                                l3mdev_master_ifindex(skb->dev), 1);
994
995         send = true;
996         if (peer) {
997                 now = jiffies;
998                 peer->rate_tokens += now - peer->rate_last;
999                 if (peer->rate_tokens > ip_rt_error_burst)
1000                         peer->rate_tokens = ip_rt_error_burst;
1001                 peer->rate_last = now;
1002                 if (peer->rate_tokens >= ip_rt_error_cost)
1003                         peer->rate_tokens -= ip_rt_error_cost;
1004                 else
1005                         send = false;
1006                 inet_putpeer(peer);
1007         }
1008         if (send)
1009                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1010
1011 out:    kfree_skb(skb);
1012         return 0;
1013 }
1014
1015 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1016 {
1017         struct dst_entry *dst = &rt->dst;
1018         struct net *net = dev_net(dst->dev);
1019         struct fib_result res;
1020         bool lock = false;
1021         u32 old_mtu;
1022
1023         if (ip_mtu_locked(dst))
1024                 return;
1025
1026         old_mtu = ipv4_mtu(dst);
1027         if (old_mtu < mtu)
1028                 return;
1029
1030         if (mtu < ip_rt_min_pmtu) {
1031                 lock = true;
1032                 mtu = min(old_mtu, ip_rt_min_pmtu);
1033         }
1034
1035         if (rt->rt_pmtu == mtu && !lock &&
1036             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1037                 return;
1038
1039         rcu_read_lock();
1040         if (fib_lookup(net, fl4, &res, 0) == 0) {
1041                 struct fib_nh_common *nhc;
1042
1043                 fib_select_path(net, &res, fl4, NULL);
1044                 nhc = FIB_RES_NHC(res);
1045                 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1046                                       jiffies + ip_rt_mtu_expires);
1047         }
1048         rcu_read_unlock();
1049 }
1050
1051 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1052                               struct sk_buff *skb, u32 mtu,
1053                               bool confirm_neigh)
1054 {
1055         struct rtable *rt = (struct rtable *) dst;
1056         struct flowi4 fl4;
1057
1058         ip_rt_build_flow_key(&fl4, sk, skb);
1059
1060         /* Don't make lookup fail for bridged encapsulations */
1061         if (skb && netif_is_any_bridge_port(skb->dev))
1062                 fl4.flowi4_oif = 0;
1063
1064         __ip_rt_update_pmtu(rt, &fl4, mtu);
1065 }
1066
1067 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1068                       int oif, u8 protocol)
1069 {
1070         const struct iphdr *iph = (const struct iphdr *)skb->data;
1071         struct flowi4 fl4;
1072         struct rtable *rt;
1073         u32 mark = IP4_REPLY_MARK(net, skb->mark);
1074
1075         __build_flow_key(net, &fl4, NULL, iph, oif,
1076                          RT_TOS(iph->tos), protocol, mark, 0);
1077         rt = __ip_route_output_key(net, &fl4);
1078         if (!IS_ERR(rt)) {
1079                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1080                 ip_rt_put(rt);
1081         }
1082 }
1083 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1084
1085 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1086 {
1087         const struct iphdr *iph = (const struct iphdr *)skb->data;
1088         struct flowi4 fl4;
1089         struct rtable *rt;
1090
1091         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1092
1093         if (!fl4.flowi4_mark)
1094                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1095
1096         rt = __ip_route_output_key(sock_net(sk), &fl4);
1097         if (!IS_ERR(rt)) {
1098                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1099                 ip_rt_put(rt);
1100         }
1101 }
1102
1103 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1104 {
1105         const struct iphdr *iph = (const struct iphdr *)skb->data;
1106         struct flowi4 fl4;
1107         struct rtable *rt;
1108         struct dst_entry *odst = NULL;
1109         bool new = false;
1110         struct net *net = sock_net(sk);
1111
1112         bh_lock_sock(sk);
1113
1114         if (!ip_sk_accept_pmtu(sk))
1115                 goto out;
1116
1117         odst = sk_dst_get(sk);
1118
1119         if (sock_owned_by_user(sk) || !odst) {
1120                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1121                 goto out;
1122         }
1123
1124         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1125
1126         rt = (struct rtable *)odst;
1127         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1128                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1129                 if (IS_ERR(rt))
1130                         goto out;
1131
1132                 new = true;
1133         }
1134
1135         __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1136
1137         if (!dst_check(&rt->dst, 0)) {
1138                 if (new)
1139                         dst_release(&rt->dst);
1140
1141                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1142                 if (IS_ERR(rt))
1143                         goto out;
1144
1145                 new = true;
1146         }
1147
1148         if (new)
1149                 sk_dst_set(sk, &rt->dst);
1150
1151 out:
1152         bh_unlock_sock(sk);
1153         dst_release(odst);
1154 }
1155 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1156
1157 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1158                    int oif, u8 protocol)
1159 {
1160         const struct iphdr *iph = (const struct iphdr *)skb->data;
1161         struct flowi4 fl4;
1162         struct rtable *rt;
1163
1164         __build_flow_key(net, &fl4, NULL, iph, oif,
1165                          RT_TOS(iph->tos), protocol, 0, 0);
1166         rt = __ip_route_output_key(net, &fl4);
1167         if (!IS_ERR(rt)) {
1168                 __ip_do_redirect(rt, skb, &fl4, false);
1169                 ip_rt_put(rt);
1170         }
1171 }
1172 EXPORT_SYMBOL_GPL(ipv4_redirect);
1173
1174 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1175 {
1176         const struct iphdr *iph = (const struct iphdr *)skb->data;
1177         struct flowi4 fl4;
1178         struct rtable *rt;
1179         struct net *net = sock_net(sk);
1180
1181         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1182         rt = __ip_route_output_key(net, &fl4);
1183         if (!IS_ERR(rt)) {
1184                 __ip_do_redirect(rt, skb, &fl4, false);
1185                 ip_rt_put(rt);
1186         }
1187 }
1188 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1189
1190 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1191 {
1192         struct rtable *rt = (struct rtable *) dst;
1193
1194         /* All IPV4 dsts are created with ->obsolete set to the value
1195          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1196          * into this function always.
1197          *
1198          * When a PMTU/redirect information update invalidates a route,
1199          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1200          * DST_OBSOLETE_DEAD.
1201          */
1202         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1203                 return NULL;
1204         return dst;
1205 }
1206
1207 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1208 {
1209         struct ip_options opt;
1210         int res;
1211
1212         /* Recompile ip options since IPCB may not be valid anymore.
1213          * Also check we have a reasonable ipv4 header.
1214          */
1215         if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1216             ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1217                 return;
1218
1219         memset(&opt, 0, sizeof(opt));
1220         if (ip_hdr(skb)->ihl > 5) {
1221                 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1222                         return;
1223                 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1224
1225                 rcu_read_lock();
1226                 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1227                 rcu_read_unlock();
1228
1229                 if (res)
1230                         return;
1231         }
1232         __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1233 }
1234
1235 static void ipv4_link_failure(struct sk_buff *skb)
1236 {
1237         struct rtable *rt;
1238
1239         ipv4_send_dest_unreach(skb);
1240
1241         rt = skb_rtable(skb);
1242         if (rt)
1243                 dst_set_expires(&rt->dst, 0);
1244 }
1245
1246 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1247 {
1248         pr_debug("%s: %pI4 -> %pI4, %s\n",
1249                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1250                  skb->dev ? skb->dev->name : "?");
1251         kfree_skb(skb);
1252         WARN_ON(1);
1253         return 0;
1254 }
1255
1256 /*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.
1260
1261    BTW remember: "addr" is allowed to be not aligned
1262    in IP options!
1263  */
1264
1265 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1266 {
1267         __be32 src;
1268
1269         if (rt_is_output_route(rt))
1270                 src = ip_hdr(skb)->saddr;
1271         else {
1272                 struct fib_result res;
1273                 struct iphdr *iph = ip_hdr(skb);
1274                 struct flowi4 fl4 = {
1275                         .daddr = iph->daddr,
1276                         .saddr = iph->saddr,
1277                         .flowi4_tos = RT_TOS(iph->tos),
1278                         .flowi4_oif = rt->dst.dev->ifindex,
1279                         .flowi4_iif = skb->dev->ifindex,
1280                         .flowi4_mark = skb->mark,
1281                 };
1282
1283                 rcu_read_lock();
1284                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1285                         src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1286                 else
1287                         src = inet_select_addr(rt->dst.dev,
1288                                                rt_nexthop(rt, iph->daddr),
1289                                                RT_SCOPE_UNIVERSE);
1290                 rcu_read_unlock();
1291         }
1292         memcpy(addr, &src, 4);
1293 }
1294
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into rt->dst.tclassid one 16-bit half at a time: a half of
 * the class id is taken from @tag only if that half is currently zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->dst.tclassid & low_half) == 0)
		rt->dst.tclassid |= tag & low_half;

	if ((rt->dst.tclassid & high_half) == 0)
		rt->dst.tclassid |= tag & high_half;
}
#endif
1304
1305 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1306 {
1307         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1308         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1309                                     ip_rt_min_advmss);
1310
1311         return min(advmss, IPV4_MAX_PMTU - header_size);
1312 }
1313
1314 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1315 {
1316         const struct rtable *rt = (const struct rtable *)dst;
1317         unsigned int mtu = rt->rt_pmtu;
1318
1319         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1320                 mtu = dst_metric_raw(dst, RTAX_MTU);
1321
1322         if (mtu)
1323                 return mtu;
1324
1325         mtu = READ_ONCE(dst->dev->mtu);
1326
1327         if (unlikely(ip_mtu_locked(dst))) {
1328                 if (rt->rt_uses_gateway && mtu > 576)
1329                         mtu = 576;
1330         }
1331
1332         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1333
1334         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1335 }
1336
1337 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1338 {
1339         struct fnhe_hash_bucket *hash;
1340         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1341         u32 hval = fnhe_hashfun(daddr);
1342
1343         spin_lock_bh(&fnhe_lock);
1344
1345         hash = rcu_dereference_protected(nhc->nhc_exceptions,
1346                                          lockdep_is_held(&fnhe_lock));
1347         hash += hval;
1348
1349         fnhe_p = &hash->chain;
1350         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1351         while (fnhe) {
1352                 if (fnhe->fnhe_daddr == daddr) {
1353                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1354                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1355                         /* set fnhe_daddr to 0 to ensure it won't bind with
1356                          * new dsts in rt_bind_exception().
1357                          */
1358                         fnhe->fnhe_daddr = 0;
1359                         fnhe_flush_routes(fnhe);
1360                         kfree_rcu(fnhe, rcu);
1361                         break;
1362                 }
1363                 fnhe_p = &fnhe->fnhe_next;
1364                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1365                                                  lockdep_is_held(&fnhe_lock));
1366         }
1367
1368         spin_unlock_bh(&fnhe_lock);
1369 }
1370
1371 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1372                                                __be32 daddr)
1373 {
1374         struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1375         struct fib_nh_exception *fnhe;
1376         u32 hval;
1377
1378         if (!hash)
1379                 return NULL;
1380
1381         hval = fnhe_hashfun(daddr);
1382
1383         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1384              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1385                 if (fnhe->fnhe_daddr == daddr) {
1386                         if (fnhe->fnhe_expires &&
1387                             time_after(jiffies, fnhe->fnhe_expires)) {
1388                                 ip_del_fnhe(nhc, daddr);
1389                                 break;
1390                         }
1391                         return fnhe;
1392                 }
1393         }
1394         return NULL;
1395 }
1396
1397 /* MTU selection:
1398  * 1. mtu on route is locked - use it
1399  * 2. mtu from nexthop exception
1400  * 3. mtu from egress device
1401  */
1402
1403 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1404 {
1405         struct fib_nh_common *nhc = res->nhc;
1406         struct net_device *dev = nhc->nhc_dev;
1407         struct fib_info *fi = res->fi;
1408         u32 mtu = 0;
1409
1410         if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1411             fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1412                 mtu = fi->fib_mtu;
1413
1414         if (likely(!mtu)) {
1415                 struct fib_nh_exception *fnhe;
1416
1417                 fnhe = find_exception(nhc, daddr);
1418                 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1419                         mtu = fnhe->fnhe_pmtu;
1420         }
1421
1422         if (likely(!mtu))
1423                 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1424
1425         return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1426 }
1427
1428 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1429                               __be32 daddr, const bool do_cache)
1430 {
1431         bool ret = false;
1432
1433         spin_lock_bh(&fnhe_lock);
1434
1435         if (daddr == fnhe->fnhe_daddr) {
1436                 struct rtable __rcu **porig;
1437                 struct rtable *orig;
1438                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1439
1440                 if (rt_is_input_route(rt))
1441                         porig = &fnhe->fnhe_rth_input;
1442                 else
1443                         porig = &fnhe->fnhe_rth_output;
1444                 orig = rcu_dereference(*porig);
1445
1446                 if (fnhe->fnhe_genid != genid) {
1447                         fnhe->fnhe_genid = genid;
1448                         fnhe->fnhe_gw = 0;
1449                         fnhe->fnhe_pmtu = 0;
1450                         fnhe->fnhe_expires = 0;
1451                         fnhe->fnhe_mtu_locked = false;
1452                         fnhe_flush_routes(fnhe);
1453                         orig = NULL;
1454                 }
1455                 fill_route_from_fnhe(rt, fnhe);
1456                 if (!rt->rt_gw4) {
1457                         rt->rt_gw4 = daddr;
1458                         rt->rt_gw_family = AF_INET;
1459                 }
1460
1461                 if (do_cache) {
1462                         dst_hold(&rt->dst);
1463                         rcu_assign_pointer(*porig, rt);
1464                         if (orig) {
1465                                 dst_dev_put(&orig->dst);
1466                                 dst_release(&orig->dst);
1467                         }
1468                         ret = true;
1469                 }
1470
1471                 fnhe->fnhe_stamp = jiffies;
1472         }
1473         spin_unlock_bh(&fnhe_lock);
1474
1475         return ret;
1476 }
1477
1478 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1479 {
1480         struct rtable *orig, *prev, **p;
1481         bool ret = true;
1482
1483         if (rt_is_input_route(rt)) {
1484                 p = (struct rtable **)&nhc->nhc_rth_input;
1485         } else {
1486                 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1487         }
1488         orig = *p;
1489
1490         /* hold dst before doing cmpxchg() to avoid race condition
1491          * on this dst
1492          */
1493         dst_hold(&rt->dst);
1494         prev = cmpxchg(p, orig, rt);
1495         if (prev == orig) {
1496                 if (orig) {
1497                         rt_add_uncached_list(orig);
1498                         dst_release(&orig->dst);
1499                 }
1500         } else {
1501                 dst_release(&rt->dst);
1502                 ret = false;
1503         }
1504
1505         return ret;
1506 }
1507
/*
 * List of rtables that are not held in a nexthop or exception cache slot.
 * rt_add_uncached_list() appends to the current CPU's list and
 * rt_flush_dev() walks the lists of all possible CPUs.
 */
struct uncached_list {
	spinlock_t		lock;	/* protects @head; taken with _bh */
	struct list_head	head;	/* linked through rtable::rt_uncached */
};

/* One uncached list per CPU. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1514
1515 void rt_add_uncached_list(struct rtable *rt)
1516 {
1517         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1518
1519         rt->rt_uncached_list = ul;
1520
1521         spin_lock_bh(&ul->lock);
1522         list_add_tail(&rt->rt_uncached, &ul->head);
1523         spin_unlock_bh(&ul->lock);
1524 }
1525
1526 void rt_del_uncached_list(struct rtable *rt)
1527 {
1528         if (!list_empty(&rt->rt_uncached)) {
1529                 struct uncached_list *ul = rt->rt_uncached_list;
1530
1531                 spin_lock_bh(&ul->lock);
1532                 list_del(&rt->rt_uncached);
1533                 spin_unlock_bh(&ul->lock);
1534         }
1535 }
1536
/*
 * Destructor for IPv4 dsts: drop the metrics reference and take the route
 * off the uncached list.
 */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	ip_dst_metrics_put(dst);
	rt_del_uncached_list((struct rtable *)dst);
}
1544
1545 void rt_flush_dev(struct net_device *dev)
1546 {
1547         struct rtable *rt;
1548         int cpu;
1549
1550         for_each_possible_cpu(cpu) {
1551                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1552
1553                 spin_lock_bh(&ul->lock);
1554                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1555                         if (rt->dst.dev != dev)
1556                                 continue;
1557                         rt->dst.dev = blackhole_netdev;
1558                         dev_hold(rt->dst.dev);
1559                         dev_put(dev);
1560                 }
1561                 spin_unlock_bh(&ul->lock);
1562         }
1563 }
1564
1565 static bool rt_cache_valid(const struct rtable *rt)
1566 {
1567         return  rt &&
1568                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1569                 !rt_is_expired(rt);
1570 }
1571
1572 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1573                            const struct fib_result *res,
1574                            struct fib_nh_exception *fnhe,
1575                            struct fib_info *fi, u16 type, u32 itag,
1576                            const bool do_cache)
1577 {
1578         bool cached = false;
1579
1580         if (fi) {
1581                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1582
1583                 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1584                         rt->rt_uses_gateway = 1;
1585                         rt->rt_gw_family = nhc->nhc_gw_family;
1586                         /* only INET and INET6 are supported */
1587                         if (likely(nhc->nhc_gw_family == AF_INET))
1588                                 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1589                         else
1590                                 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1591                 }
1592
1593                 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1594
1595 #ifdef CONFIG_IP_ROUTE_CLASSID
1596                 if (nhc->nhc_family == AF_INET) {
1597                         struct fib_nh *nh;
1598
1599                         nh = container_of(nhc, struct fib_nh, nh_common);
1600                         rt->dst.tclassid = nh->nh_tclassid;
1601                 }
1602 #endif
1603                 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1604                 if (unlikely(fnhe))
1605                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1606                 else if (do_cache)
1607                         cached = rt_cache_route(nhc, rt);
1608                 if (unlikely(!cached)) {
1609                         /* Routes we intend to cache in nexthop exception or
1610                          * FIB nexthop have the DST_NOCACHE bit clear.
1611                          * However, if we are unsuccessful at storing this
1612                          * route into the cache we really need to set it.
1613                          */
1614                         if (!rt->rt_gw4) {
1615                                 rt->rt_gw_family = AF_INET;
1616                                 rt->rt_gw4 = daddr;
1617                         }
1618                         rt_add_uncached_list(rt);
1619                 }
1620         } else
1621                 rt_add_uncached_list(rt);
1622
1623 #ifdef CONFIG_IP_ROUTE_CLASSID
1624 #ifdef CONFIG_IP_MULTIPLE_TABLES
1625         set_class_tag(rt, res->tclassid);
1626 #endif
1627         set_class_tag(rt, itag);
1628 #endif
1629 }
1630
1631 struct rtable *rt_dst_alloc(struct net_device *dev,
1632                             unsigned int flags, u16 type,
1633                             bool nopolicy, bool noxfrm)
1634 {
1635         struct rtable *rt;
1636
1637         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1638                        (nopolicy ? DST_NOPOLICY : 0) |
1639                        (noxfrm ? DST_NOXFRM : 0));
1640
1641         if (rt) {
1642                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1643                 rt->rt_flags = flags;
1644                 rt->rt_type = type;
1645                 rt->rt_is_input = 0;
1646                 rt->rt_iif = 0;
1647                 rt->rt_pmtu = 0;
1648                 rt->rt_mtu_locked = 0;
1649                 rt->rt_uses_gateway = 0;
1650                 rt->rt_gw_family = 0;
1651                 rt->rt_gw4 = 0;
1652                 INIT_LIST_HEAD(&rt->rt_uncached);
1653
1654                 rt->dst.output = ip_output;
1655                 if (flags & RTCF_LOCAL)
1656                         rt->dst.input = ip_local_deliver;
1657         }
1658
1659         return rt;
1660 }
1661 EXPORT_SYMBOL(rt_dst_alloc);
1662
1663 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1664 {
1665         struct rtable *new_rt;
1666
1667         new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1668                            rt->dst.flags);
1669
1670         if (new_rt) {
1671                 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1672                 new_rt->rt_flags = rt->rt_flags;
1673                 new_rt->rt_type = rt->rt_type;
1674                 new_rt->rt_is_input = rt->rt_is_input;
1675                 new_rt->rt_iif = rt->rt_iif;
1676                 new_rt->rt_pmtu = rt->rt_pmtu;
1677                 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1678                 new_rt->rt_gw_family = rt->rt_gw_family;
1679                 if (rt->rt_gw_family == AF_INET)
1680                         new_rt->rt_gw4 = rt->rt_gw4;
1681                 else if (rt->rt_gw_family == AF_INET6)
1682                         new_rt->rt_gw6 = rt->rt_gw6;
1683                 INIT_LIST_HEAD(&new_rt->rt_uncached);
1684
1685                 new_rt->dst.input = rt->dst.input;
1686                 new_rt->dst.output = rt->dst.output;
1687                 new_rt->dst.error = rt->dst.error;
1688                 new_rt->dst.lastuse = jiffies;
1689                 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1690         }
1691         return new_rt;
1692 }
1693 EXPORT_SYMBOL(rt_dst_clone);
1694
1695 /* called in rcu_read_lock() section */
1696 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1697                           u8 tos, struct net_device *dev,
1698                           struct in_device *in_dev, u32 *itag)
1699 {
1700         int err;
1701
1702         /* Primary sanity checks. */
1703         if (!in_dev)
1704                 return -EINVAL;
1705
1706         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1707             skb->protocol != htons(ETH_P_IP))
1708                 return -EINVAL;
1709
1710         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1711                 return -EINVAL;
1712
1713         if (ipv4_is_zeronet(saddr)) {
1714                 if (!ipv4_is_local_multicast(daddr) &&
1715                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1716                         return -EINVAL;
1717         } else {
1718                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1719                                           in_dev, itag);
1720                 if (err < 0)
1721                         return err;
1722         }
1723         return 0;
1724 }
1725
1726 /* called in rcu_read_lock() section */
1727 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1728                              u8 tos, struct net_device *dev, int our)
1729 {
1730         struct in_device *in_dev = __in_dev_get_rcu(dev);
1731         unsigned int flags = RTCF_MULTICAST;
1732         struct rtable *rth;
1733         u32 itag = 0;
1734         int err;
1735
1736         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1737         if (err)
1738                 return err;
1739
1740         if (our)
1741                 flags |= RTCF_LOCAL;
1742
1743         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1744                            IN_DEV_ORCONF(in_dev, NOPOLICY), false);
1745         if (!rth)
1746                 return -ENOBUFS;
1747
1748 #ifdef CONFIG_IP_ROUTE_CLASSID
1749         rth->dst.tclassid = itag;
1750 #endif
1751         rth->dst.output = ip_rt_bug;
1752         rth->rt_is_input= 1;
1753
1754 #ifdef CONFIG_IP_MROUTE
1755         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1756                 rth->dst.input = ip_mr_input;
1757 #endif
1758         RT_CACHE_STAT_INC(in_slow_mc);
1759
1760         skb_dst_set(skb, &rth->dst);
1761         return 0;
1762 }
1763
1764
1765 static void ip_handle_martian_source(struct net_device *dev,
1766                                      struct in_device *in_dev,
1767                                      struct sk_buff *skb,
1768                                      __be32 daddr,
1769                                      __be32 saddr)
1770 {
1771         RT_CACHE_STAT_INC(in_martian_src);
1772 #ifdef CONFIG_IP_ROUTE_VERBOSE
1773         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1774                 /*
1775                  *      RFC1812 recommendation, if source is martian,
1776                  *      the only hint is MAC header.
1777                  */
1778                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1779                         &daddr, &saddr, dev->name);
1780                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1781                         print_hex_dump(KERN_WARNING, "ll header: ",
1782                                        DUMP_PREFIX_OFFSET, 16, 1,
1783                                        skb_mac_header(skb),
1784                                        dev->hard_header_len, false);
1785                 }
1786         }
1787 #endif
1788 }
1789
1790 /* called in rcu_read_lock() section */
1791 static int __mkroute_input(struct sk_buff *skb,
1792                            const struct fib_result *res,
1793                            struct in_device *in_dev,
1794                            __be32 daddr, __be32 saddr, u32 tos)
1795 {
1796         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1797         struct net_device *dev = nhc->nhc_dev;
1798         struct fib_nh_exception *fnhe;
1799         struct rtable *rth;
1800         int err;
1801         struct in_device *out_dev;
1802         bool do_cache;
1803         u32 itag = 0;
1804
1805         /* get a working reference to the output device */
1806         out_dev = __in_dev_get_rcu(dev);
1807         if (!out_dev) {
1808                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1809                 return -EINVAL;
1810         }
1811
1812         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1813                                   in_dev->dev, in_dev, &itag);
1814         if (err < 0) {
1815                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1816                                          saddr);
1817
1818                 goto cleanup;
1819         }
1820
1821         do_cache = res->fi && !itag;
1822         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1823             skb->protocol == htons(ETH_P_IP)) {
1824                 __be32 gw;
1825
1826                 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1827                 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1828                     inet_addr_onlink(out_dev, saddr, gw))
1829                         IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1830         }
1831
1832         if (skb->protocol != htons(ETH_P_IP)) {
1833                 /* Not IP (i.e. ARP). Do not create route, if it is
1834                  * invalid for proxy arp. DNAT routes are always valid.
1835                  *
1836                  * Proxy arp feature have been extended to allow, ARP
1837                  * replies back to the same interface, to support
1838                  * Private VLAN switch technologies. See arp.c.
1839                  */
1840                 if (out_dev == in_dev &&
1841                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1842                         err = -EINVAL;
1843                         goto cleanup;
1844                 }
1845         }
1846
1847         fnhe = find_exception(nhc, daddr);
1848         if (do_cache) {
1849                 if (fnhe)
1850                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1851                 else
1852                         rth = rcu_dereference(nhc->nhc_rth_input);
1853                 if (rt_cache_valid(rth)) {
1854                         skb_dst_set_noref(skb, &rth->dst);
1855                         goto out;
1856                 }
1857         }
1858
1859         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1860                            IN_DEV_ORCONF(in_dev, NOPOLICY),
1861                            IN_DEV_ORCONF(out_dev, NOXFRM));
1862         if (!rth) {
1863                 err = -ENOBUFS;
1864                 goto cleanup;
1865         }
1866
1867         rth->rt_is_input = 1;
1868         RT_CACHE_STAT_INC(in_slow_tot);
1869
1870         rth->dst.input = ip_forward;
1871
1872         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1873                        do_cache);
1874         lwtunnel_set_redirect(&rth->dst);
1875         skb_dst_set(skb, &rth->dst);
1876 out:
1877         err = 0;
1878  cleanup:
1879         return err;
1880 }
1881
1882 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1883 /* To make ICMP packets follow the right flow, the multipath hash is
1884  * calculated from the inner IP addresses.
1885  */
1886 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1887                                  struct flow_keys *hash_keys)
1888 {
1889         const struct iphdr *outer_iph = ip_hdr(skb);
1890         const struct iphdr *key_iph = outer_iph;
1891         const struct iphdr *inner_iph;
1892         const struct icmphdr *icmph;
1893         struct iphdr _inner_iph;
1894         struct icmphdr _icmph;
1895
1896         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1897                 goto out;
1898
1899         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1900                 goto out;
1901
1902         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1903                                    &_icmph);
1904         if (!icmph)
1905                 goto out;
1906
1907         if (!icmp_is_err(icmph->type))
1908                 goto out;
1909
1910         inner_iph = skb_header_pointer(skb,
1911                                        outer_iph->ihl * 4 + sizeof(_icmph),
1912                                        sizeof(_inner_iph), &_inner_iph);
1913         if (!inner_iph)
1914                 goto out;
1915
1916         key_iph = inner_iph;
1917 out:
1918         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1919         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1920 }
1921
1922 /* if skb is set it will be used and fl4 can be NULL */
1923 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1924                        const struct sk_buff *skb, struct flow_keys *flkeys)
1925 {
1926         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1927         struct flow_keys hash_keys;
1928         u32 mhash;
1929
1930         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1931         case 0:
1932                 memset(&hash_keys, 0, sizeof(hash_keys));
1933                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1934                 if (skb) {
1935                         ip_multipath_l3_keys(skb, &hash_keys);
1936                 } else {
1937                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1938                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1939                 }
1940                 break;
1941         case 1:
1942                 /* skb is currently provided only when forwarding */
1943                 if (skb) {
1944                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1945                         struct flow_keys keys;
1946
1947                         /* short-circuit if we already have L4 hash present */
1948                         if (skb->l4_hash)
1949                                 return skb_get_hash_raw(skb) >> 1;
1950
1951                         memset(&hash_keys, 0, sizeof(hash_keys));
1952
1953                         if (!flkeys) {
1954                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1955                                 flkeys = &keys;
1956                         }
1957
1958                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1959                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1960                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1961                         hash_keys.ports.src = flkeys->ports.src;
1962                         hash_keys.ports.dst = flkeys->ports.dst;
1963                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1964                 } else {
1965                         memset(&hash_keys, 0, sizeof(hash_keys));
1966                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1967                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1968                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1969                         hash_keys.ports.src = fl4->fl4_sport;
1970                         hash_keys.ports.dst = fl4->fl4_dport;
1971                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1972                 }
1973                 break;
1974         case 2:
1975                 memset(&hash_keys, 0, sizeof(hash_keys));
1976                 /* skb is currently provided only when forwarding */
1977                 if (skb) {
1978                         struct flow_keys keys;
1979
1980                         skb_flow_dissect_flow_keys(skb, &keys, 0);
1981                         /* Inner can be v4 or v6 */
1982                         if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1983                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1984                                 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1985                                 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1986                         } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1987                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1988                                 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1989                                 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1990                                 hash_keys.tags.flow_label = keys.tags.flow_label;
1991                                 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1992                         } else {
1993                                 /* Same as case 0 */
1994                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1995                                 ip_multipath_l3_keys(skb, &hash_keys);
1996                         }
1997                 } else {
1998                         /* Same as case 0 */
1999                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2000                         hash_keys.addrs.v4addrs.src = fl4->saddr;
2001                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
2002                 }
2003                 break;
2004         }
2005         mhash = flow_hash_from_keys(&hash_keys);
2006
2007         if (multipath_hash)
2008                 mhash = jhash_2words(mhash, multipath_hash, 0);
2009
2010         return mhash >> 1;
2011 }
2012 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2013
2014 static int ip_mkroute_input(struct sk_buff *skb,
2015                             struct fib_result *res,
2016                             struct in_device *in_dev,
2017                             __be32 daddr, __be32 saddr, u32 tos,
2018                             struct flow_keys *hkeys)
2019 {
2020 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2021         if (res->fi && fib_info_num_path(res->fi) > 1) {
2022                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2023
2024                 fib_select_multipath(res, h);
2025         }
2026 #endif
2027
2028         /* create a routing cache entry */
2029         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2030 }
2031
2032 /* Implements all the saddr-related checks as ip_route_input_slow(),
2033  * assuming daddr is valid and the destination is not a local broadcast one.
2034  * Uses the provided hint instead of performing a route lookup.
2035  */
2036 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2037                       u8 tos, struct net_device *dev,
2038                       const struct sk_buff *hint)
2039 {
2040         struct in_device *in_dev = __in_dev_get_rcu(dev);
2041         struct rtable *rt = skb_rtable(hint);
2042         struct net *net = dev_net(dev);
2043         int err = -EINVAL;
2044         u32 tag = 0;
2045
2046         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2047                 goto martian_source;
2048
2049         if (ipv4_is_zeronet(saddr))
2050                 goto martian_source;
2051
2052         if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2053                 goto martian_source;
2054
2055         if (rt->rt_type != RTN_LOCAL)
2056                 goto skip_validate_source;
2057
2058         tos &= IPTOS_RT_MASK;
2059         err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2060         if (err < 0)
2061                 goto martian_source;
2062
2063 skip_validate_source:
2064         skb_dst_copy(skb, hint);
2065         return 0;
2066
2067 martian_source:
2068         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2069         return err;
2070 }
2071
/*
 *	Resolve the input route for a non-multicast destination and attach
 *	it to @skb. @res is filled in by the FIB lookup.
 *
 *	NOTE. We drop all packets that have a local source address,
 *	because every properly looped back packet must already have the
 *	correct destination attached by the output routine.
 *	Changes in the enforced policies must also be applied to
 *	ip_route_use_hint().
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *	Returns 0 when a route was attached (this includes the
 *	RTN_UNREACHABLE route built on the no_route path), -EINVAL for a
 *	device without in_device, martian addresses or non-IP broadcast,
 *	-ENOBUFS on allocation failure, or the negative error returned by
 *	fib_validate_source().
 *
 *	Called with rcu_read_lock() held.
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev,
			       struct fib_result *res)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flow_keys *flkeys = NULL, _flkeys;
	struct net    *net = dev_net(dev);
	struct ip_tunnel_info *tun_info;
	int		err = -EINVAL;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	struct flowi4	fl4;
	bool do_cache = true;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the weirdest martians, which cannot be detected
	 * by fib_lookup.
	 */

	/* Carry the tunnel id of received (non-TX) tunnel metadata into the
	 * lookup key, then drop whatever dst is still attached to the skb.
	 */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
	else
		fl4.flowi4_tun_key.tun_id = 0;
	skb_dst_drop(skb);

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res->fi = NULL;
	res->table = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only for limited broadcast;
	 * I do not even know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
	 * more than once when daddr and/or saddr are loopback addresses.
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_uid = sock_net_uid(net, NULL);
	fl4.flowi4_multipath_hash = 0;

	/* When the early dissect reports success, keep the dissected keys for
	 * the multipath hash; otherwise clear the L4 fields of the key here.
	 */
	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
		flkeys = &_flkeys;
	} else {
		fl4.flowi4_proto = 0;
		fl4.fl4_sport = 0;
		fl4.fl4_dport = 0;
	}

	err = fib_lookup(net, &fl4, res, 0);
	if (err != 0) {
		/* with forwarding off the lookup error is replaced */
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res->type == RTN_BROADCAST) {
		if (IN_DEV_BFORWARD(in_dev))
			goto make_route;
		/* do not cache if bc_forwarding is enabled */
		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
			do_cache = false;
		goto brd_input;
	}

	if (res->type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  0, dev, in_dev, &itag);
		if (err < 0)
			goto martian_source;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res->type != RTN_UNICAST)
		goto martian_destination;

make_route:
	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
out:	return err;

brd_input:
	/* broadcast input is only accepted for IPv4 frames */
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source;
	}
	flags |= RTCF_BROADCAST;
	res->type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* caching needs a fib_info to hang the route on and no classid tag */
	do_cache &= res->fi && !itag;
	if (do_cache) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		/* reuse the nexthop's cached input route while it is valid */
		rth = rcu_dereference(nhc->nhc_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			err = 0;
			goto out;
		}
	}

	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
			   flags | RTCF_LOCAL, res->type,
			   IN_DEV_ORCONF(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_is_input = 1;

	RT_CACHE_STAT_INC(in_slow_tot);
	if (res->type == RTN_UNREACHABLE) {
		/* reached via no_route: err still holds the negative lookup
		 * error, stored here with its sign flipped
		 */
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}

	if (do_cache) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
			/* save the regular handler before redirecting input */
			WARN_ON(rth->dst.input == lwtunnel_input);
			rth->dst.lwtstate->orig_input = rth->dst.input;
			rth->dst.input = lwtunnel_input;
		}

		/* could not be stored in the nexthop cache */
		if (unlikely(!rt_cache_route(nhc, rth)))
			rt_add_uncached_list(rth);
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	/* build an RTN_UNREACHABLE route through the local_input path */
	RT_CACHE_STAT_INC(in_no_route);
	res->type = RTN_UNREACHABLE;
	res->fi = NULL;
	res->table = NULL;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	/* err is -EINVAL or the error from fib_validate_source() here */
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
2294
2295 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2296                          u8 tos, struct net_device *dev)
2297 {
2298         struct fib_result res;
2299         int err;
2300
2301         tos &= IPTOS_RT_MASK;
2302         rcu_read_lock();
2303         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2304         rcu_read_unlock();
2305
2306         return err;
2307 }
2308 EXPORT_SYMBOL(ip_route_input_noref);
2309
2310 /* called with rcu_read_lock held */
2311 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2312                        u8 tos, struct net_device *dev, struct fib_result *res)
2313 {
2314         /* Multicast recognition logic is moved from route cache to here.
2315            The problem was that too many Ethernet cards have broken/missing
2316            hardware multicast filters :-( As result the host on multicasting
2317            network acquires a lot of useless route cache entries, sort of
2318            SDR messages from all the world. Now we try to get rid of them.
2319            Really, provided software IP multicast filter is organized
2320            reasonably (at least, hashed), it does not result in a slowdown
2321            comparing with route cache reject entries.
2322            Note, that multicast routers are not affected, because
2323            route cache entry is created eventually.
2324          */
2325         if (ipv4_is_multicast(daddr)) {
2326                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2327                 int our = 0;
2328                 int err = -EINVAL;
2329
2330                 if (!in_dev)
2331                         return err;
2332                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2333                                       ip_hdr(skb)->protocol);
2334
2335                 /* check l3 master if no match yet */
2336                 if (!our && netif_is_l3_slave(dev)) {
2337                         struct in_device *l3_in_dev;
2338
2339                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2340                         if (l3_in_dev)
2341                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2342                                                       ip_hdr(skb)->protocol);
2343                 }
2344
2345                 if (our
2346 #ifdef CONFIG_IP_MROUTE
2347                         ||
2348                     (!ipv4_is_local_multicast(daddr) &&
2349                      IN_DEV_MFORWARD(in_dev))
2350 #endif
2351                    ) {
2352                         err = ip_route_input_mc(skb, daddr, saddr,
2353                                                 tos, dev, our);
2354                 }
2355                 return err;
2356         }
2357
2358         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2359 }
2360
/*
 * Build, or fetch from the nexthop cache, the output route for @fl4.
 * Called with rcu_read_lock().
 *
 * @res:      FIB lookup result; its type and fib_info seed the route
 * @fl4:      flow key being routed
 * @orig_oif: value stored as the route's rt_iif; a non-zero value that
 *            differs from @dev_out disables caching for RTN_LOCAL routes
 * @dev_out:  output device
 * @flags:    initial RTCF_* flags
 *
 * Returns a route (a cached one is returned with a reference taken through
 * dst_hold_safe()), ERR_PTR(-EINVAL) when @dev_out has no in_device, the
 * source is a loopback address on a device where that is not allowed, or
 * the destination is in the zero network, and ERR_PTR(-ENOBUFS) when the
 * allocation fails.
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	/* Without route_localnet, a loopback source may only leave through
	 * a loopback device or an l3 master device.
	 */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) &&
		    !(dev_out->flags & IFF_LOOPBACK) &&
		    !netif_is_l3_master(dev_out))
			return ERR_PTR(-EINVAL);

	/* the destination address can override the type from the lookup */
	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		/* RTCF_LOCAL is kept only when ip_check_mc_rcu() reports a
		 * match on this device; such routes are not cached.
		 */
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If a multicast route does not exist, use the
		 * default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface, the end result being that if the
		 * intended recipient is waiting on that interface for the
		 * packet he won't receive it because it will be delivered on
		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
		do_cache = false;
	}

	fnhe = NULL;
	/* nothing to cache on without a fib_info */
	do_cache &= fi != NULL;
	if (fi) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
		struct rtable __rcu **prth;

		/* looked up even when not caching: it is passed on to
		 * rt_set_nexthop() below
		 */
		fnhe = find_exception(nhc, fl4->daddr);
		if (!do_cache)
			goto add;
		if (fnhe) {
			prth = &fnhe->fnhe_rth_output;
		} else {
			/* FLOWI_FLAG_KNOWN_NH is set, but the nexthop is not
			 * a link-scope gateway: build an uncached route
			 */
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nhc->nhc_gw_family &&
				       nhc->nhc_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
		}
		/* cache hit: usable only if a reference can still be taken */
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
			return rth;
	}

add:
	rth = rt_dst_alloc(dev_out, flags, type,
			   IN_DEV_ORCONF(in_dev, NOPOLICY),
			   IN_DEV_ORCONF(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->rt_iif = orig_oif;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		/* locally relevant broadcast/multicast on a real device */
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			/* multicast forwarding enabled and the group is not
			 * link-local: hook in the multicast routing input
			 */
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
	lwtunnel_set_redirect(&rth->dst);

	return rth;
}
2484
2485 /*
2486  * Major route resolver routine.
2487  */
2488
2489 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2490                                         const struct sk_buff *skb)
2491 {
2492         __u8 tos = RT_FL_TOS(fl4);
2493         struct fib_result res = {
2494                 .type           = RTN_UNSPEC,
2495                 .fi             = NULL,
2496                 .table          = NULL,
2497                 .tclassid       = 0,
2498         };
2499         struct rtable *rth;
2500
2501         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2502         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2503         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2504                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2505
2506         rcu_read_lock();
2507         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2508         rcu_read_unlock();
2509
2510         return rth;
2511 }
2512 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2513
2514 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2515                                             struct fib_result *res,
2516                                             const struct sk_buff *skb)
2517 {
2518         struct net_device *dev_out = NULL;
2519         int orig_oif = fl4->flowi4_oif;
2520         unsigned int flags = 0;
2521         struct rtable *rth;
2522         int err;
2523
2524         if (fl4->saddr) {
2525                 if (ipv4_is_multicast(fl4->saddr) ||
2526                     ipv4_is_lbcast(fl4->saddr) ||
2527                     ipv4_is_zeronet(fl4->saddr)) {
2528                         rth = ERR_PTR(-EINVAL);
2529                         goto out;
2530                 }
2531
2532                 rth = ERR_PTR(-ENETUNREACH);
2533
2534                 /* I removed check for oif == dev_out->oif here.
2535                    It was wrong for two reasons:
2536                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2537                       is assigned to multiple interfaces.
2538                    2. Moreover, we are allowed to send packets with saddr
2539                       of another iface. --ANK
2540                  */
2541
2542                 if (fl4->flowi4_oif == 0 &&
2543                     (ipv4_is_multicast(fl4->daddr) ||
2544                      ipv4_is_lbcast(fl4->daddr))) {
2545                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2546                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2547                         if (!dev_out)
2548                                 goto out;
2549
2550                         /* Special hack: user can direct multicasts
2551                            and limited broadcast via necessary interface
2552                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2553                            This hack is not just for fun, it allows
2554                            vic,vat and friends to work.
2555                            They bind socket to loopback, set ttl to zero
2556                            and expect that it will work.
2557                            From the viewpoint of routing cache they are broken,
2558                            because we are not allowed to build multicast path
2559                            with loopback source addr (look, routing cache
2560                            cannot know, that ttl is zero, so that packet
2561                            will not leave this host and route is valid).
2562                            Luckily, this hack is good workaround.
2563                          */
2564
2565                         fl4->flowi4_oif = dev_out->ifindex;
2566                         goto make_route;
2567                 }
2568
2569                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2570                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2571                         if (!__ip_dev_find(net, fl4->saddr, false))
2572                                 goto out;
2573                 }
2574         }
2575
2576
2577         if (fl4->flowi4_oif) {
2578                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2579                 rth = ERR_PTR(-ENODEV);
2580                 if (!dev_out)
2581                         goto out;
2582
2583                 /* RACE: Check return value of inet_select_addr instead. */
2584                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2585                         rth = ERR_PTR(-ENETUNREACH);
2586                         goto out;
2587                 }
2588                 if (ipv4_is_local_multicast(fl4->daddr) ||
2589                     ipv4_is_lbcast(fl4->daddr) ||
2590                     fl4->flowi4_proto == IPPROTO_IGMP) {
2591                         if (!fl4->saddr)
2592                                 fl4->saddr = inet_select_addr(dev_out, 0,
2593                                                               RT_SCOPE_LINK);
2594                         goto make_route;
2595                 }
2596                 if (!fl4->saddr) {
2597                         if (ipv4_is_multicast(fl4->daddr))
2598                                 fl4->saddr = inet_select_addr(dev_out, 0,
2599                                                               fl4->flowi4_scope);
2600                         else if (!fl4->daddr)
2601                                 fl4->saddr = inet_select_addr(dev_out, 0,
2602                                                               RT_SCOPE_HOST);
2603                 }
2604         }
2605
2606         if (!fl4->daddr) {
2607                 fl4->daddr = fl4->saddr;
2608                 if (!fl4->daddr)
2609                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2610                 dev_out = net->loopback_dev;
2611                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2612                 res->type = RTN_LOCAL;
2613                 flags |= RTCF_LOCAL;
2614                 goto make_route;
2615         }
2616
2617         err = fib_lookup(net, fl4, res, 0);
2618         if (err) {
2619                 res->fi = NULL;
2620                 res->table = NULL;
2621                 if (fl4->flowi4_oif &&
2622                     (ipv4_is_multicast(fl4->daddr) ||
2623                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2624                         /* Apparently, routing tables are wrong. Assume,
2625                            that the destination is on link.
2626
2627                            WHY? DW.
2628                            Because we are allowed to send to iface
2629                            even if it has NO routes and NO assigned
2630                            addresses. When oif is specified, routing
2631                            tables are looked up with only one purpose:
2632                            to catch if destination is gatewayed, rather than
2633                            direct. Moreover, if MSG_DONTROUTE is set,
2634                            we send packet, ignoring both routing tables
2635                            and ifaddr state. --ANK
2636
2637
2638                            We could make it even if oif is unknown,
2639                            likely IPv6, but we do not.
2640                          */
2641
2642                         if (fl4->saddr == 0)
2643                                 fl4->saddr = inet_select_addr(dev_out, 0,
2644                                                               RT_SCOPE_LINK);
2645                         res->type = RTN_UNICAST;
2646                         goto make_route;
2647                 }
2648                 rth = ERR_PTR(err);
2649                 goto out;
2650         }
2651
2652         if (res->type == RTN_LOCAL) {
2653                 if (!fl4->saddr) {
2654                         if (res->fi->fib_prefsrc)
2655                                 fl4->saddr = res->fi->fib_prefsrc;
2656                         else
2657                                 fl4->saddr = fl4->daddr;
2658                 }
2659
2660                 /* L3 master device is the loopback for that domain */
2661                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2662                         net->loopback_dev;
2663
2664                 /* make sure orig_oif points to fib result device even
2665                  * though packet rx/tx happens over loopback or l3mdev
2666                  */
2667                 orig_oif = FIB_RES_OIF(*res);
2668
2669                 fl4->flowi4_oif = dev_out->ifindex;
2670                 flags |= RTCF_LOCAL;
2671                 goto make_route;
2672         }
2673
2674         fib_select_path(net, res, fl4, skb);
2675
2676         dev_out = FIB_RES_DEV(*res);
2677
2678 make_route:
2679         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2680
2681 out:
2682         return rth;
2683 }
2684
/*
 * .check handler of ipv4_dst_blackhole_ops.
 *
 * Always returns NULL; both @dst and @cookie are ignored.
 */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2689
2690 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2691 {
2692         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2693
2694         return mtu ? : dst->dev->mtu;
2695 }
2696
/*
 * .update_pmtu handler of ipv4_dst_blackhole_ops.
 *
 * No-op: every argument is ignored and nothing is modified.
 */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu,
					  bool confirm_neigh)
{
}
2702
/*
 * .redirect handler of ipv4_dst_blackhole_ops.
 *
 * No-op: every argument is ignored and nothing is modified.
 */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2707
/*
 * .cow_metrics handler of ipv4_dst_blackhole_ops.
 *
 * Always returns NULL; @dst and @old are ignored.
 */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2713
/*
 * dst_ops used for the entries allocated by ipv4_blackhole_route().
 *
 * The check, update_pmtu, redirect and cow_metrics handlers are the
 * blackhole stubs defined above (they return NULL or do nothing); mtu,
 * default_advmss and neigh_lookup use regular IPv4 helpers.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.redirect		=	ipv4_rt_blackhole_redirect,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
2724
2725 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2726 {
2727         struct rtable *ort = (struct rtable *) dst_orig;
2728         struct rtable *rt;
2729
2730         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2731         if (rt) {
2732                 struct dst_entry *new = &rt->dst;
2733
2734                 new->__use = 1;
2735                 new->input = dst_discard;
2736                 new->output = dst_discard_out;
2737
2738                 new->dev = net->loopback_dev;
2739                 if (new->dev)
2740                         dev_hold(new->dev);
2741
2742                 rt->rt_is_input = ort->rt_is_input;
2743                 rt->rt_iif = ort->rt_iif;
2744                 rt->rt_pmtu = ort->rt_pmtu;
2745                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2746
2747                 rt->rt_genid = rt_genid_ipv4(net);
2748                 rt->rt_flags = ort->rt_flags;
2749                 rt->rt_type = ort->rt_type;
2750                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2751                 rt->rt_gw_family = ort->rt_gw_family;
2752                 if (rt->rt_gw_family == AF_INET)
2753                         rt->rt_gw4 = ort->rt_gw4;
2754                 else if (rt->rt_gw_family == AF_INET6)
2755                         rt->rt_gw6 = ort->rt_gw6;
2756
2757                 INIT_LIST_HEAD(&rt->rt_uncached);
2758         }
2759
2760         dst_release(dst_orig);
2761
2762         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2763 }
2764
2765 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2766                                     const struct sock *sk)
2767 {
2768         struct rtable *rt = __ip_route_output_key(net, flp4);
2769
2770         if (IS_ERR(rt))
2771                 return rt;
2772
2773         if (flp4->flowi4_proto) {
2774                 flp4->flowi4_oif = rt->dst.dev->ifindex;
2775                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2776                                                         flowi4_to_flowi(flp4),
2777                                                         sk, 0);
2778         }
2779
2780         return rt;
2781 }
2782 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2783
2784 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2785                                       struct net_device *dev,
2786                                       struct net *net, __be32 *saddr,
2787                                       const struct ip_tunnel_info *info,
2788                                       u8 protocol, bool use_cache)
2789 {
2790 #ifdef CONFIG_DST_CACHE
2791         struct dst_cache *dst_cache;
2792 #endif
2793         struct rtable *rt = NULL;
2794         struct flowi4 fl4;
2795         __u8 tos;
2796
2797 #ifdef CONFIG_DST_CACHE
2798         dst_cache = (struct dst_cache *)&info->dst_cache;
2799         if (use_cache) {
2800                 rt = dst_cache_get_ip4(dst_cache, saddr);
2801                 if (rt)
2802                         return rt;
2803         }
2804 #endif
2805         memset(&fl4, 0, sizeof(fl4));
2806         fl4.flowi4_mark = skb->mark;
2807         fl4.flowi4_proto = protocol;
2808         fl4.daddr = info->key.u.ipv4.dst;
2809         fl4.saddr = info->key.u.ipv4.src;
2810         tos = info->key.tos;
2811         fl4.flowi4_tos = RT_TOS(tos);
2812
2813         rt = ip_route_output_key(net, &fl4);
2814         if (IS_ERR(rt)) {
2815                 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2816                 return ERR_PTR(-ENETUNREACH);
2817         }
2818         if (rt->dst.dev == dev) { /* is this necessary? */
2819                 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2820                 ip_rt_put(rt);
2821                 return ERR_PTR(-ELOOP);
2822         }
2823 #ifdef CONFIG_DST_CACHE
2824         if (use_cache)
2825                 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2826 #endif
2827         *saddr = fl4.saddr;
2828         return rt;
2829 }
2830 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2831
2832 /* called with rcu_read_lock held */
2833 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2834                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2835                         struct sk_buff *skb, u32 portid, u32 seq,
2836                         unsigned int flags)
2837 {
2838         struct rtmsg *r;
2839         struct nlmsghdr *nlh;
2840         unsigned long expires = 0;
2841         u32 error;
2842         u32 metrics[RTAX_MAX];
2843
2844         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2845         if (!nlh)
2846                 return -EMSGSIZE;
2847
2848         r = nlmsg_data(nlh);
2849         r->rtm_family    = AF_INET;
2850         r->rtm_dst_len  = 32;
2851         r->rtm_src_len  = 0;
2852         r->rtm_tos      = fl4 ? fl4->flowi4_tos : 0;
2853         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2854         if (nla_put_u32(skb, RTA_TABLE, table_id))
2855                 goto nla_put_failure;
2856         r->rtm_type     = rt->rt_type;
2857         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2858         r->rtm_protocol = RTPROT_UNSPEC;
2859         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2860         if (rt->rt_flags & RTCF_NOTIFY)
2861                 r->rtm_flags |= RTM_F_NOTIFY;
2862         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2863                 r->rtm_flags |= RTCF_DOREDIRECT;
2864
2865         if (nla_put_in_addr(skb, RTA_DST, dst))
2866                 goto nla_put_failure;
2867         if (src) {
2868                 r->rtm_src_len = 32;
2869                 if (nla_put_in_addr(skb, RTA_SRC, src))
2870                         goto nla_put_failure;
2871         }
2872         if (rt->dst.dev &&
2873             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2874                 goto nla_put_failure;
2875         if (rt->dst.lwtstate &&
2876             lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
2877                 goto nla_put_failure;
2878 #ifdef CONFIG_IP_ROUTE_CLASSID
2879         if (rt->dst.tclassid &&
2880             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2881                 goto nla_put_failure;
2882 #endif
2883         if (fl4 && !rt_is_input_route(rt) &&
2884             fl4->saddr != src) {
2885                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2886                         goto nla_put_failure;
2887         }
2888         if (rt->rt_uses_gateway) {
2889                 if (rt->rt_gw_family == AF_INET &&
2890                     nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2891                         goto nla_put_failure;
2892                 } else if (rt->rt_gw_family == AF_INET6) {
2893                         int alen = sizeof(struct in6_addr);
2894                         struct nlattr *nla;
2895                         struct rtvia *via;
2896
2897                         nla = nla_reserve(skb, RTA_VIA, alen + 2);
2898                         if (!nla)
2899                                 goto nla_put_failure;
2900
2901                         via = nla_data(nla);
2902                         via->rtvia_family = AF_INET6;
2903                         memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2904                 }
2905         }
2906
2907         expires = rt->dst.expires;
2908         if (expires) {
2909                 unsigned long now = jiffies;
2910
2911                 if (time_before(now, expires))
2912                         expires -= now;
2913                 else
2914                         expires = 0;
2915         }
2916
2917         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2918         if (rt->rt_pmtu && expires)
2919                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2920         if (rt->rt_mtu_locked && expires)
2921                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2922         if (rtnetlink_put_metrics(skb, metrics) < 0)
2923                 goto nla_put_failure;
2924
2925         if (fl4) {
2926                 if (fl4->flowi4_mark &&
2927                     nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2928                         goto nla_put_failure;
2929
2930                 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2931                     nla_put_u32(skb, RTA_UID,
2932                                 from_kuid_munged(current_user_ns(),
2933                                                  fl4->flowi4_uid)))
2934                         goto nla_put_failure;
2935
2936                 if (rt_is_input_route(rt)) {
2937 #ifdef CONFIG_IP_MROUTE
2938                         if (ipv4_is_multicast(dst) &&
2939                             !ipv4_is_local_multicast(dst) &&
2940                             IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2941                                 int err = ipmr_get_route(net, skb,
2942                                                          fl4->saddr, fl4->daddr,
2943                                                          r, portid);
2944
2945                                 if (err <= 0) {
2946                                         if (err == 0)
2947                                                 return 0;
2948                                         goto nla_put_failure;
2949                                 }
2950                         } else
2951 #endif
2952                                 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2953                                         goto nla_put_failure;
2954                 }
2955         }
2956
2957         error = rt->dst.error;
2958
2959         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2960                 goto nla_put_failure;
2961
2962         nlmsg_end(skb, nlh);
2963         return 0;
2964
2965 nla_put_failure:
2966         nlmsg_cancel(skb, nlh);
2967         return -EMSGSIZE;
2968 }
2969
2970 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2971                             struct netlink_callback *cb, u32 table_id,
2972                             struct fnhe_hash_bucket *bucket, int genid,
2973                             int *fa_index, int fa_start, unsigned int flags)
2974 {
2975         int i;
2976
2977         for (i = 0; i < FNHE_HASH_SIZE; i++) {
2978                 struct fib_nh_exception *fnhe;
2979
2980                 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2981                      fnhe = rcu_dereference(fnhe->fnhe_next)) {
2982                         struct rtable *rt;
2983                         int err;
2984
2985                         if (*fa_index < fa_start)
2986                                 goto next;
2987
2988                         if (fnhe->fnhe_genid != genid)
2989                                 goto next;
2990
2991                         if (fnhe->fnhe_expires &&
2992                             time_after(jiffies, fnhe->fnhe_expires))
2993                                 goto next;
2994
2995                         rt = rcu_dereference(fnhe->fnhe_rth_input);
2996                         if (!rt)
2997                                 rt = rcu_dereference(fnhe->fnhe_rth_output);
2998                         if (!rt)
2999                                 goto next;
3000
3001                         err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3002                                            table_id, NULL, skb,
3003                                            NETLINK_CB(cb->skb).portid,
3004                                            cb->nlh->nlmsg_seq, flags);
3005                         if (err)
3006                                 return err;
3007 next:
3008                         (*fa_index)++;
3009                 }
3010         }
3011
3012         return 0;
3013 }
3014
3015 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3016                        u32 table_id, struct fib_info *fi,
3017                        int *fa_index, int fa_start, unsigned int flags)
3018 {
3019         struct net *net = sock_net(cb->skb->sk);
3020         int nhsel, genid = fnhe_genid(net);
3021
3022         for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3023                 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3024                 struct fnhe_hash_bucket *bucket;
3025                 int err;
3026
3027                 if (nhc->nhc_flags & RTNH_F_DEAD)
3028                         continue;
3029
3030                 rcu_read_lock();
3031                 bucket = rcu_dereference(nhc->nhc_exceptions);
3032                 err = 0;
3033                 if (bucket)
3034                         err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3035                                                genid, fa_index, fa_start,
3036                                                flags);
3037                 rcu_read_unlock();
3038                 if (err)
3039                         return err;
3040         }
3041
3042         return 0;
3043 }
3044
3045 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3046                                                    u8 ip_proto, __be16 sport,
3047                                                    __be16 dport)
3048 {
3049         struct sk_buff *skb;
3050         struct iphdr *iph;
3051
3052         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3053         if (!skb)
3054                 return NULL;
3055
3056         /* Reserve room for dummy headers, this skb can pass
3057          * through good chunk of routing engine.
3058          */
3059         skb_reset_mac_header(skb);
3060         skb_reset_network_header(skb);
3061         skb->protocol = htons(ETH_P_IP);
3062         iph = skb_put(skb, sizeof(struct iphdr));
3063         iph->protocol = ip_proto;
3064         iph->saddr = src;
3065         iph->daddr = dst;
3066         iph->version = 0x4;
3067         iph->frag_off = 0;
3068         iph->ihl = 0x5;
3069         skb_set_transport_header(skb, skb->len);
3070
3071         switch (iph->protocol) {
3072         case IPPROTO_UDP: {
3073                 struct udphdr *udph;
3074
3075                 udph = skb_put_zero(skb, sizeof(struct udphdr));
3076                 udph->source = sport;
3077                 udph->dest = dport;
3078                 udph->len = sizeof(struct udphdr);
3079                 udph->check = 0;
3080                 break;
3081         }
3082         case IPPROTO_TCP: {
3083                 struct tcphdr *tcph;
3084
3085                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3086                 tcph->source    = sport;
3087                 tcph->dest      = dport;
3088                 tcph->doff      = sizeof(struct tcphdr) / 4;
3089                 tcph->rst = 1;
3090                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3091                                             src, dst, 0);
3092                 break;
3093         }
3094         case IPPROTO_ICMP: {
3095                 struct icmphdr *icmph;
3096
3097                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3098                 icmph->type = ICMP_ECHO;
3099                 icmph->code = 0;
3100         }
3101         }
3102
3103         return skb;
3104 }
3105
3106 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3107                                        const struct nlmsghdr *nlh,
3108                                        struct nlattr **tb,
3109                                        struct netlink_ext_ack *extack)
3110 {
3111         struct rtmsg *rtm;
3112         int i, err;
3113
3114         if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3115                 NL_SET_ERR_MSG(extack,
3116                                "ipv4: Invalid header for route get request");
3117                 return -EINVAL;
3118         }
3119
3120         if (!netlink_strict_get_check(skb))
3121                 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3122                                               rtm_ipv4_policy, extack);
3123
3124         rtm = nlmsg_data(nlh);
3125         if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3126             (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3127             rtm->rtm_table || rtm->rtm_protocol ||
3128             rtm->rtm_scope || rtm->rtm_type) {
3129                 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3130                 return -EINVAL;
3131         }
3132
3133         if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3134                                RTM_F_LOOKUP_TABLE |
3135                                RTM_F_FIB_MATCH)) {
3136                 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3137                 return -EINVAL;
3138         }
3139
3140         err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3141                                             rtm_ipv4_policy, extack);
3142         if (err)
3143                 return err;
3144
3145         if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3146             (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3147                 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3148                 return -EINVAL;
3149         }
3150
3151         for (i = 0; i <= RTA_MAX; i++) {
3152                 if (!tb[i])
3153                         continue;
3154
3155                 switch (i) {
3156                 case RTA_IIF:
3157                 case RTA_OIF:
3158                 case RTA_SRC:
3159                 case RTA_DST:
3160                 case RTA_IP_PROTO:
3161                 case RTA_SPORT:
3162                 case RTA_DPORT:
3163                 case RTA_MARK:
3164                 case RTA_UID:
3165                         break;
3166                 default:
3167                         NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3168                         return -EINVAL;
3169                 }
3170         }
3171
3172         return 0;
3173 }
3174
3175 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3176                              struct netlink_ext_ack *extack)
3177 {
3178         struct net *net = sock_net(in_skb->sk);
3179         struct nlattr *tb[RTA_MAX+1];
3180         u32 table_id = RT_TABLE_MAIN;
3181         __be16 sport = 0, dport = 0;
3182         struct fib_result res = {};
3183         u8 ip_proto = IPPROTO_UDP;
3184         struct rtable *rt = NULL;
3185         struct sk_buff *skb;
3186         struct rtmsg *rtm;
3187         struct flowi4 fl4 = {};
3188         __be32 dst = 0;
3189         __be32 src = 0;
3190         kuid_t uid;
3191         u32 iif;
3192         int err;
3193         int mark;
3194
3195         err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3196         if (err < 0)
3197                 return err;
3198
3199         rtm = nlmsg_data(nlh);
3200         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3201         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3202         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3203         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3204         if (tb[RTA_UID])
3205                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3206         else
3207                 uid = (iif ? INVALID_UID : current_uid());
3208
3209         if (tb[RTA_IP_PROTO]) {
3210                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3211                                                   &ip_proto, AF_INET, extack);
3212                 if (err)
3213                         return err;
3214         }
3215
3216         if (tb[RTA_SPORT])
3217                 sport = nla_get_be16(tb[RTA_SPORT]);
3218
3219         if (tb[RTA_DPORT])
3220                 dport = nla_get_be16(tb[RTA_DPORT]);
3221
3222         skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3223         if (!skb)
3224                 return -ENOBUFS;
3225
3226         fl4.daddr = dst;
3227         fl4.saddr = src;
3228         fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3229         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3230         fl4.flowi4_mark = mark;
3231         fl4.flowi4_uid = uid;
3232         if (sport)
3233                 fl4.fl4_sport = sport;
3234         if (dport)
3235                 fl4.fl4_dport = dport;
3236         fl4.flowi4_proto = ip_proto;
3237
3238         rcu_read_lock();
3239
3240         if (iif) {
3241                 struct net_device *dev;
3242
3243                 dev = dev_get_by_index_rcu(net, iif);
3244                 if (!dev) {
3245                         err = -ENODEV;
3246                         goto errout_rcu;
3247                 }
3248
3249                 fl4.flowi4_iif = iif; /* for rt_fill_info */
3250                 skb->dev        = dev;
3251                 skb->mark       = mark;
3252                 err = ip_route_input_rcu(skb, dst, src,
3253                                          rtm->rtm_tos & IPTOS_RT_MASK, dev,
3254                                          &res);
3255
3256                 rt = skb_rtable(skb);
3257                 if (err == 0 && rt->dst.error)
3258                         err = -rt->dst.error;
3259         } else {
3260                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3261                 skb->dev = net->loopback_dev;
3262                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3263                 err = 0;
3264                 if (IS_ERR(rt))
3265                         err = PTR_ERR(rt);
3266                 else
3267                         skb_dst_set(skb, &rt->dst);
3268         }
3269
3270         if (err)
3271                 goto errout_rcu;
3272
3273         if (rtm->rtm_flags & RTM_F_NOTIFY)
3274                 rt->rt_flags |= RTCF_NOTIFY;
3275
3276         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3277                 table_id = res.table ? res.table->tb_id : 0;
3278
3279         /* reset skb for netlink reply msg */
3280         skb_trim(skb, 0);
3281         skb_reset_network_header(skb);
3282         skb_reset_transport_header(skb);
3283         skb_reset_mac_header(skb);
3284
3285         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3286                 struct fib_rt_info fri;
3287
3288                 if (!res.fi) {
3289                         err = fib_props[res.type].error;
3290                         if (!err)
3291                                 err = -EHOSTUNREACH;
3292                         goto errout_rcu;
3293                 }
3294                 fri.fi = res.fi;
3295                 fri.tb_id = table_id;
3296                 fri.dst = res.prefix;
3297                 fri.dst_len = res.prefixlen;
3298                 fri.tos = fl4.flowi4_tos;
3299                 fri.type = rt->rt_type;
3300                 fri.offload = 0;
3301                 fri.trap = 0;
3302                 if (res.fa_head) {
3303                         struct fib_alias *fa;
3304
3305                         hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3306                                 u8 slen = 32 - fri.dst_len;
3307
3308                                 if (fa->fa_slen == slen &&
3309                                     fa->tb_id == fri.tb_id &&
3310                                     fa->fa_tos == fri.tos &&
3311                                     fa->fa_info == res.fi &&
3312                                     fa->fa_type == fri.type) {
3313                                         fri.offload = fa->offload;
3314                                         fri.trap = fa->trap;
3315                                         break;
3316                                 }
3317                         }
3318                 }
3319                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3320                                     nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3321         } else {
3322                 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3323                                    NETLINK_CB(in_skb).portid,
3324                                    nlh->nlmsg_seq, 0);
3325         }
3326         if (err < 0)
3327                 goto errout_rcu;
3328
3329         rcu_read_unlock();
3330
3331         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3332
3333 errout_free:
3334         return err;
3335 errout_rcu:
3336         rcu_read_unlock();
3337         kfree_skb(skb);
3338         goto errout_free;
3339 }
3340
3341 void ip_rt_multicast_event(struct in_device *in_dev)
3342 {
3343         rt_cache_flush(dev_net(in_dev->dev));
3344 }
3345
3346 #ifdef CONFIG_SYSCTL
3347 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3348 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
3349 static int ip_rt_gc_elasticity __read_mostly    = 8;
3350 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
3351
3352 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3353                 void *buffer, size_t *lenp, loff_t *ppos)
3354 {
3355         struct net *net = (struct net *)__ctl->extra1;
3356
3357         if (write) {
3358                 rt_cache_flush(net);
3359                 fnhe_genid_bump(net);
3360                 return 0;
3361         }
3362
3363         return -EINVAL;
3364 }
3365
3366 static struct ctl_table ipv4_route_table[] = {
3367         {
3368                 .procname       = "gc_thresh",
3369                 .data           = &ipv4_dst_ops.gc_thresh,
3370                 .maxlen         = sizeof(int),
3371                 .mode           = 0644,
3372                 .proc_handler   = proc_dointvec,
3373         },
3374         {
3375                 .procname       = "max_size",
3376                 .data           = &ip_rt_max_size,
3377                 .maxlen         = sizeof(int),
3378                 .mode           = 0644,
3379                 .proc_handler   = proc_dointvec,
3380         },
3381         {
3382                 /*  Deprecated. Use gc_min_interval_ms */
3383
3384                 .procname       = "gc_min_interval",
3385                 .data           = &ip_rt_gc_min_interval,
3386                 .maxlen         = sizeof(int),
3387                 .mode           = 0644,
3388                 .proc_handler   = proc_dointvec_jiffies,
3389         },
3390         {
3391                 .procname       = "gc_min_interval_ms",
3392                 .data           = &ip_rt_gc_min_interval,
3393                 .maxlen         = sizeof(int),
3394                 .mode           = 0644,
3395                 .proc_handler   = proc_dointvec_ms_jiffies,
3396         },
3397         {
3398                 .procname       = "gc_timeout",
3399                 .data           = &ip_rt_gc_timeout,
3400                 .maxlen         = sizeof(int),
3401                 .mode           = 0644,
3402                 .proc_handler   = proc_dointvec_jiffies,
3403         },
3404         {
3405                 .procname       = "gc_interval",
3406                 .data           = &ip_rt_gc_interval,
3407                 .maxlen         = sizeof(int),
3408                 .mode           = 0644,
3409                 .proc_handler   = proc_dointvec_jiffies,
3410         },
3411         {
3412                 .procname       = "redirect_load",
3413                 .data           = &ip_rt_redirect_load,
3414                 .maxlen         = sizeof(int),
3415                 .mode           = 0644,
3416                 .proc_handler   = proc_dointvec,
3417         },
3418         {
3419                 .procname       = "redirect_number",
3420                 .data           = &ip_rt_redirect_number,
3421                 .maxlen         = sizeof(int),
3422                 .mode           = 0644,
3423                 .proc_handler   = proc_dointvec,
3424         },
3425         {
3426                 .procname       = "redirect_silence",
3427                 .data           = &ip_rt_redirect_silence,
3428                 .maxlen         = sizeof(int),
3429                 .mode           = 0644,
3430                 .proc_handler   = proc_dointvec,
3431         },
3432         {
3433                 .procname       = "error_cost",
3434                 .data           = &ip_rt_error_cost,
3435                 .maxlen         = sizeof(int),
3436                 .mode           = 0644,
3437                 .proc_handler   = proc_dointvec,
3438         },
3439         {
3440                 .procname       = "error_burst",
3441                 .data           = &ip_rt_error_burst,
3442                 .maxlen         = sizeof(int),
3443                 .mode           = 0644,
3444                 .proc_handler   = proc_dointvec,
3445         },
3446         {
3447                 .procname       = "gc_elasticity",
3448                 .data           = &ip_rt_gc_elasticity,
3449                 .maxlen         = sizeof(int),
3450                 .mode           = 0644,
3451                 .proc_handler   = proc_dointvec,
3452         },
3453         {
3454                 .procname       = "mtu_expires",
3455                 .data           = &ip_rt_mtu_expires,
3456                 .maxlen         = sizeof(int),
3457                 .mode           = 0644,
3458                 .proc_handler   = proc_dointvec_jiffies,
3459         },
3460         {
3461                 .procname       = "min_pmtu",
3462                 .data           = &ip_rt_min_pmtu,
3463                 .maxlen         = sizeof(int),
3464                 .mode           = 0644,
3465                 .proc_handler   = proc_dointvec_minmax,
3466                 .extra1         = &ip_min_valid_pmtu,
3467         },
3468         {
3469                 .procname       = "min_adv_mss",
3470                 .data           = &ip_rt_min_advmss,
3471                 .maxlen         = sizeof(int),
3472                 .mode           = 0644,
3473                 .proc_handler   = proc_dointvec,
3474         },
3475         { }
3476 };
3477
3478 static const char ipv4_route_flush_procname[] = "flush";
3479
3480 static struct ctl_table ipv4_route_flush_table[] = {
3481         {
3482                 .procname       = ipv4_route_flush_procname,
3483                 .maxlen         = sizeof(int),
3484                 .mode           = 0200,
3485                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3486         },
3487         { },
3488 };
3489
3490 static __net_init int sysctl_route_net_init(struct net *net)
3491 {
3492         struct ctl_table *tbl;
3493
3494         tbl = ipv4_route_flush_table;
3495         if (!net_eq(net, &init_net)) {
3496                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3497                 if (!tbl)
3498                         goto err_dup;
3499
3500                 /* Don't export non-whitelisted sysctls to unprivileged users */
3501                 if (net->user_ns != &init_user_ns) {
3502                         if (tbl[0].procname != ipv4_route_flush_procname)
3503                                 tbl[0].procname = NULL;
3504                 }
3505         }
3506         tbl[0].extra1 = net;
3507
3508         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3509         if (!net->ipv4.route_hdr)
3510                 goto err_reg;
3511         return 0;
3512
3513 err_reg:
3514         if (tbl != ipv4_route_flush_table)
3515                 kfree(tbl);
3516 err_dup:
3517         return -ENOMEM;
3518 }
3519
3520 static __net_exit void sysctl_route_net_exit(struct net *net)
3521 {
3522         struct ctl_table *tbl;
3523
3524         tbl = net->ipv4.route_hdr->ctl_table_arg;
3525         unregister_net_sysctl_table(net->ipv4.route_hdr);
3526         BUG_ON(tbl == ipv4_route_flush_table);
3527         kfree(tbl);
3528 }
3529
3530 static __net_initdata struct pernet_operations sysctl_route_ops = {
3531         .init = sysctl_route_net_init,
3532         .exit = sysctl_route_net_exit,
3533 };
3534 #endif
3535
3536 static __net_init int rt_genid_init(struct net *net)
3537 {
3538         atomic_set(&net->ipv4.rt_genid, 0);
3539         atomic_set(&net->fnhe_genid, 0);
3540         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3541         return 0;
3542 }
3543
3544 static __net_initdata struct pernet_operations rt_genid_ops = {
3545         .init = rt_genid_init,
3546 };
3547
3548 static int __net_init ipv4_inetpeer_init(struct net *net)
3549 {
3550         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3551
3552         if (!bp)
3553                 return -ENOMEM;
3554         inet_peer_base_init(bp);
3555         net->ipv4.peers = bp;
3556         return 0;
3557 }
3558
3559 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3560 {
3561         struct inet_peer_base *bp = net->ipv4.peers;
3562
3563         net->ipv4.peers = NULL;
3564         inetpeer_invalidate_tree(bp);
3565         kfree(bp);
3566 }
3567
3568 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3569         .init   =       ipv4_inetpeer_init,
3570         .exit   =       ipv4_inetpeer_exit,
3571 };
3572
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting array; allocated with 256 entries in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3576
3577 int __init ip_rt_init(void)
3578 {
3579         int cpu;
3580
3581         ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3582                                   GFP_KERNEL);
3583         if (!ip_idents)
3584                 panic("IP: failed to allocate ip_idents\n");
3585
3586         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3587
3588         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3589         if (!ip_tstamps)
3590                 panic("IP: failed to allocate ip_tstamps\n");
3591
3592         for_each_possible_cpu(cpu) {
3593                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3594
3595                 INIT_LIST_HEAD(&ul->head);
3596                 spin_lock_init(&ul->lock);
3597         }
3598 #ifdef CONFIG_IP_ROUTE_CLASSID
3599         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3600         if (!ip_rt_acct)
3601                 panic("IP: failed to allocate ip_rt_acct\n");
3602 #endif
3603
3604         ipv4_dst_ops.kmem_cachep =
3605                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3606                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3607
3608         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3609
3610         if (dst_entries_init(&ipv4_dst_ops) < 0)
3611                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3612
3613         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3614                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3615
3616         ipv4_dst_ops.gc_thresh = ~0;
3617         ip_rt_max_size = INT_MAX;
3618
3619         devinet_init();
3620         ip_fib_init();
3621
3622         if (ip_rt_proc_init())
3623                 pr_err("Unable to create route proc files\n");
3624 #ifdef CONFIG_XFRM
3625         xfrm_init();
3626         xfrm4_init();
3627 #endif
3628         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3629                       RTNL_FLAG_DOIT_UNLOCKED);
3630
3631 #ifdef CONFIG_SYSCTL
3632         register_pernet_subsys(&sysctl_route_ops);
3633 #endif
3634         register_pernet_subsys(&rt_genid_ops);
3635         register_pernet_subsys(&ipv4_inetpeer_ops);
3636         return 0;
3637 }
3638
3639 #ifdef CONFIG_SYSCTL
3640 /*
3641  * We really need to sanitize the damn ipv4 init order, then all
3642  * this nonsense will go away.
3643  */
3644 void __init ip_static_sysctl_init(void)
3645 {
3646         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3647 }
3648 #endif