Merge tag 'gfs2-for-5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux...
[linux-2.6-microblaze.git] / net / ipv4 / route.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              ROUTE - implementation of the IP router.
8  *
9  * Authors:     Ross Biro
10  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
12  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14  *
15  * Fixes:
16  *              Alan Cox        :       Verify area fixes.
17  *              Alan Cox        :       cli() protects routing changes
18  *              Rui Oliveira    :       ICMP routing table updates
19  *              (rco@di.uminho.pt)      Routing table insertion and update
20  *              Linus Torvalds  :       Rewrote bits to be sensible
21  *              Alan Cox        :       Added BSD route gw semantics
22  *              Alan Cox        :       Super /proc >4K
23  *              Alan Cox        :       MTU in route table
24  *              Alan Cox        :       MSS actually. Also added the window
25  *                                      clamper.
26  *              Sam Lantinga    :       Fixed route matching in rt_del()
27  *              Alan Cox        :       Routing cache support.
28  *              Alan Cox        :       Removed compatibility cruft.
29  *              Alan Cox        :       RTF_REJECT support.
30  *              Alan Cox        :       TCP irtt support.
31  *              Jonathan Naylor :       Added Metric support.
32  *      Miquel van Smoorenburg  :       BSD API fixes.
33  *      Miquel van Smoorenburg  :       Metrics.
34  *              Alan Cox        :       Use __u32 properly
35  *              Alan Cox        :       Aligned routing errors more closely with BSD
36  *                                      our system is still very different.
37  *              Alan Cox        :       Faster /proc handling
38  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
39  *                                      routing caches and better behaviour.
40  *
41  *              Olaf Erb        :       irtt wasn't being copied right.
42  *              Bjorn Ekwall    :       Kerneld route support.
43  *              Alan Cox        :       Multicast fixed (I hope)
44  *              Pavel Krauz     :       Limited broadcast fixed
45  *              Mike McLagan    :       Routing by source
46  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
47  *                                      route.c and rewritten from scratch.
48  *              Andi Kleen      :       Load-limit warning messages.
49  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
50  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
51  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
52  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
53  *              Marc Boucher    :       routing by fwmark
54  *      Robert Olsson           :       Added rt_cache statistics
55  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
56  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
57  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
58  *      Ilia Sotnikov           :       Removed TOS from hash calculations
59  */
60
61 #define pr_fmt(fmt) "IPv4: " fmt
62
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/string.h>
70 #include <linux/socket.h>
71 #include <linux/sockios.h>
72 #include <linux/errno.h>
73 #include <linux/in.h>
74 #include <linux/inet.h>
75 #include <linux/netdevice.h>
76 #include <linux/proc_fs.h>
77 #include <linux/init.h>
78 #include <linux/skbuff.h>
79 #include <linux/inetdevice.h>
80 #include <linux/igmp.h>
81 #include <linux/pkt_sched.h>
82 #include <linux/mroute.h>
83 #include <linux/netfilter_ipv4.h>
84 #include <linux/random.h>
85 #include <linux/rcupdate.h>
86 #include <linux/times.h>
87 #include <linux/slab.h>
88 #include <linux/jhash.h>
89 #include <net/dst.h>
90 #include <net/dst_metadata.h>
91 #include <net/net_namespace.h>
92 #include <net/protocol.h>
93 #include <net/ip.h>
94 #include <net/route.h>
95 #include <net/inetpeer.h>
96 #include <net/sock.h>
97 #include <net/ip_fib.h>
98 #include <net/nexthop.h>
99 #include <net/arp.h>
100 #include <net/tcp.h>
101 #include <net/icmp.h>
102 #include <net/xfrm.h>
103 #include <net/lwtunnel.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #endif
109 #include <net/secure_seq.h>
110 #include <net/ip_tunnels.h>
111 #include <net/l3mdev.h>
112
113 #include "fib_lookup.h"
114
115 #define RT_FL_TOS(oldflp4) \
116         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_max_size;
121 static int ip_rt_redirect_number __read_mostly  = 9;
122 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
123 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
124 static int ip_rt_error_cost __read_mostly       = HZ;
125 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
126 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
127 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
128 static int ip_rt_min_advmss __read_mostly       = 256;
129
130 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
131
132 /*
133  *      Interface to generic destination cache.
134  */
135
136 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
137 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
138 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
139 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
140 static void              ipv4_link_failure(struct sk_buff *skb);
141 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
142                                            struct sk_buff *skb, u32 mtu,
143                                            bool confirm_neigh);
144 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145                                         struct sk_buff *skb);
146 static void             ipv4_dst_destroy(struct dst_entry *dst);
147
148 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
149 {
150         WARN_ON(1);
151         return NULL;
152 }
153
154 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
155                                            struct sk_buff *skb,
156                                            const void *daddr);
157 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
158
159 static struct dst_ops ipv4_dst_ops = {
160         .family =               AF_INET,
161         .check =                ipv4_dst_check,
162         .default_advmss =       ipv4_default_advmss,
163         .mtu =                  ipv4_mtu,
164         .cow_metrics =          ipv4_cow_metrics,
165         .destroy =              ipv4_dst_destroy,
166         .negative_advice =      ipv4_negative_advice,
167         .link_failure =         ipv4_link_failure,
168         .update_pmtu =          ip_rt_update_pmtu,
169         .redirect =             ip_do_redirect,
170         .local_out =            __ip_local_out,
171         .neigh_lookup =         ipv4_neigh_lookup,
172         .confirm_neigh =        ipv4_confirm_neigh,
173 };
174
175 #define ECN_OR_COST(class)      TC_PRIO_##class
176
177 const __u8 ip_tos2prio[16] = {
178         TC_PRIO_BESTEFFORT,
179         ECN_OR_COST(BESTEFFORT),
180         TC_PRIO_BESTEFFORT,
181         ECN_OR_COST(BESTEFFORT),
182         TC_PRIO_BULK,
183         ECN_OR_COST(BULK),
184         TC_PRIO_BULK,
185         ECN_OR_COST(BULK),
186         TC_PRIO_INTERACTIVE,
187         ECN_OR_COST(INTERACTIVE),
188         TC_PRIO_INTERACTIVE,
189         ECN_OR_COST(INTERACTIVE),
190         TC_PRIO_INTERACTIVE_BULK,
191         ECN_OR_COST(INTERACTIVE_BULK),
192         TC_PRIO_INTERACTIVE_BULK,
193         ECN_OR_COST(INTERACTIVE_BULK)
194 };
195 EXPORT_SYMBOL(ip_tos2prio);
196
197 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
198 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
199
200 #ifdef CONFIG_PROC_FS
201 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
202 {
203         if (*pos)
204                 return NULL;
205         return SEQ_START_TOKEN;
206 }
207
208 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
209 {
210         ++*pos;
211         return NULL;
212 }
213
/* seq_file ->stop for the rt_cache file: ->start acquires nothing, so nothing to undo. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
217
218 static int rt_cache_seq_show(struct seq_file *seq, void *v)
219 {
220         if (v == SEQ_START_TOKEN)
221                 seq_printf(seq, "%-127s\n",
222                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
223                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
224                            "HHUptod\tSpecDst");
225         return 0;
226 }
227
228 static const struct seq_operations rt_cache_seq_ops = {
229         .start  = rt_cache_seq_start,
230         .next   = rt_cache_seq_next,
231         .stop   = rt_cache_seq_stop,
232         .show   = rt_cache_seq_show,
233 };
234
235 static int rt_cache_seq_open(struct inode *inode, struct file *file)
236 {
237         return seq_open(file, &rt_cache_seq_ops);
238 }
239
240 static const struct proc_ops rt_cache_proc_ops = {
241         .proc_open      = rt_cache_seq_open,
242         .proc_read      = seq_read,
243         .proc_lseek     = seq_lseek,
244         .proc_release   = seq_release,
245 };
246
247
248 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
249 {
250         int cpu;
251
252         if (*pos == 0)
253                 return SEQ_START_TOKEN;
254
255         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
256                 if (!cpu_possible(cpu))
257                         continue;
258                 *pos = cpu+1;
259                 return &per_cpu(rt_cache_stat, cpu);
260         }
261         return NULL;
262 }
263
264 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
265 {
266         int cpu;
267
268         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
269                 if (!cpu_possible(cpu))
270                         continue;
271                 *pos = cpu+1;
272                 return &per_cpu(rt_cache_stat, cpu);
273         }
274         (*pos)++;
275         return NULL;
276
277 }
278
/* seq_file ->stop for the per-CPU statistics file: nothing was acquired, nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
283
284 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
285 {
286         struct rt_cache_stat *st = v;
287
288         if (v == SEQ_START_TOKEN) {
289                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
290                 return 0;
291         }
292
293         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
294                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
295                    dst_entries_get_slow(&ipv4_dst_ops),
296                    0, /* st->in_hit */
297                    st->in_slow_tot,
298                    st->in_slow_mc,
299                    st->in_no_route,
300                    st->in_brd,
301                    st->in_martian_dst,
302                    st->in_martian_src,
303
304                    0, /* st->out_hit */
305                    st->out_slow_tot,
306                    st->out_slow_mc,
307
308                    0, /* st->gc_total */
309                    0, /* st->gc_ignored */
310                    0, /* st->gc_goal_miss */
311                    0, /* st->gc_dst_overflow */
312                    0, /* st->in_hlist_search */
313                    0  /* st->out_hlist_search */
314                 );
315         return 0;
316 }
317
318 static const struct seq_operations rt_cpu_seq_ops = {
319         .start  = rt_cpu_seq_start,
320         .next   = rt_cpu_seq_next,
321         .stop   = rt_cpu_seq_stop,
322         .show   = rt_cpu_seq_show,
323 };
324
325
326 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
327 {
328         return seq_open(file, &rt_cpu_seq_ops);
329 }
330
331 static const struct proc_ops rt_cpu_proc_ops = {
332         .proc_open      = rt_cpu_seq_open,
333         .proc_read      = seq_read,
334         .proc_lseek     = seq_lseek,
335         .proc_release   = seq_release,
336 };
337
338 #ifdef CONFIG_IP_ROUTE_CLASSID
339 static int rt_acct_proc_show(struct seq_file *m, void *v)
340 {
341         struct ip_rt_acct *dst, *src;
342         unsigned int i, j;
343
344         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
345         if (!dst)
346                 return -ENOMEM;
347
348         for_each_possible_cpu(i) {
349                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
350                 for (j = 0; j < 256; j++) {
351                         dst[j].o_bytes   += src[j].o_bytes;
352                         dst[j].o_packets += src[j].o_packets;
353                         dst[j].i_bytes   += src[j].i_bytes;
354                         dst[j].i_packets += src[j].i_packets;
355                 }
356         }
357
358         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
359         kfree(dst);
360         return 0;
361 }
362 #endif
363
364 static int __net_init ip_rt_do_proc_init(struct net *net)
365 {
366         struct proc_dir_entry *pde;
367
368         pde = proc_create("rt_cache", 0444, net->proc_net,
369                           &rt_cache_proc_ops);
370         if (!pde)
371                 goto err1;
372
373         pde = proc_create("rt_cache", 0444,
374                           net->proc_net_stat, &rt_cpu_proc_ops);
375         if (!pde)
376                 goto err2;
377
378 #ifdef CONFIG_IP_ROUTE_CLASSID
379         pde = proc_create_single("rt_acct", 0, net->proc_net,
380                         rt_acct_proc_show);
381         if (!pde)
382                 goto err3;
383 #endif
384         return 0;
385
386 #ifdef CONFIG_IP_ROUTE_CLASSID
387 err3:
388         remove_proc_entry("rt_cache", net->proc_net_stat);
389 #endif
390 err2:
391         remove_proc_entry("rt_cache", net->proc_net);
392 err1:
393         return -ENOMEM;
394 }
395
396 static void __net_exit ip_rt_do_proc_exit(struct net *net)
397 {
398         remove_proc_entry("rt_cache", net->proc_net_stat);
399         remove_proc_entry("rt_cache", net->proc_net);
400 #ifdef CONFIG_IP_ROUTE_CLASSID
401         remove_proc_entry("rt_acct", net->proc_net);
402 #endif
403 }
404
405 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
406         .init = ip_rt_do_proc_init,
407         .exit = ip_rt_do_proc_exit,
408 };
409
410 static int __init ip_rt_proc_init(void)
411 {
412         return register_pernet_subsys(&ip_rt_proc_ops);
413 }
414
415 #else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
420 #endif /* CONFIG_PROC_FS */
421
422 static inline bool rt_is_expired(const struct rtable *rth)
423 {
424         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
425 }
426
/*
 * Flush cached IPv4 routes of @net by bumping the namespace's IPv4
 * generation id; rt_is_expired() then reports older routes as expired.
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
431
432 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
433                                            struct sk_buff *skb,
434                                            const void *daddr)
435 {
436         const struct rtable *rt = container_of(dst, struct rtable, dst);
437         struct net_device *dev = dst->dev;
438         struct neighbour *n;
439
440         rcu_read_lock_bh();
441
442         if (likely(rt->rt_gw_family == AF_INET)) {
443                 n = ip_neigh_gw4(dev, rt->rt_gw4);
444         } else if (rt->rt_gw_family == AF_INET6) {
445                 n = ip_neigh_gw6(dev, &rt->rt_gw6);
446         } else {
447                 __be32 pkey;
448
449                 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
450                 n = ip_neigh_gw4(dev, pkey);
451         }
452
453         if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
454                 n = NULL;
455
456         rcu_read_unlock_bh();
457
458         return n;
459 }
460
461 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
462 {
463         const struct rtable *rt = container_of(dst, struct rtable, dst);
464         struct net_device *dev = dst->dev;
465         const __be32 *pkey = daddr;
466
467         if (rt->rt_gw_family == AF_INET) {
468                 pkey = (const __be32 *)&rt->rt_gw4;
469         } else if (rt->rt_gw_family == AF_INET6) {
470                 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
471         } else if (!daddr ||
472                  (rt->rt_flags &
473                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
474                 return;
475         }
476         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
477 }
478
479 #define IP_IDENTS_SZ 2048u
480
481 static atomic_t *ip_idents __read_mostly;
482 static u32 *ip_tstamps __read_mostly;
483
484 /* In order to protect privacy, we add a perturbation to identifiers
485  * if one generator is seldom used. This makes hard for an attacker
486  * to infer how many packets were sent between two points in time.
487  */
488 u32 ip_idents_reserve(u32 hash, int segs)
489 {
490         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
491         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
492         u32 old = READ_ONCE(*p_tstamp);
493         u32 now = (u32)jiffies;
494         u32 delta = 0;
495
496         if (old != now && cmpxchg(p_tstamp, old, now) == old)
497                 delta = prandom_u32_max(now - old);
498
499         /* If UBSAN reports an error there, please make sure your compiler
500          * supports -fno-strict-overflow before reporting it that was a bug
501          * in UBSAN, and it has been fixed in GCC-8.
502          */
503         return atomic_add_return(segs + delta, p_id) - segs;
504 }
505 EXPORT_SYMBOL(ip_idents_reserve);
506
507 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
508 {
509         u32 hash, id;
510
511         /* Note the following code is not safe, but this is okay. */
512         if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
513                 get_random_bytes(&net->ipv4.ip_id_key,
514                                  sizeof(net->ipv4.ip_id_key));
515
516         hash = siphash_3u32((__force u32)iph->daddr,
517                             (__force u32)iph->saddr,
518                             iph->protocol,
519                             &net->ipv4.ip_id_key);
520         id = ip_idents_reserve(hash, segs);
521         iph->id = htons(id);
522 }
523 EXPORT_SYMBOL(__ip_select_ident);
524
525 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
526                              const struct sock *sk,
527                              const struct iphdr *iph,
528                              int oif, u8 tos,
529                              u8 prot, u32 mark, int flow_flags)
530 {
531         if (sk) {
532                 const struct inet_sock *inet = inet_sk(sk);
533
534                 oif = sk->sk_bound_dev_if;
535                 mark = sk->sk_mark;
536                 tos = RT_CONN_FLAGS(sk);
537                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
538         }
539         flowi4_init_output(fl4, oif, mark, tos,
540                            RT_SCOPE_UNIVERSE, prot,
541                            flow_flags,
542                            iph->daddr, iph->saddr, 0, 0,
543                            sock_net_uid(net, sk));
544 }
545
546 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
547                                const struct sock *sk)
548 {
549         const struct net *net = dev_net(skb->dev);
550         const struct iphdr *iph = ip_hdr(skb);
551         int oif = skb->dev->ifindex;
552         u8 tos = RT_TOS(iph->tos);
553         u8 prot = iph->protocol;
554         u32 mark = skb->mark;
555
556         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
557 }
558
559 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
560 {
561         const struct inet_sock *inet = inet_sk(sk);
562         const struct ip_options_rcu *inet_opt;
563         __be32 daddr = inet->inet_daddr;
564
565         rcu_read_lock();
566         inet_opt = rcu_dereference(inet->inet_opt);
567         if (inet_opt && inet_opt->opt.srr)
568                 daddr = inet_opt->opt.faddr;
569         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
570                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
571                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
572                            inet_sk_flowi_flags(sk),
573                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
574         rcu_read_unlock();
575 }
576
/*
 * Build a flow key from a packet when one is available, otherwise from the
 * socket alone.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}

	build_skb_flow_key(fl4, skb, sk);
}
585
/* Held (BH-disabled) by update_or_create_fnhe() while it modifies nexthop exceptions. */
static DEFINE_SPINLOCK(fnhe_lock);
587
588 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
589 {
590         struct rtable *rt;
591
592         rt = rcu_dereference(fnhe->fnhe_rth_input);
593         if (rt) {
594                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
595                 dst_dev_put(&rt->dst);
596                 dst_release(&rt->dst);
597         }
598         rt = rcu_dereference(fnhe->fnhe_rth_output);
599         if (rt) {
600                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
601                 dst_dev_put(&rt->dst);
602                 dst_release(&rt->dst);
603         }
604 }
605
606 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
607 {
608         struct fib_nh_exception *fnhe, *oldest;
609
610         oldest = rcu_dereference(hash->chain);
611         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
612              fnhe = rcu_dereference(fnhe->fnhe_next)) {
613                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
614                         oldest = fnhe;
615         }
616         fnhe_flush_routes(oldest);
617         return oldest;
618 }
619
620 static inline u32 fnhe_hashfun(__be32 daddr)
621 {
622         static u32 fnhe_hashrnd __read_mostly;
623         u32 hval;
624
625         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
626         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
627         return hash_32(hval, FNHE_HASH_SHIFT);
628 }
629
630 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
631 {
632         rt->rt_pmtu = fnhe->fnhe_pmtu;
633         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
634         rt->dst.expires = fnhe->fnhe_expires;
635
636         if (fnhe->fnhe_gw) {
637                 rt->rt_flags |= RTCF_REDIRECTED;
638                 rt->rt_uses_gateway = 1;
639                 rt->rt_gw_family = AF_INET;
640                 rt->rt_gw4 = fnhe->fnhe_gw;
641         }
642 }
643
644 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
645                                   __be32 gw, u32 pmtu, bool lock,
646                                   unsigned long expires)
647 {
648         struct fnhe_hash_bucket *hash;
649         struct fib_nh_exception *fnhe;
650         struct rtable *rt;
651         u32 genid, hval;
652         unsigned int i;
653         int depth;
654
655         genid = fnhe_genid(dev_net(nhc->nhc_dev));
656         hval = fnhe_hashfun(daddr);
657
658         spin_lock_bh(&fnhe_lock);
659
660         hash = rcu_dereference(nhc->nhc_exceptions);
661         if (!hash) {
662                 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
663                 if (!hash)
664                         goto out_unlock;
665                 rcu_assign_pointer(nhc->nhc_exceptions, hash);
666         }
667
668         hash += hval;
669
670         depth = 0;
671         for (fnhe = rcu_dereference(hash->chain); fnhe;
672              fnhe = rcu_dereference(fnhe->fnhe_next)) {
673                 if (fnhe->fnhe_daddr == daddr)
674                         break;
675                 depth++;
676         }
677
678         if (fnhe) {
679                 if (fnhe->fnhe_genid != genid)
680                         fnhe->fnhe_genid = genid;
681                 if (gw)
682                         fnhe->fnhe_gw = gw;
683                 if (pmtu) {
684                         fnhe->fnhe_pmtu = pmtu;
685                         fnhe->fnhe_mtu_locked = lock;
686                 }
687                 fnhe->fnhe_expires = max(1UL, expires);
688                 /* Update all cached dsts too */
689                 rt = rcu_dereference(fnhe->fnhe_rth_input);
690                 if (rt)
691                         fill_route_from_fnhe(rt, fnhe);
692                 rt = rcu_dereference(fnhe->fnhe_rth_output);
693                 if (rt)
694                         fill_route_from_fnhe(rt, fnhe);
695         } else {
696                 if (depth > FNHE_RECLAIM_DEPTH)
697                         fnhe = fnhe_oldest(hash);
698                 else {
699                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
700                         if (!fnhe)
701                                 goto out_unlock;
702
703                         fnhe->fnhe_next = hash->chain;
704                         rcu_assign_pointer(hash->chain, fnhe);
705                 }
706                 fnhe->fnhe_genid = genid;
707                 fnhe->fnhe_daddr = daddr;
708                 fnhe->fnhe_gw = gw;
709                 fnhe->fnhe_pmtu = pmtu;
710                 fnhe->fnhe_mtu_locked = lock;
711                 fnhe->fnhe_expires = max(1UL, expires);
712
713                 /* Exception created; mark the cached routes for the nexthop
714                  * stale, so anyone caching it rechecks if this exception
715                  * applies to them.
716                  */
717                 rt = rcu_dereference(nhc->nhc_rth_input);
718                 if (rt)
719                         rt->dst.obsolete = DST_OBSOLETE_KILL;
720
721                 for_each_possible_cpu(i) {
722                         struct rtable __rcu **prt;
723                         prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
724                         rt = rcu_dereference(*prt);
725                         if (rt)
726                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
727                 }
728         }
729
730         fnhe->fnhe_stamp = jiffies;
731
732 out_unlock:
733         spin_unlock_bh(&fnhe_lock);
734 }
735
736 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
737                              bool kill_route)
738 {
739         __be32 new_gw = icmp_hdr(skb)->un.gateway;
740         __be32 old_gw = ip_hdr(skb)->saddr;
741         struct net_device *dev = skb->dev;
742         struct in_device *in_dev;
743         struct fib_result res;
744         struct neighbour *n;
745         struct net *net;
746
747         switch (icmp_hdr(skb)->code & 7) {
748         case ICMP_REDIR_NET:
749         case ICMP_REDIR_NETTOS:
750         case ICMP_REDIR_HOST:
751         case ICMP_REDIR_HOSTTOS:
752                 break;
753
754         default:
755                 return;
756         }
757
758         if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
759                 return;
760
761         in_dev = __in_dev_get_rcu(dev);
762         if (!in_dev)
763                 return;
764
765         net = dev_net(dev);
766         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
767             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
768             ipv4_is_zeronet(new_gw))
769                 goto reject_redirect;
770
771         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
772                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
773                         goto reject_redirect;
774                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
775                         goto reject_redirect;
776         } else {
777                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
778                         goto reject_redirect;
779         }
780
781         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
782         if (!n)
783                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
784         if (!IS_ERR(n)) {
785                 if (!(n->nud_state & NUD_VALID)) {
786                         neigh_event_send(n, NULL);
787                 } else {
788                         if (fib_lookup(net, fl4, &res, 0) == 0) {
789                                 struct fib_nh_common *nhc = FIB_RES_NHC(res);
790
791                                 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
792                                                 0, false,
793                                                 jiffies + ip_rt_gc_timeout);
794                         }
795                         if (kill_route)
796                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
797                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
798                 }
799                 neigh_release(n);
800         }
801         return;
802
803 reject_redirect:
804 #ifdef CONFIG_IP_ROUTE_VERBOSE
805         if (IN_DEV_LOG_MARTIANS(in_dev)) {
806                 const struct iphdr *iph = (const struct iphdr *) skb->data;
807                 __be32 daddr = iph->daddr;
808                 __be32 saddr = iph->saddr;
809
810                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
811                                      "  Advised path = %pI4 -> %pI4\n",
812                                      &old_gw, dev->name, &new_gw,
813                                      &saddr, &daddr);
814         }
815 #endif
816         ;
817 }
818
819 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
820 {
821         struct rtable *rt;
822         struct flowi4 fl4;
823         const struct iphdr *iph = (const struct iphdr *) skb->data;
824         struct net *net = dev_net(skb->dev);
825         int oif = skb->dev->ifindex;
826         u8 tos = RT_TOS(iph->tos);
827         u8 prot = iph->protocol;
828         u32 mark = skb->mark;
829
830         rt = (struct rtable *) dst;
831
832         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
833         __ip_do_redirect(rt, skb, &fl4, true);
834 }
835
836 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
837 {
838         struct rtable *rt = (struct rtable *)dst;
839         struct dst_entry *ret = dst;
840
841         if (rt) {
842                 if (dst->obsolete > 0) {
843                         ip_rt_put(rt);
844                         ret = NULL;
845                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
846                            rt->dst.expires) {
847                         ip_rt_put(rt);
848                         ret = NULL;
849                 }
850         }
851         return ret;
852 }
853
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff; after that we stop sending them
 *	   altogether, assuming that the host ignores our redirects.
 *	2. If we see no packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   has forgotten the redirected route and start sending
 *	   redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. needed" (breaks PMTU discovery) in icmp.c.
 */
869
870 void ip_rt_send_redirect(struct sk_buff *skb)
871 {
872         struct rtable *rt = skb_rtable(skb);
873         struct in_device *in_dev;
874         struct inet_peer *peer;
875         struct net *net;
876         int log_martians;
877         int vif;
878
879         rcu_read_lock();
880         in_dev = __in_dev_get_rcu(rt->dst.dev);
881         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
882                 rcu_read_unlock();
883                 return;
884         }
885         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
886         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
887         rcu_read_unlock();
888
889         net = dev_net(rt->dst.dev);
890         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
891         if (!peer) {
892                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
893                           rt_nexthop(rt, ip_hdr(skb)->daddr));
894                 return;
895         }
896
897         /* No redirected packets during ip_rt_redirect_silence;
898          * reset the algorithm.
899          */
900         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
901                 peer->rate_tokens = 0;
902                 peer->n_redirects = 0;
903         }
904
905         /* Too many ignored redirects; do not send anything
906          * set dst.rate_last to the last seen redirected packet.
907          */
908         if (peer->n_redirects >= ip_rt_redirect_number) {
909                 peer->rate_last = jiffies;
910                 goto out_put_peer;
911         }
912
913         /* Check for load limit; set rate_last to the latest sent
914          * redirect.
915          */
916         if (peer->n_redirects == 0 ||
917             time_after(jiffies,
918                        (peer->rate_last +
919                         (ip_rt_redirect_load << peer->n_redirects)))) {
920                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
921
922                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
923                 peer->rate_last = jiffies;
924                 ++peer->n_redirects;
925 #ifdef CONFIG_IP_ROUTE_VERBOSE
926                 if (log_martians &&
927                     peer->n_redirects == ip_rt_redirect_number)
928                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
929                                              &ip_hdr(skb)->saddr, inet_iif(skb),
930                                              &ip_hdr(skb)->daddr, &gw);
931 #endif
932         }
933 out_put_peer:
934         inet_putpeer(peer);
935 }
936
937 static int ip_error(struct sk_buff *skb)
938 {
939         struct rtable *rt = skb_rtable(skb);
940         struct net_device *dev = skb->dev;
941         struct in_device *in_dev;
942         struct inet_peer *peer;
943         unsigned long now;
944         struct net *net;
945         bool send;
946         int code;
947
948         if (netif_is_l3_master(skb->dev)) {
949                 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
950                 if (!dev)
951                         goto out;
952         }
953
954         in_dev = __in_dev_get_rcu(dev);
955
956         /* IP on this device is disabled. */
957         if (!in_dev)
958                 goto out;
959
960         net = dev_net(rt->dst.dev);
961         if (!IN_DEV_FORWARD(in_dev)) {
962                 switch (rt->dst.error) {
963                 case EHOSTUNREACH:
964                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
965                         break;
966
967                 case ENETUNREACH:
968                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
969                         break;
970                 }
971                 goto out;
972         }
973
974         switch (rt->dst.error) {
975         case EINVAL:
976         default:
977                 goto out;
978         case EHOSTUNREACH:
979                 code = ICMP_HOST_UNREACH;
980                 break;
981         case ENETUNREACH:
982                 code = ICMP_NET_UNREACH;
983                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
984                 break;
985         case EACCES:
986                 code = ICMP_PKT_FILTERED;
987                 break;
988         }
989
990         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
991                                l3mdev_master_ifindex(skb->dev), 1);
992
993         send = true;
994         if (peer) {
995                 now = jiffies;
996                 peer->rate_tokens += now - peer->rate_last;
997                 if (peer->rate_tokens > ip_rt_error_burst)
998                         peer->rate_tokens = ip_rt_error_burst;
999                 peer->rate_last = now;
1000                 if (peer->rate_tokens >= ip_rt_error_cost)
1001                         peer->rate_tokens -= ip_rt_error_cost;
1002                 else
1003                         send = false;
1004                 inet_putpeer(peer);
1005         }
1006         if (send)
1007                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1008
1009 out:    kfree_skb(skb);
1010         return 0;
1011 }
1012
1013 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1014 {
1015         struct dst_entry *dst = &rt->dst;
1016         u32 old_mtu = ipv4_mtu(dst);
1017         struct fib_result res;
1018         bool lock = false;
1019
1020         if (ip_mtu_locked(dst))
1021                 return;
1022
1023         if (old_mtu < mtu)
1024                 return;
1025
1026         if (mtu < ip_rt_min_pmtu) {
1027                 lock = true;
1028                 mtu = min(old_mtu, ip_rt_min_pmtu);
1029         }
1030
1031         if (rt->rt_pmtu == mtu && !lock &&
1032             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1033                 return;
1034
1035         rcu_read_lock();
1036         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1037                 struct fib_nh_common *nhc = FIB_RES_NHC(res);
1038
1039                 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1040                                       jiffies + ip_rt_mtu_expires);
1041         }
1042         rcu_read_unlock();
1043 }
1044
1045 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1046                               struct sk_buff *skb, u32 mtu,
1047                               bool confirm_neigh)
1048 {
1049         struct rtable *rt = (struct rtable *) dst;
1050         struct flowi4 fl4;
1051
1052         ip_rt_build_flow_key(&fl4, sk, skb);
1053
1054         /* Don't make lookup fail for bridged encapsulations */
1055         if (skb && netif_is_any_bridge_port(skb->dev))
1056                 fl4.flowi4_oif = 0;
1057
1058         __ip_rt_update_pmtu(rt, &fl4, mtu);
1059 }
1060
1061 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1062                       int oif, u8 protocol)
1063 {
1064         const struct iphdr *iph = (const struct iphdr *) skb->data;
1065         struct flowi4 fl4;
1066         struct rtable *rt;
1067         u32 mark = IP4_REPLY_MARK(net, skb->mark);
1068
1069         __build_flow_key(net, &fl4, NULL, iph, oif,
1070                          RT_TOS(iph->tos), protocol, mark, 0);
1071         rt = __ip_route_output_key(net, &fl4);
1072         if (!IS_ERR(rt)) {
1073                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1074                 ip_rt_put(rt);
1075         }
1076 }
1077 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1078
1079 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1080 {
1081         const struct iphdr *iph = (const struct iphdr *) skb->data;
1082         struct flowi4 fl4;
1083         struct rtable *rt;
1084
1085         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1086
1087         if (!fl4.flowi4_mark)
1088                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1089
1090         rt = __ip_route_output_key(sock_net(sk), &fl4);
1091         if (!IS_ERR(rt)) {
1092                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1093                 ip_rt_put(rt);
1094         }
1095 }
1096
1097 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1098 {
1099         const struct iphdr *iph = (const struct iphdr *) skb->data;
1100         struct flowi4 fl4;
1101         struct rtable *rt;
1102         struct dst_entry *odst = NULL;
1103         bool new = false;
1104         struct net *net = sock_net(sk);
1105
1106         bh_lock_sock(sk);
1107
1108         if (!ip_sk_accept_pmtu(sk))
1109                 goto out;
1110
1111         odst = sk_dst_get(sk);
1112
1113         if (sock_owned_by_user(sk) || !odst) {
1114                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1115                 goto out;
1116         }
1117
1118         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1119
1120         rt = (struct rtable *)odst;
1121         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1122                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1123                 if (IS_ERR(rt))
1124                         goto out;
1125
1126                 new = true;
1127         }
1128
1129         __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1130
1131         if (!dst_check(&rt->dst, 0)) {
1132                 if (new)
1133                         dst_release(&rt->dst);
1134
1135                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1136                 if (IS_ERR(rt))
1137                         goto out;
1138
1139                 new = true;
1140         }
1141
1142         if (new)
1143                 sk_dst_set(sk, &rt->dst);
1144
1145 out:
1146         bh_unlock_sock(sk);
1147         dst_release(odst);
1148 }
1149 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1150
1151 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1152                    int oif, u8 protocol)
1153 {
1154         const struct iphdr *iph = (const struct iphdr *) skb->data;
1155         struct flowi4 fl4;
1156         struct rtable *rt;
1157
1158         __build_flow_key(net, &fl4, NULL, iph, oif,
1159                          RT_TOS(iph->tos), protocol, 0, 0);
1160         rt = __ip_route_output_key(net, &fl4);
1161         if (!IS_ERR(rt)) {
1162                 __ip_do_redirect(rt, skb, &fl4, false);
1163                 ip_rt_put(rt);
1164         }
1165 }
1166 EXPORT_SYMBOL_GPL(ipv4_redirect);
1167
1168 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1169 {
1170         const struct iphdr *iph = (const struct iphdr *) skb->data;
1171         struct flowi4 fl4;
1172         struct rtable *rt;
1173         struct net *net = sock_net(sk);
1174
1175         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1176         rt = __ip_route_output_key(net, &fl4);
1177         if (!IS_ERR(rt)) {
1178                 __ip_do_redirect(rt, skb, &fl4, false);
1179                 ip_rt_put(rt);
1180         }
1181 }
1182 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1183
1184 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1185 {
1186         struct rtable *rt = (struct rtable *) dst;
1187
1188         /* All IPV4 dsts are created with ->obsolete set to the value
1189          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1190          * into this function always.
1191          *
1192          * When a PMTU/redirect information update invalidates a route,
1193          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1194          * DST_OBSOLETE_DEAD.
1195          */
1196         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1197                 return NULL;
1198         return dst;
1199 }
1200
1201 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1202 {
1203         struct ip_options opt;
1204         int res;
1205
1206         /* Recompile ip options since IPCB may not be valid anymore.
1207          * Also check we have a reasonable ipv4 header.
1208          */
1209         if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1210             ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1211                 return;
1212
1213         memset(&opt, 0, sizeof(opt));
1214         if (ip_hdr(skb)->ihl > 5) {
1215                 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1216                         return;
1217                 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1218
1219                 rcu_read_lock();
1220                 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1221                 rcu_read_unlock();
1222
1223                 if (res)
1224                         return;
1225         }
1226         __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1227 }
1228
1229 static void ipv4_link_failure(struct sk_buff *skb)
1230 {
1231         struct rtable *rt;
1232
1233         ipv4_send_dest_unreach(skb);
1234
1235         rt = skb_rtable(skb);
1236         if (rt)
1237                 dst_set_expires(&rt->dst, 0);
1238 }
1239
1240 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1241 {
1242         pr_debug("%s: %pI4 -> %pI4, %s\n",
1243                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1244                  skb->dev ? skb->dev->name : "?");
1245         kfree_skb(skb);
1246         WARN_ON(1);
1247         return 0;
1248 }
1249
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is off the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1258
1259 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1260 {
1261         __be32 src;
1262
1263         if (rt_is_output_route(rt))
1264                 src = ip_hdr(skb)->saddr;
1265         else {
1266                 struct fib_result res;
1267                 struct iphdr *iph = ip_hdr(skb);
1268                 struct flowi4 fl4 = {
1269                         .daddr = iph->daddr,
1270                         .saddr = iph->saddr,
1271                         .flowi4_tos = RT_TOS(iph->tos),
1272                         .flowi4_oif = rt->dst.dev->ifindex,
1273                         .flowi4_iif = skb->dev->ifindex,
1274                         .flowi4_mark = skb->mark,
1275                 };
1276
1277                 rcu_read_lock();
1278                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1279                         src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1280                 else
1281                         src = inet_select_addr(rt->dst.dev,
1282                                                rt_nexthop(rt, iph->daddr),
1283                                                RT_SCOPE_UNIVERSE);
1284                 rcu_read_unlock();
1285         }
1286         memcpy(addr, &src, 4);
1287 }
1288
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid, one 16-bit half at a time: a half
 * is taken from @tag only if that half of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 *classid = &rt->dst.tclassid;

	if ((*classid & 0xFFFF) == 0)
		*classid |= tag & 0xFFFF;
	if ((*classid & 0xFFFF0000) == 0)
		*classid |= tag & 0xFFFF0000;
}
#endif
1298
1299 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1300 {
1301         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1302         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1303                                     ip_rt_min_advmss);
1304
1305         return min(advmss, IPV4_MAX_PMTU - header_size);
1306 }
1307
1308 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1309 {
1310         const struct rtable *rt = (const struct rtable *) dst;
1311         unsigned int mtu = rt->rt_pmtu;
1312
1313         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1314                 mtu = dst_metric_raw(dst, RTAX_MTU);
1315
1316         if (mtu)
1317                 return mtu;
1318
1319         mtu = READ_ONCE(dst->dev->mtu);
1320
1321         if (unlikely(ip_mtu_locked(dst))) {
1322                 if (rt->rt_uses_gateway && mtu > 576)
1323                         mtu = 576;
1324         }
1325
1326         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1327
1328         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1329 }
1330
1331 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1332 {
1333         struct fnhe_hash_bucket *hash;
1334         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1335         u32 hval = fnhe_hashfun(daddr);
1336
1337         spin_lock_bh(&fnhe_lock);
1338
1339         hash = rcu_dereference_protected(nhc->nhc_exceptions,
1340                                          lockdep_is_held(&fnhe_lock));
1341         hash += hval;
1342
1343         fnhe_p = &hash->chain;
1344         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1345         while (fnhe) {
1346                 if (fnhe->fnhe_daddr == daddr) {
1347                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1348                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1349                         /* set fnhe_daddr to 0 to ensure it won't bind with
1350                          * new dsts in rt_bind_exception().
1351                          */
1352                         fnhe->fnhe_daddr = 0;
1353                         fnhe_flush_routes(fnhe);
1354                         kfree_rcu(fnhe, rcu);
1355                         break;
1356                 }
1357                 fnhe_p = &fnhe->fnhe_next;
1358                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1359                                                  lockdep_is_held(&fnhe_lock));
1360         }
1361
1362         spin_unlock_bh(&fnhe_lock);
1363 }
1364
1365 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1366                                                __be32 daddr)
1367 {
1368         struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1369         struct fib_nh_exception *fnhe;
1370         u32 hval;
1371
1372         if (!hash)
1373                 return NULL;
1374
1375         hval = fnhe_hashfun(daddr);
1376
1377         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1378              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1379                 if (fnhe->fnhe_daddr == daddr) {
1380                         if (fnhe->fnhe_expires &&
1381                             time_after(jiffies, fnhe->fnhe_expires)) {
1382                                 ip_del_fnhe(nhc, daddr);
1383                                 break;
1384                         }
1385                         return fnhe;
1386                 }
1387         }
1388         return NULL;
1389 }
1390
1391 /* MTU selection:
1392  * 1. mtu on route is locked - use it
1393  * 2. mtu from nexthop exception
1394  * 3. mtu from egress device
1395  */
1396
1397 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1398 {
1399         struct fib_nh_common *nhc = res->nhc;
1400         struct net_device *dev = nhc->nhc_dev;
1401         struct fib_info *fi = res->fi;
1402         u32 mtu = 0;
1403
1404         if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1405             fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1406                 mtu = fi->fib_mtu;
1407
1408         if (likely(!mtu)) {
1409                 struct fib_nh_exception *fnhe;
1410
1411                 fnhe = find_exception(nhc, daddr);
1412                 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1413                         mtu = fnhe->fnhe_pmtu;
1414         }
1415
1416         if (likely(!mtu))
1417                 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1418
1419         return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1420 }
1421
1422 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1423                               __be32 daddr, const bool do_cache)
1424 {
1425         bool ret = false;
1426
1427         spin_lock_bh(&fnhe_lock);
1428
1429         if (daddr == fnhe->fnhe_daddr) {
1430                 struct rtable __rcu **porig;
1431                 struct rtable *orig;
1432                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1433
1434                 if (rt_is_input_route(rt))
1435                         porig = &fnhe->fnhe_rth_input;
1436                 else
1437                         porig = &fnhe->fnhe_rth_output;
1438                 orig = rcu_dereference(*porig);
1439
1440                 if (fnhe->fnhe_genid != genid) {
1441                         fnhe->fnhe_genid = genid;
1442                         fnhe->fnhe_gw = 0;
1443                         fnhe->fnhe_pmtu = 0;
1444                         fnhe->fnhe_expires = 0;
1445                         fnhe->fnhe_mtu_locked = false;
1446                         fnhe_flush_routes(fnhe);
1447                         orig = NULL;
1448                 }
1449                 fill_route_from_fnhe(rt, fnhe);
1450                 if (!rt->rt_gw4) {
1451                         rt->rt_gw4 = daddr;
1452                         rt->rt_gw_family = AF_INET;
1453                 }
1454
1455                 if (do_cache) {
1456                         dst_hold(&rt->dst);
1457                         rcu_assign_pointer(*porig, rt);
1458                         if (orig) {
1459                                 dst_dev_put(&orig->dst);
1460                                 dst_release(&orig->dst);
1461                         }
1462                         ret = true;
1463                 }
1464
1465                 fnhe->fnhe_stamp = jiffies;
1466         }
1467         spin_unlock_bh(&fnhe_lock);
1468
1469         return ret;
1470 }
1471
1472 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1473 {
1474         struct rtable *orig, *prev, **p;
1475         bool ret = true;
1476
1477         if (rt_is_input_route(rt)) {
1478                 p = (struct rtable **)&nhc->nhc_rth_input;
1479         } else {
1480                 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1481         }
1482         orig = *p;
1483
1484         /* hold dst before doing cmpxchg() to avoid race condition
1485          * on this dst
1486          */
1487         dst_hold(&rt->dst);
1488         prev = cmpxchg(p, orig, rt);
1489         if (prev == orig) {
1490                 if (orig) {
1491                         rt_add_uncached_list(orig);
1492                         dst_release(&orig->dst);
1493                 }
1494         } else {
1495                 dst_release(&rt->dst);
1496                 ret = false;
1497         }
1498
1499         return ret;
1500 }
1501
/* List of routes that are not held in a nexthop cache slot. */
struct uncached_list {
	spinlock_t		lock;	/* taken (BH-safe) around every access to @head */
	struct list_head	head;	/* routes, linked through rtable::rt_uncached */
};

/* One list per CPU; rt_flush_dev() walks the lists of all possible CPUs. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1508
1509 void rt_add_uncached_list(struct rtable *rt)
1510 {
1511         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1512
1513         rt->rt_uncached_list = ul;
1514
1515         spin_lock_bh(&ul->lock);
1516         list_add_tail(&rt->rt_uncached, &ul->head);
1517         spin_unlock_bh(&ul->lock);
1518 }
1519
1520 void rt_del_uncached_list(struct rtable *rt)
1521 {
1522         if (!list_empty(&rt->rt_uncached)) {
1523                 struct uncached_list *ul = rt->rt_uncached_list;
1524
1525                 spin_lock_bh(&ul->lock);
1526                 list_del(&rt->rt_uncached);
1527                 spin_unlock_bh(&ul->lock);
1528         }
1529 }
1530
/*
 * Destructor for an IPv4 dst: drop its metrics reference and take the
 * route off the uncached list if it is on one.
 */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	ip_dst_metrics_put(dst);
	rt_del_uncached_list((struct rtable *)dst);
}
1538
1539 void rt_flush_dev(struct net_device *dev)
1540 {
1541         struct rtable *rt;
1542         int cpu;
1543
1544         for_each_possible_cpu(cpu) {
1545                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1546
1547                 spin_lock_bh(&ul->lock);
1548                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1549                         if (rt->dst.dev != dev)
1550                                 continue;
1551                         rt->dst.dev = blackhole_netdev;
1552                         dev_hold(rt->dst.dev);
1553                         dev_put(dev);
1554                 }
1555                 spin_unlock_bh(&ul->lock);
1556         }
1557 }
1558
1559 static bool rt_cache_valid(const struct rtable *rt)
1560 {
1561         return  rt &&
1562                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1563                 !rt_is_expired(rt);
1564 }
1565
1566 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1567                            const struct fib_result *res,
1568                            struct fib_nh_exception *fnhe,
1569                            struct fib_info *fi, u16 type, u32 itag,
1570                            const bool do_cache)
1571 {
1572         bool cached = false;
1573
1574         if (fi) {
1575                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1576
1577                 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1578                         rt->rt_uses_gateway = 1;
1579                         rt->rt_gw_family = nhc->nhc_gw_family;
1580                         /* only INET and INET6 are supported */
1581                         if (likely(nhc->nhc_gw_family == AF_INET))
1582                                 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1583                         else
1584                                 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1585                 }
1586
1587                 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1588
1589 #ifdef CONFIG_IP_ROUTE_CLASSID
1590                 if (nhc->nhc_family == AF_INET) {
1591                         struct fib_nh *nh;
1592
1593                         nh = container_of(nhc, struct fib_nh, nh_common);
1594                         rt->dst.tclassid = nh->nh_tclassid;
1595                 }
1596 #endif
1597                 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1598                 if (unlikely(fnhe))
1599                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1600                 else if (do_cache)
1601                         cached = rt_cache_route(nhc, rt);
1602                 if (unlikely(!cached)) {
1603                         /* Routes we intend to cache in nexthop exception or
1604                          * FIB nexthop have the DST_NOCACHE bit clear.
1605                          * However, if we are unsuccessful at storing this
1606                          * route into the cache we really need to set it.
1607                          */
1608                         if (!rt->rt_gw4) {
1609                                 rt->rt_gw_family = AF_INET;
1610                                 rt->rt_gw4 = daddr;
1611                         }
1612                         rt_add_uncached_list(rt);
1613                 }
1614         } else
1615                 rt_add_uncached_list(rt);
1616
1617 #ifdef CONFIG_IP_ROUTE_CLASSID
1618 #ifdef CONFIG_IP_MULTIPLE_TABLES
1619         set_class_tag(rt, res->tclassid);
1620 #endif
1621         set_class_tag(rt, itag);
1622 #endif
1623 }
1624
1625 struct rtable *rt_dst_alloc(struct net_device *dev,
1626                             unsigned int flags, u16 type,
1627                             bool nopolicy, bool noxfrm)
1628 {
1629         struct rtable *rt;
1630
1631         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1632                        (nopolicy ? DST_NOPOLICY : 0) |
1633                        (noxfrm ? DST_NOXFRM : 0));
1634
1635         if (rt) {
1636                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1637                 rt->rt_flags = flags;
1638                 rt->rt_type = type;
1639                 rt->rt_is_input = 0;
1640                 rt->rt_iif = 0;
1641                 rt->rt_pmtu = 0;
1642                 rt->rt_mtu_locked = 0;
1643                 rt->rt_uses_gateway = 0;
1644                 rt->rt_gw_family = 0;
1645                 rt->rt_gw4 = 0;
1646                 INIT_LIST_HEAD(&rt->rt_uncached);
1647
1648                 rt->dst.output = ip_output;
1649                 if (flags & RTCF_LOCAL)
1650                         rt->dst.input = ip_local_deliver;
1651         }
1652
1653         return rt;
1654 }
1655 EXPORT_SYMBOL(rt_dst_alloc);
1656
1657 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1658 {
1659         struct rtable *new_rt;
1660
1661         new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1662                            rt->dst.flags);
1663
1664         if (new_rt) {
1665                 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1666                 new_rt->rt_flags = rt->rt_flags;
1667                 new_rt->rt_type = rt->rt_type;
1668                 new_rt->rt_is_input = rt->rt_is_input;
1669                 new_rt->rt_iif = rt->rt_iif;
1670                 new_rt->rt_pmtu = rt->rt_pmtu;
1671                 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1672                 new_rt->rt_gw_family = rt->rt_gw_family;
1673                 if (rt->rt_gw_family == AF_INET)
1674                         new_rt->rt_gw4 = rt->rt_gw4;
1675                 else if (rt->rt_gw_family == AF_INET6)
1676                         new_rt->rt_gw6 = rt->rt_gw6;
1677                 INIT_LIST_HEAD(&new_rt->rt_uncached);
1678
1679                 new_rt->dst.input = rt->dst.input;
1680                 new_rt->dst.output = rt->dst.output;
1681                 new_rt->dst.error = rt->dst.error;
1682                 new_rt->dst.lastuse = jiffies;
1683                 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1684         }
1685         return new_rt;
1686 }
1687 EXPORT_SYMBOL(rt_dst_clone);
1688
1689 /* called in rcu_read_lock() section */
1690 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1691                           u8 tos, struct net_device *dev,
1692                           struct in_device *in_dev, u32 *itag)
1693 {
1694         int err;
1695
1696         /* Primary sanity checks. */
1697         if (!in_dev)
1698                 return -EINVAL;
1699
1700         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1701             skb->protocol != htons(ETH_P_IP))
1702                 return -EINVAL;
1703
1704         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1705                 return -EINVAL;
1706
1707         if (ipv4_is_zeronet(saddr)) {
1708                 if (!ipv4_is_local_multicast(daddr) &&
1709                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1710                         return -EINVAL;
1711         } else {
1712                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1713                                           in_dev, itag);
1714                 if (err < 0)
1715                         return err;
1716         }
1717         return 0;
1718 }
1719
1720 /* called in rcu_read_lock() section */
1721 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1722                              u8 tos, struct net_device *dev, int our)
1723 {
1724         struct in_device *in_dev = __in_dev_get_rcu(dev);
1725         unsigned int flags = RTCF_MULTICAST;
1726         struct rtable *rth;
1727         u32 itag = 0;
1728         int err;
1729
1730         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1731         if (err)
1732                 return err;
1733
1734         if (our)
1735                 flags |= RTCF_LOCAL;
1736
1737         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1738                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1739         if (!rth)
1740                 return -ENOBUFS;
1741
1742 #ifdef CONFIG_IP_ROUTE_CLASSID
1743         rth->dst.tclassid = itag;
1744 #endif
1745         rth->dst.output = ip_rt_bug;
1746         rth->rt_is_input= 1;
1747
1748 #ifdef CONFIG_IP_MROUTE
1749         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1750                 rth->dst.input = ip_mr_input;
1751 #endif
1752         RT_CACHE_STAT_INC(in_slow_mc);
1753
1754         skb_dst_set(skb, &rth->dst);
1755         return 0;
1756 }
1757
1758
1759 static void ip_handle_martian_source(struct net_device *dev,
1760                                      struct in_device *in_dev,
1761                                      struct sk_buff *skb,
1762                                      __be32 daddr,
1763                                      __be32 saddr)
1764 {
1765         RT_CACHE_STAT_INC(in_martian_src);
1766 #ifdef CONFIG_IP_ROUTE_VERBOSE
1767         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1768                 /*
1769                  *      RFC1812 recommendation, if source is martian,
1770                  *      the only hint is MAC header.
1771                  */
1772                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1773                         &daddr, &saddr, dev->name);
1774                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1775                         print_hex_dump(KERN_WARNING, "ll header: ",
1776                                        DUMP_PREFIX_OFFSET, 16, 1,
1777                                        skb_mac_header(skb),
1778                                        dev->hard_header_len, false);
1779                 }
1780         }
1781 #endif
1782 }
1783
1784 /* Build the input route for a packet that will be handed to ip_forward
      * and attach it to @skb, reusing a cached route from the nexthop (or from
      * its exception entry for @daddr) when caching is allowed.
      *
      * Returns 0 on success, -EINVAL when the output device has no in_device
      * or when a non-IP (ARP) packet would leave through its ingress device
      * without IN_DEV_PROXY_ARP_PVLAN, -ENOBUFS when no route can be
      * allocated, or the negative error from fib_validate_source().
      *
      * called in rcu_read_lock() section
      */
1785 static int __mkroute_input(struct sk_buff *skb,
1786                            const struct fib_result *res,
1787                            struct in_device *in_dev,
1788                            __be32 daddr, __be32 saddr, u32 tos)
1789 {
1790         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1791         struct net_device *dev = nhc->nhc_dev;
1792         struct fib_nh_exception *fnhe;
1793         struct rtable *rth;
1794         int err;
1795         struct in_device *out_dev;
1796         bool do_cache;
1797         u32 itag = 0;
1798
1799         /* get a working reference to the output device */
1800         out_dev = __in_dev_get_rcu(dev);
1801         if (!out_dev) {
1802                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1803                 return -EINVAL;
1804         }
1805
             /* A negative result marks the source as martian. A non-zero
              * non-negative result is kept in err: it is one of the conditions
              * for flagging a redirect below.
              */
1806         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1807                                   in_dev->dev, in_dev, &itag);
1808         if (err < 0) {
1809                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1810                                          saddr);
1811
1812                 goto cleanup;
1813         }
1814
             /* Caching requires FIB info and a zero itag. */
1815         do_cache = res->fi && !itag;
             /* Packet would leave through the device it arrived on: mark the
              * skb with IPSKB_DOREDIRECT when the medium is shared or the
              * sender is on-link with respect to the IPv4 gateway.
              */
1816         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1817             skb->protocol == htons(ETH_P_IP)) {
1818                 __be32 gw;
1819
1820                 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1821                 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1822                     inet_addr_onlink(out_dev, saddr, gw))
1823                         IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1824         }
1825
1826         if (skb->protocol != htons(ETH_P_IP)) {
1827                 /* Not IP (i.e. ARP). Do not create route, if it is
1828                  * invalid for proxy arp. DNAT routes are always valid.
1829                  *
1830                  * Proxy arp feature have been extended to allow, ARP
1831                  * replies back to the same interface, to support
1832                  * Private VLAN switch technologies. See arp.c.
1833                  */
1834                 if (out_dev == in_dev &&
1835                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1836                         err = -EINVAL;
1837                         goto cleanup;
1838                 }
1839         }
1840
             /* Try the cached input route first: the exception's if one exists
              * for daddr, otherwise the nexthop's. A valid hit is attached with
              * skb_dst_set_noref().
              */
1841         fnhe = find_exception(nhc, daddr);
1842         if (do_cache) {
1843                 if (fnhe)
1844                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1845                 else
1846                         rth = rcu_dereference(nhc->nhc_rth_input);
1847                 if (rt_cache_valid(rth)) {
1848                         skb_dst_set_noref(skb, &rth->dst);
1849                         goto out;
1850                 }
1851         }
1852
             /* No usable cached route: allocate a fresh one on the output device. */
1853         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1854                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1855                            IN_DEV_CONF_GET(out_dev, NOXFRM));
1856         if (!rth) {
1857                 err = -ENOBUFS;
1858                 goto cleanup;
1859         }
1860
1861         rth->rt_is_input = 1;
1862         RT_CACHE_STAT_INC(in_slow_tot);
1863
1864         rth->dst.input = ip_forward;
1865
1866         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1867                        do_cache);
1868         lwtunnel_set_redirect(&rth->dst);
1869         skb_dst_set(skb, &rth->dst);
1870 out:
             /* err may still hold a positive fib_validate_source() result here. */
1871         err = 0;
1872  cleanup:
1873         return err;
1874 }
1875
1876 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1877 /* To make ICMP packets follow the right flow, the multipath hash is
1878  * calculated from the inner IP addresses.
1879  */
1880 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1881                                  struct flow_keys *hash_keys)
1882 {
1883         const struct iphdr *outer_iph = ip_hdr(skb);
1884         const struct iphdr *key_iph = outer_iph;
1885         const struct iphdr *inner_iph;
1886         const struct icmphdr *icmph;
1887         struct iphdr _inner_iph;
1888         struct icmphdr _icmph;
1889
1890         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1891                 goto out;
1892
1893         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1894                 goto out;
1895
1896         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1897                                    &_icmph);
1898         if (!icmph)
1899                 goto out;
1900
1901         if (!icmp_is_err(icmph->type))
1902                 goto out;
1903
1904         inner_iph = skb_header_pointer(skb,
1905                                        outer_iph->ihl * 4 + sizeof(_icmph),
1906                                        sizeof(_inner_iph), &_inner_iph);
1907         if (!inner_iph)
1908                 goto out;
1909
1910         key_iph = inner_iph;
1911 out:
1912         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1913         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1914 }
1915
1916 /* Compute the multipath hash of a flow according to
      * net->ipv4.sysctl_fib_multipath_hash_policy:
      *   0 - source and destination address only; for an skb the addresses
      *       come from ip_multipath_l3_keys()
      *   1 - addresses, ports and IP protocol; an skb that already carries an
      *       L4 hash short-circuits to that hash, otherwise the skb is
      *       dissected with FLOW_DISSECTOR_F_STOP_AT_ENCAP unless the caller
      *       supplied @flkeys
      *   2 - addresses from a dissection of the skb with no flags (v4, or v6
      *       together with flow label and protocol), falling back to the
      *       policy 0 keys
      *
      * if skb is set it will be used and fl4 can be NULL
      *
      * A non-zero fl4->flowi4_multipath_hash is mixed into the result, except
      * on the L4 hash short-circuit path. The returned value is the hash
      * shifted right by one bit.
      *
      * NOTE(review): the switch has no default case, so hash_keys would be
      * read uninitialised for any other policy value - presumably the sysctl
      * handler limits the value to 0..2; confirm.
      */
1917 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1918                        const struct sk_buff *skb, struct flow_keys *flkeys)
1919 {
1920         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1921         struct flow_keys hash_keys;
1922         u32 mhash;
1923
1924         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1925         case 0:
1926                 memset(&hash_keys, 0, sizeof(hash_keys));
1927                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1928                 if (skb) {
1929                         ip_multipath_l3_keys(skb, &hash_keys);
1930                 } else {
1931                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1932                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1933                 }
1934                 break;
1935         case 1:
1936                 /* skb is currently provided only when forwarding */
1937                 if (skb) {
1938                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1939                         struct flow_keys keys;
1940
1941                         /* short-circuit if we already have L4 hash present */
1942                         if (skb->l4_hash)
1943                                 return skb_get_hash_raw(skb) >> 1;
1944
1945                         memset(&hash_keys, 0, sizeof(hash_keys));
1946
                             /* Dissect only when the caller did not supply keys. */
1947                         if (!flkeys) {
1948                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1949                                 flkeys = &keys;
1950                         }
1951
1952                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1953                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1954                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1955                         hash_keys.ports.src = flkeys->ports.src;
1956                         hash_keys.ports.dst = flkeys->ports.dst;
1957                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1958                 } else {
1959                         memset(&hash_keys, 0, sizeof(hash_keys));
1960                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1961                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1962                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1963                         hash_keys.ports.src = fl4->fl4_sport;
1964                         hash_keys.ports.dst = fl4->fl4_dport;
1965                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1966                 }
1967                 break;
1968         case 2:
1969                 memset(&hash_keys, 0, sizeof(hash_keys));
1970                 /* skb is currently provided only when forwarding */
1971                 if (skb) {
1972                         struct flow_keys keys;
1973
1974                         skb_flow_dissect_flow_keys(skb, &keys, 0);
1975                         /* Inner can be v4 or v6 */
1976                         if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1977                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1978                                 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1979                                 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1980                         } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1981                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1982                                 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1983                                 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1984                                 hash_keys.tags.flow_label = keys.tags.flow_label;
1985                                 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1986                         } else {
1987                                 /* Same as case 0 */
1988                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1989                                 ip_multipath_l3_keys(skb, &hash_keys);
1990                         }
1991                 } else {
1992                         /* Same as case 0 */
1993                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1994                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1995                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1996                 }
1997                 break;
1998         }
1999         mhash = flow_hash_from_keys(&hash_keys);
2000
             /* Fold in the caller-supplied hash from the flow, if any. */
2001         if (multipath_hash)
2002                 mhash = jhash_2words(mhash, multipath_hash, 0);
2003
2004         return mhash >> 1;
2005 }
2006 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2007
2008 static int ip_mkroute_input(struct sk_buff *skb,
2009                             struct fib_result *res,
2010                             struct in_device *in_dev,
2011                             __be32 daddr, __be32 saddr, u32 tos,
2012                             struct flow_keys *hkeys)
2013 {
2014 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2015         if (res->fi && fib_info_num_path(res->fi) > 1) {
2016                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2017
2018                 fib_select_multipath(res, h);
2019         }
2020 #endif
2021
2022         /* create a routing cache entry */
2023         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2024 }
2025
2026 /* Implements all the saddr-related checks as ip_route_input_slow(),
2027  * assuming daddr is valid and the destination is not a local broadcast one.
2028  * Uses the provided hint instead of performing a route lookup.
2029  */
2030 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2031                       u8 tos, struct net_device *dev,
2032                       const struct sk_buff *hint)
2033 {
2034         struct in_device *in_dev = __in_dev_get_rcu(dev);
2035         struct rtable *rt = skb_rtable(hint);
2036         struct net *net = dev_net(dev);
2037         int err = -EINVAL;
2038         u32 tag = 0;
2039
2040         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2041                 goto martian_source;
2042
2043         if (ipv4_is_zeronet(saddr))
2044                 goto martian_source;
2045
2046         if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2047                 goto martian_source;
2048
2049         if (rt->rt_type != RTN_LOCAL)
2050                 goto skip_validate_source;
2051
2052         tos &= IPTOS_RT_MASK;
2053         err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2054         if (err < 0)
2055                 goto martian_source;
2056
2057 skip_validate_source:
2058         skb_dst_copy(skb, hint);
2059         return 0;
2060
2061 martian_source:
2062         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2063         return err;
2064 }
2065
2066 /*
2067  *      NOTE. We drop all the packets that has local source
2068  *      addresses, because every properly looped back packet
2069  *      must have correct destination already attached by output routine.
2070  *      Changes in the enforced policies must be applied also to
2071  *      ip_route_use_hint().
2072  *
2073  *      Such approach solves two big problems:
2074  *      1. Not simplex devices are handled properly.
2075  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2076  *      called with rcu_read_lock()
2077  */
2078
2079 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2080                                u8 tos, struct net_device *dev,
2081                                struct fib_result *res)
2082 {
2083         struct in_device *in_dev = __in_dev_get_rcu(dev);
2084         struct flow_keys *flkeys = NULL, _flkeys;
2085         struct net    *net = dev_net(dev);
2086         struct ip_tunnel_info *tun_info;
2087         int             err = -EINVAL;
2088         unsigned int    flags = 0;
2089         u32             itag = 0;
2090         struct rtable   *rth;
2091         struct flowi4   fl4;
2092         bool do_cache = true;
2093
2094         /* IP on this device is disabled. */
2095
2096         if (!in_dev)
2097                 goto out;
2098
2099         /* Check for the most weird martians, which can be not detected
2100            by fib_lookup.
2101          */
2102
2103         tun_info = skb_tunnel_info(skb);
2104         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2105                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2106         else
2107                 fl4.flowi4_tun_key.tun_id = 0;
2108         skb_dst_drop(skb);
2109
2110         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2111                 goto martian_source;
2112
2113         res->fi = NULL;
2114         res->table = NULL;
2115         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2116                 goto brd_input;
2117
2118         /* Accept zero addresses only to limited broadcast;
2119          * I even do not know to fix it or not. Waiting for complains :-)
2120          */
2121         if (ipv4_is_zeronet(saddr))
2122                 goto martian_source;
2123
2124         if (ipv4_is_zeronet(daddr))
2125                 goto martian_destination;
2126
2127         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2128          * and call it once if daddr or/and saddr are loopback addresses
2129          */
2130         if (ipv4_is_loopback(daddr)) {
2131                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2132                         goto martian_destination;
2133         } else if (ipv4_is_loopback(saddr)) {
2134                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2135                         goto martian_source;
2136         }
2137
2138         /*
2139          *      Now we are ready to route packet.
2140          */
2141         fl4.flowi4_oif = 0;
2142         fl4.flowi4_iif = dev->ifindex;
2143         fl4.flowi4_mark = skb->mark;
2144         fl4.flowi4_tos = tos;
2145         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2146         fl4.flowi4_flags = 0;
2147         fl4.daddr = daddr;
2148         fl4.saddr = saddr;
2149         fl4.flowi4_uid = sock_net_uid(net, NULL);
2150
2151         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2152                 flkeys = &_flkeys;
2153         } else {
2154                 fl4.flowi4_proto = 0;
2155                 fl4.fl4_sport = 0;
2156                 fl4.fl4_dport = 0;
2157         }
2158
2159         err = fib_lookup(net, &fl4, res, 0);
2160         if (err != 0) {
2161                 if (!IN_DEV_FORWARD(in_dev))
2162                         err = -EHOSTUNREACH;
2163                 goto no_route;
2164         }
2165
2166         if (res->type == RTN_BROADCAST) {
2167                 if (IN_DEV_BFORWARD(in_dev))
2168                         goto make_route;
2169                 /* not do cache if bc_forwarding is enabled */
2170                 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2171                         do_cache = false;
2172                 goto brd_input;
2173         }
2174
2175         if (res->type == RTN_LOCAL) {
2176                 err = fib_validate_source(skb, saddr, daddr, tos,
2177                                           0, dev, in_dev, &itag);
2178                 if (err < 0)
2179                         goto martian_source;
2180                 goto local_input;
2181         }
2182
2183         if (!IN_DEV_FORWARD(in_dev)) {
2184                 err = -EHOSTUNREACH;
2185                 goto no_route;
2186         }
2187         if (res->type != RTN_UNICAST)
2188                 goto martian_destination;
2189
2190 make_route:
2191         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2192 out:    return err;
2193
2194 brd_input:
2195         if (skb->protocol != htons(ETH_P_IP))
2196                 goto e_inval;
2197
2198         if (!ipv4_is_zeronet(saddr)) {
2199                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2200                                           in_dev, &itag);
2201                 if (err < 0)
2202                         goto martian_source;
2203         }
2204         flags |= RTCF_BROADCAST;
2205         res->type = RTN_BROADCAST;
2206         RT_CACHE_STAT_INC(in_brd);
2207
2208 local_input:
2209         do_cache &= res->fi && !itag;
2210         if (do_cache) {
2211                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2212
2213                 rth = rcu_dereference(nhc->nhc_rth_input);
2214                 if (rt_cache_valid(rth)) {
2215                         skb_dst_set_noref(skb, &rth->dst);
2216                         err = 0;
2217                         goto out;
2218                 }
2219         }
2220
2221         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2222                            flags | RTCF_LOCAL, res->type,
2223                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2224         if (!rth)
2225                 goto e_nobufs;
2226
2227         rth->dst.output= ip_rt_bug;
2228 #ifdef CONFIG_IP_ROUTE_CLASSID
2229         rth->dst.tclassid = itag;
2230 #endif
2231         rth->rt_is_input = 1;
2232
2233         RT_CACHE_STAT_INC(in_slow_tot);
2234         if (res->type == RTN_UNREACHABLE) {
2235                 rth->dst.input= ip_error;
2236                 rth->dst.error= -err;
2237                 rth->rt_flags   &= ~RTCF_LOCAL;
2238         }
2239
2240         if (do_cache) {
2241                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2242
2243                 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2244                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2245                         WARN_ON(rth->dst.input == lwtunnel_input);
2246                         rth->dst.lwtstate->orig_input = rth->dst.input;
2247                         rth->dst.input = lwtunnel_input;
2248                 }
2249
2250                 if (unlikely(!rt_cache_route(nhc, rth)))
2251                         rt_add_uncached_list(rth);
2252         }
2253         skb_dst_set(skb, &rth->dst);
2254         err = 0;
2255         goto out;
2256
2257 no_route:
2258         RT_CACHE_STAT_INC(in_no_route);
2259         res->type = RTN_UNREACHABLE;
2260         res->fi = NULL;
2261         res->table = NULL;
2262         goto local_input;
2263
2264         /*
2265          *      Do not cache martian addresses: they should be logged (RFC1812)
2266          */
2267 martian_destination:
2268         RT_CACHE_STAT_INC(in_martian_dst);
2269 #ifdef CONFIG_IP_ROUTE_VERBOSE
2270         if (IN_DEV_LOG_MARTIANS(in_dev))
2271                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2272                                      &daddr, &saddr, dev->name);
2273 #endif
2274
2275 e_inval:
2276         err = -EINVAL;
2277         goto out;
2278
2279 e_nobufs:
2280         err = -ENOBUFS;
2281         goto out;
2282
2283 martian_source:
2284         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2285         goto out;
2286 }
2287
2288 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2289                          u8 tos, struct net_device *dev)
2290 {
2291         struct fib_result res;
2292         int err;
2293
2294         tos &= IPTOS_RT_MASK;
2295         rcu_read_lock();
2296         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2297         rcu_read_unlock();
2298
2299         return err;
2300 }
2301 EXPORT_SYMBOL(ip_route_input_noref);
2302
2303 /* called with rcu_read_lock held */
2304 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2305                        u8 tos, struct net_device *dev, struct fib_result *res)
2306 {
2307         /* Multicast recognition logic is moved from route cache to here.
2308            The problem was that too many Ethernet cards have broken/missing
2309            hardware multicast filters :-( As result the host on multicasting
2310            network acquires a lot of useless route cache entries, sort of
2311            SDR messages from all the world. Now we try to get rid of them.
2312            Really, provided software IP multicast filter is organized
2313            reasonably (at least, hashed), it does not result in a slowdown
2314            comparing with route cache reject entries.
2315            Note, that multicast routers are not affected, because
2316            route cache entry is created eventually.
2317          */
2318         if (ipv4_is_multicast(daddr)) {
2319                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2320                 int our = 0;
2321                 int err = -EINVAL;
2322
2323                 if (!in_dev)
2324                         return err;
2325                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2326                                       ip_hdr(skb)->protocol);
2327
2328                 /* check l3 master if no match yet */
2329                 if (!our && netif_is_l3_slave(dev)) {
2330                         struct in_device *l3_in_dev;
2331
2332                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2333                         if (l3_in_dev)
2334                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2335                                                       ip_hdr(skb)->protocol);
2336                 }
2337
2338                 if (our
2339 #ifdef CONFIG_IP_MROUTE
2340                         ||
2341                     (!ipv4_is_local_multicast(daddr) &&
2342                      IN_DEV_MFORWARD(in_dev))
2343 #endif
2344                    ) {
2345                         err = ip_route_input_mc(skb, daddr, saddr,
2346                                                 tos, dev, our);
2347                 }
2348                 return err;
2349         }
2350
2351         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2352 }
2353
2354 /* Build the output route for @fl4 through @dev_out, or return a cached
      * one (the exception's for fl4->daddr, else the nexthop's per-CPU route)
      * when caching is allowed and a reference can be taken on it.
      *
      * Returns the route, or ERR_PTR(-EINVAL) when @dev_out has no in_device,
      * a loopback source would leave through a non-loopback device, or the
      * destination is in the zero network, or ERR_PTR(-ENOBUFS) when no route
      * can be allocated.
      *
      * called with rcu_read_lock()
      */
2355 static struct rtable *__mkroute_output(const struct fib_result *res,
2356                                        const struct flowi4 *fl4, int orig_oif,
2357                                        struct net_device *dev_out,
2358                                        unsigned int flags)
2359 {
2360         struct fib_info *fi = res->fi;
2361         struct fib_nh_exception *fnhe;
2362         struct in_device *in_dev;
2363         u16 type = res->type;
2364         struct rtable *rth;
2365         bool do_cache;
2366
2367         in_dev = __in_dev_get_rcu(dev_out);
2368         if (!in_dev)
2369                 return ERR_PTR(-EINVAL);
2370
             /* Unless IN_DEV_ROUTE_LOCALNET() is set, a loopback source may only
              * leave through a loopback or l3 master device.
              */
2371         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2372                 if (ipv4_is_loopback(fl4->saddr) &&
2373                     !(dev_out->flags & IFF_LOOPBACK) &&
2374                     !netif_is_l3_master(dev_out))
2375                         return ERR_PTR(-EINVAL);
2376
             /* The destination address overrides the route type of the lookup. */
2377         if (ipv4_is_lbcast(fl4->daddr))
2378                 type = RTN_BROADCAST;
2379         else if (ipv4_is_multicast(fl4->daddr))
2380                 type = RTN_MULTICAST;
2381         else if (ipv4_is_zeronet(fl4->daddr))
2382                 return ERR_PTR(-EINVAL);
2383
2384         if (dev_out->flags & IFF_LOOPBACK)
2385                 flags |= RTCF_LOCAL;
2386
2387         do_cache = true;
2388         if (type == RTN_BROADCAST) {
2389                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2390                 fi = NULL;
2391         } else if (type == RTN_MULTICAST) {
                     /* RTCF_LOCAL stays set only when ip_check_mc_rcu() matches
                      * the group on this device; such routes are not cached.
                      */
2392                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2393                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2394                                      fl4->flowi4_proto))
2395                         flags &= ~RTCF_LOCAL;
2396                 else
2397                         do_cache = false;
2398                 /* If multicast route do not exist use
2399                  * default one, but do not gateway in this case.
2400                  * Yes, it is hack.
2401                  */
2402                 if (fi && res->prefixlen < 4)
2403                         fi = NULL;
2404         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2405                    (orig_oif != dev_out->ifindex)) {
2406                 /* For local routes that require a particular output interface
2407                  * we do not want to cache the result.  Caching the result
2408                  * causes incorrect behaviour when there are multiple source
2409                  * addresses on the interface, the end result being that if the
2410                  * intended recipient is waiting on that interface for the
2411                  * packet he won't receive it because it will be delivered on
2412                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2413                  * be set to the loopback interface as well.
2414                  */
2415                 do_cache = false;
2416         }
2417
2418         fnhe = NULL;
             /* Without FIB info there is no nexthop to hang a cached route on. */
2419         do_cache &= fi != NULL;
2420         if (fi) {
2421                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2422                 struct rtable __rcu **prth;
2423
                     /* The exception is looked up even when caching is off: it
                      * is still passed to rt_set_nexthop() below.
                      */
2424                 fnhe = find_exception(nhc, fl4->daddr);
2425                 if (!do_cache)
2426                         goto add;
2427                 if (fnhe) {
2428                         prth = &fnhe->fnhe_rth_output;
2429                 } else {
                             /* With FLOWI_FLAG_KNOWN_NH, only a nexthop that has
                              * a gateway at RT_SCOPE_LINK may use the cache.
                              */
2430                         if (unlikely(fl4->flowi4_flags &
2431                                      FLOWI_FLAG_KNOWN_NH &&
2432                                      !(nhc->nhc_gw_family &&
2433                                        nhc->nhc_scope == RT_SCOPE_LINK))) {
2434                                 do_cache = false;
2435                                 goto add;
2436                         }
2437                         prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2438                 }
                     /* A cached route is returned only if it is still valid and
                      * dst_hold_safe() succeeds on it.
                      */
2439                 rth = rcu_dereference(*prth);
2440                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2441                         return rth;
2442         }
2443
2444 add:
2445         rth = rt_dst_alloc(dev_out, flags, type,
2446                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2447                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2448         if (!rth)
2449                 return ERR_PTR(-ENOBUFS);
2450
2451         rth->rt_iif = orig_oif;
2452
2453         RT_CACHE_STAT_INC(out_slow_tot);
2454
2455         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                     /* Locally relevant broadcast/multicast leaving through a
                      * non-loopback device is sent via ip_mc_output.
                      */
2456                 if (flags & RTCF_LOCAL &&
2457                     !(dev_out->flags & IFF_LOOPBACK)) {
2458                         rth->dst.output = ip_mc_output;
2459                         RT_CACHE_STAT_INC(out_slow_mc);
2460                 }
2461 #ifdef CONFIG_IP_MROUTE
2462                 if (type == RTN_MULTICAST) {
2463                         if (IN_DEV_MFORWARD(in_dev) &&
2464                             !ipv4_is_local_multicast(fl4->daddr)) {
2465                                 rth->dst.input = ip_mr_input;
2466                                 rth->dst.output = ip_mc_output;
2467                         }
2468                 }
2469 #endif
2470         }
2471
2472         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2473         lwtunnel_set_redirect(&rth->dst);
2474
2475         return rth;
2476 }
2477
2478 /*
2479  * Major route resolver routine.
2480  */
2481
/*
 * ip_route_output_key_hash - resolve an output route for the flow @fl4.
 *
 * @net: namespace handed on to the lookup
 * @fl4: flow key; it is modified here (iif, tos, scope) and may be modified
 *       further by ip_route_output_key_hash_rcu()
 * @skb: handed on unchanged to ip_route_output_key_hash_rcu()
 *
 * Normalises the flow key, then performs the lookup inside an RCU read-side
 * critical section taken by this function.  A local fib_result is used and
 * discarded, so callers only get the route back.
 *
 * Returns the value of ip_route_output_key_hash_rcu(): a route, or an
 * ERR_PTR() encoded error.
 */
2482 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2483                                         const struct sk_buff *skb)
2484 {
2485         __u8 tos = RT_FL_TOS(fl4);
2486         struct fib_result res = {
2487                 .type           = RTN_UNSPEC,
2488                 .fi             = NULL,
2489                 .table          = NULL,
2490                 .tclassid       = 0,
2491         };
2492         struct rtable *rth;
2493
             /* Output lookups always use the loopback index as input interface. */
2494         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2495         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
             /* RTO_ONLINK in the TOS value restricts the lookup to link scope. */
2496         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2497                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2498
2499         rcu_read_lock();
2500         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2501         rcu_read_unlock();
2502
2503         return rth;
2504 }
2505 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2506
/*
 * ip_route_output_key_hash_rcu - core of the output route lookup.
 *
 * @net: namespace used for device and FIB lookups
 * @fl4: flow key; saddr, daddr and flowi4_oif may be filled in or rewritten
 * @res: FIB result storage owned by the caller; updated by the lookup
 * @skb: passed to fib_select_path()
 *
 * Uses the _rcu device lookup helpers; the callers visible in this part of
 * the file invoke it between rcu_read_lock() and rcu_read_unlock().
 *
 * Order of decisions:
 *  1. A given source address must not be multicast, limited broadcast or
 *     zeronet (-EINVAL).  Without an oif and with a multicast/limited
 *     broadcast destination, the device owning the source address becomes
 *     the output device.  Unless FLOWI_FLAG_ANYSRC is set, the source
 *     address must be found by __ip_dev_find() (-ENETUNREACH otherwise).
 *  2. A given oif must exist (-ENODEV), be IFF_UP and have an in_device
 *     (-ENETUNREACH).  Local multicast, limited broadcast and IGMP flows
 *     skip the FIB lookup entirely.
 *  3. An empty destination is routed to the loopback device as RTN_LOCAL.
 *  4. Otherwise fib_lookup() decides; on failure with an oif set the
 *     destination may still be treated as on-link (see comment below).
 *  5. RTN_LOCAL results are sent via the l3mdev master or loopback device.
 *  6. Everything else goes through fib_select_path().
 *
 * Returns the route built by __mkroute_output(), or an ERR_PTR() value.
 */
2507 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2508                                             struct fib_result *res,
2509                                             const struct sk_buff *skb)
2510 {
2511         struct net_device *dev_out = NULL;
2512         int orig_oif = fl4->flowi4_oif;
2513         unsigned int flags = 0;
2514         struct rtable *rth;
2515         int err;
2516
2517         if (fl4->saddr) {
2518                 if (ipv4_is_multicast(fl4->saddr) ||
2519                     ipv4_is_lbcast(fl4->saddr) ||
2520                     ipv4_is_zeronet(fl4->saddr)) {
2521                         rth = ERR_PTR(-EINVAL);
2522                         goto out;
2523                 }
2524
                     /* Default error for the "goto out" exits in this branch. */
2525                 rth = ERR_PTR(-ENETUNREACH);
2526
2527                 /* I removed check for oif == dev_out->oif here.
2528                    It was wrong for two reasons:
2529                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2530                       is assigned to multiple interfaces.
2531                    2. Moreover, we are allowed to send packets with saddr
2532                       of another iface. --ANK
2533                  */
2534
2535                 if (fl4->flowi4_oif == 0 &&
2536                     (ipv4_is_multicast(fl4->daddr) ||
2537                      ipv4_is_lbcast(fl4->daddr))) {
2538                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2539                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2540                         if (!dev_out)
2541                                 goto out;
2542
2543                         /* Special hack: user can direct multicasts
2544                            and limited broadcast via necessary interface
2545                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2546                            This hack is not just for fun, it allows
2547                            vic,vat and friends to work.
2548                            They bind socket to loopback, set ttl to zero
2549                            and expect that it will work.
2550                            From the viewpoint of routing cache they are broken,
2551                            because we are not allowed to build multicast path
2552                            with loopback source addr (look, routing cache
2553                            cannot know, that ttl is zero, so that packet
2554                            will not leave this host and route is valid).
2555                            Luckily, this hack is good workaround.
2556                          */
2557
2558                         fl4->flowi4_oif = dev_out->ifindex;
2559                         goto make_route;
2560                 }
2561
2562                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2563                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2564                         if (!__ip_dev_find(net, fl4->saddr, false))
2565                                 goto out;
2566                 }
2567         }
2568
2569
             /* Caller (or the hack above) pinned the output interface. */
2570         if (fl4->flowi4_oif) {
2571                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2572                 rth = ERR_PTR(-ENODEV);
2573                 if (!dev_out)
2574                         goto out;
2575
2576                 /* RACE: Check return value of inet_select_addr instead. */
2577                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2578                         rth = ERR_PTR(-ENETUNREACH);
2579                         goto out;
2580                 }
                     /* These destinations bypass the FIB lookup below. */
2581                 if (ipv4_is_local_multicast(fl4->daddr) ||
2582                     ipv4_is_lbcast(fl4->daddr) ||
2583                     fl4->flowi4_proto == IPPROTO_IGMP) {
2584                         if (!fl4->saddr)
2585                                 fl4->saddr = inet_select_addr(dev_out, 0,
2586                                                               RT_SCOPE_LINK);
2587                         goto make_route;
2588                 }
2589                 if (!fl4->saddr) {
2590                         if (ipv4_is_multicast(fl4->daddr))
2591                                 fl4->saddr = inet_select_addr(dev_out, 0,
2592                                                               fl4->flowi4_scope);
2593                         else if (!fl4->daddr)
2594                                 fl4->saddr = inet_select_addr(dev_out, 0,
2595                                                               RT_SCOPE_HOST);
2596                 }
2597         }
2598
             /*
              * No destination: deliver locally through the loopback device,
              * using the source address (or INADDR_LOOPBACK if there is none)
              * as destination.
              */
2599         if (!fl4->daddr) {
2600                 fl4->daddr = fl4->saddr;
2601                 if (!fl4->daddr)
2602                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2603                 dev_out = net->loopback_dev;
2604                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2605                 res->type = RTN_LOCAL;
2606                 flags |= RTCF_LOCAL;
2607                 goto make_route;
2608         }
2609
2610         err = fib_lookup(net, fl4, res, 0);
2611         if (err) {
2612                 res->fi = NULL;
2613                 res->table = NULL;
                     /*
                      * Fallback applies only with an oif, and then only for a
                      * multicast destination or an oif that is not an L3
                      * master device.  dev_out was set in the oif branch above.
                      */
2614                 if (fl4->flowi4_oif &&
2615                     (ipv4_is_multicast(fl4->daddr) ||
2616                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2617                         /* Apparently, routing tables are wrong. Assume,
2618                            that the destination is on link.
2619
2620                            WHY? DW.
2621                            Because we are allowed to send to iface
2622                            even if it has NO routes and NO assigned
2623                            addresses. When oif is specified, routing
2624                            tables are looked up with only one purpose:
2625                            to catch if destination is gatewayed, rather than
2626                            direct. Moreover, if MSG_DONTROUTE is set,
2627                            we send packet, ignoring both routing tables
2628                            and ifaddr state. --ANK
2629
2630
2631                            We could make it even if oif is unknown,
2632                            likely IPv6, but we do not.
2633                          */
2634
2635                         if (fl4->saddr == 0)
2636                                 fl4->saddr = inet_select_addr(dev_out, 0,
2637                                                               RT_SCOPE_LINK);
2638                         res->type = RTN_UNICAST;
2639                         goto make_route;
2640                 }
2641                 rth = ERR_PTR(err);
2642                 goto out;
2643         }
2644
2645         if (res->type == RTN_LOCAL) {
2646                 if (!fl4->saddr) {
2647                         if (res->fi->fib_prefsrc)
2648                                 fl4->saddr = res->fi->fib_prefsrc;
2649                         else
2650                                 fl4->saddr = fl4->daddr;
2651                 }
2652
2653                 /* L3 master device is the loopback for that domain */
2654                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2655                         net->loopback_dev;
2656
2657                 /* make sure orig_oif points to fib result device even
2658                  * though packet rx/tx happens over loopback or l3mdev
2659                  */
2660                 orig_oif = FIB_RES_OIF(*res);
2661
2662                 fl4->flowi4_oif = dev_out->ifindex;
2663                 flags |= RTCF_LOCAL;
2664                 goto make_route;
2665         }
2666
2667         fib_select_path(net, res, fl4, skb);
2668
2669         dev_out = FIB_RES_DEV(*res);
2670         fl4->flowi4_oif = dev_out->ifindex;
2671
2672
2673 make_route:
2674         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2675
2676 out:
2677         return rth;
2678 }
2679
/*
 * .check callback of ipv4_dst_blackhole_ops: unconditionally returns NULL,
 * ignoring both @dst and @cookie.
 */
2680 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2681 {
2682         return NULL;
2683 }
2684
/*
 * .mtu callback of ipv4_dst_blackhole_ops: returns the raw RTAX_MTU metric
 * of @dst when it is non-zero, otherwise the MTU of the dst's device.
 */
2685 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2686 {
2687         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2688
2689         return mtu ? : dst->dev->mtu;
2690 }
2691
/*
 * .update_pmtu callback of ipv4_dst_blackhole_ops: intentionally empty, so
 * PMTU updates have no effect on a blackhole route.
 */
2692 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2693                                           struct sk_buff *skb, u32 mtu,
2694                                           bool confirm_neigh)
2695 {
2696 }
2697
/*
 * .redirect callback of ipv4_dst_blackhole_ops: intentionally empty, so
 * redirects have no effect on a blackhole route.
 */
2698 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2699                                        struct sk_buff *skb)
2700 {
2701 }
2702
/*
 * .cow_metrics callback of ipv4_dst_blackhole_ops: always returns NULL and
 * never allocates or copies a metrics array.
 */
2703 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2704                                           unsigned long old)
2705 {
2706         return NULL;
2707 }
2708
/*
 * dst_ops used for the routes created by ipv4_blackhole_route().  The check,
 * mtu, update_pmtu, redirect and cow_metrics callbacks are the blackhole
 * variants defined above in this file; default_advmss and neigh_lookup use
 * the ipv4_default_advmss and ipv4_neigh_lookup handlers.
 */
2709 static struct dst_ops ipv4_dst_blackhole_ops = {
2710         .family                 =       AF_INET,
2711         .check                  =       ipv4_blackhole_dst_check,
2712         .mtu                    =       ipv4_blackhole_mtu,
2713         .default_advmss         =       ipv4_default_advmss,
2714         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2715         .redirect               =       ipv4_rt_blackhole_redirect,
2716         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2717         .neigh_lookup           =       ipv4_neigh_lookup,
2718 };
2719
/*
 * ipv4_blackhole_route - build a packet-discarding copy of @dst_orig.
 *
 * @net:      namespace supplying the loopback device and the route genid
 * @dst_orig: route to copy from; its reference is dropped here in every
 *            case, including allocation failure
 *
 * The new route uses ipv4_dst_blackhole_ops, has dst_discard and
 * dst_discard_out as input/output handlers and is attached to the loopback
 * device of @net.  Routing attributes (iif, pmtu, flags, type, gateway) are
 * copied from the original rtable.
 *
 * Returns the new dst entry, or ERR_PTR(-ENOMEM) if allocation failed.
 */
2720 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2721 {
2722         struct rtable *ort = (struct rtable *) dst_orig;
2723         struct rtable *rt;
2724
2725         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2726         if (rt) {
2727                 struct dst_entry *new = &rt->dst;
2728
2729                 new->__use = 1;
2730                 new->input = dst_discard;
2731                 new->output = dst_discard_out;
2732
                     /* dst_alloc() was given no device; attach loopback by hand. */
2733                 new->dev = net->loopback_dev;
2734                 if (new->dev)
2735                         dev_hold(new->dev);
2736
2737                 rt->rt_is_input = ort->rt_is_input;
2738                 rt->rt_iif = ort->rt_iif;
2739                 rt->rt_pmtu = ort->rt_pmtu;
2740                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2741
                     /* genid comes from the namespace, not from the original route. */
2742                 rt->rt_genid = rt_genid_ipv4(net);
2743                 rt->rt_flags = ort->rt_flags;
2744                 rt->rt_type = ort->rt_type;
2745                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2746                 rt->rt_gw_family = ort->rt_gw_family;
                     /* Copy only the gateway member selected by the family. */
2747                 if (rt->rt_gw_family == AF_INET)
2748                         rt->rt_gw4 = ort->rt_gw4;
2749                 else if (rt->rt_gw_family == AF_INET6)
2750                         rt->rt_gw6 = ort->rt_gw6;
2751
2752                 INIT_LIST_HEAD(&rt->rt_uncached);
2753         }
2754
2755         dst_release(dst_orig);
2756
2757         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2758 }
2759
/*
 * ip_route_output_flow - output route lookup followed by an xfrm lookup.
 *
 * @net:  namespace for both lookups
 * @flp4: flow key, passed to __ip_route_output_key() and, converted with
 *        flowi4_to_flowi(), to xfrm_lookup_route()
 * @sk:   socket handed to xfrm_lookup_route()
 *
 * An error from the route lookup is returned unchanged.  The xfrm step is
 * only taken when the flow carries a non-zero protocol; its result replaces
 * the plain route.
 */
2760 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2761                                     const struct sock *sk)
2762 {
2763         struct rtable *rt = __ip_route_output_key(net, flp4);
2764
2765         if (IS_ERR(rt))
2766                 return rt;
2767
2768         if (flp4->flowi4_proto)
2769                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2770                                                         flowi4_to_flowi(flp4),
2771                                                         sk, 0);
2772
2773         return rt;
2774 }
2775 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2776
/*
 * ip_route_output_tunnel - find the output route for a tunnelled packet.
 *
 * @skb:       packet; only skb->mark is read
 * @dev:       tunnel device, used for debug messages and the loop check
 * @net:       namespace for the route lookup
 * @saddr:     out: source address of the flow after the lookup
 * @info:      tunnel metadata supplying destination, source, TOS and the
 *             dst cache
 * @protocol:  value stored in the flow's protocol field
 * @use_cache: consult and fill info->dst_cache (only with CONFIG_DST_CACHE)
 *
 * Returns the route on success.  Any route lookup failure is reported as
 * ERR_PTR(-ENETUNREACH); a route whose device is @dev itself is released
 * and reported as ERR_PTR(-ELOOP).
 */
2777 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2778                                       struct net_device *dev,
2779                                       struct net *net, __be32 *saddr,
2780                                       const struct ip_tunnel_info *info,
2781                                       u8 protocol, bool use_cache)
2782 {
2783 #ifdef CONFIG_DST_CACHE
2784         struct dst_cache *dst_cache;
2785 #endif
2786         struct rtable *rt = NULL;
2787         struct flowi4 fl4;
2788         __u8 tos;
2789
2790 #ifdef CONFIG_DST_CACHE
             /* The cast drops the const qualifier @info is declared with. */
2791         dst_cache = (struct dst_cache *)&info->dst_cache;
2792         if (use_cache) {
                     /* NOTE(review): *saddr is presumably set by the cache on a hit - confirm. */
2793                 rt = dst_cache_get_ip4(dst_cache, saddr);
2794                 if (rt)
2795                         return rt;
2796         }
2797 #endif
2798         memset(&fl4, 0, sizeof(fl4));
2799         fl4.flowi4_mark = skb->mark;
2800         fl4.flowi4_proto = protocol;
2801         fl4.daddr = info->key.u.ipv4.dst;
2802         fl4.saddr = info->key.u.ipv4.src;
2803         tos = info->key.tos;
2804         fl4.flowi4_tos = RT_TOS(tos);
2805
2806         rt = ip_route_output_key(net, &fl4);
2807         if (IS_ERR(rt)) {
2808                 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2809                 return ERR_PTR(-ENETUNREACH);
2810         }
2811         if (rt->dst.dev == dev) { /* is this necessary? */
2812                 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2813                 ip_rt_put(rt);
2814                 return ERR_PTR(-ELOOP);
2815         }
2816 #ifdef CONFIG_DST_CACHE
2817         if (use_cache)
2818                 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2819 #endif
2820         *saddr = fl4.saddr;
2821         return rt;
2822 }
2823 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2824
/*
 * rt_fill_info - append an RTM_NEWROUTE message describing @rt to @skb.
 *
 * @net:      namespace, used for the multicast forwarding check
 * @dst:      destination address reported in RTA_DST
 * @src:      source address; reported in RTA_SRC only when non-zero
 * @rt:       route to describe
 * @table_id: reported in RTA_TABLE; rtm_table holds it only if below 256,
 *            otherwise RT_TABLE_COMPAT
 * @fl4:      optional flow; when NULL, tos is 0 and the PREFSRC, MARK, UID
 *            and IIF attributes are omitted
 * @skb:      message buffer being filled
 * @portid:   netlink port id for the message header
 * @seq:      netlink sequence number for the message header
 * @flags:    netlink message flags
 *
 * Returns 0 on success, or -EMSGSIZE when the header or an attribute did
 * not fit; on attribute failure the partially built message is cancelled.
 */
2825 /* called with rcu_read_lock held */
2826 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2827                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2828                         struct sk_buff *skb, u32 portid, u32 seq,
2829                         unsigned int flags)
2830 {
2831         struct rtmsg *r;
2832         struct nlmsghdr *nlh;
2833         unsigned long expires = 0;
2834         u32 error;
2835         u32 metrics[RTAX_MAX];
2836
2837         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2838         if (!nlh)
2839                 return -EMSGSIZE;
2840
2841         r = nlmsg_data(nlh);
2842         r->rtm_family    = AF_INET;
2843         r->rtm_dst_len  = 32;
2844         r->rtm_src_len  = 0;
2845         r->rtm_tos      = fl4 ? fl4->flowi4_tos : 0;
2846         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2847         if (nla_put_u32(skb, RTA_TABLE, table_id))
2848                 goto nla_put_failure;
2849         r->rtm_type     = rt->rt_type;
2850         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2851         r->rtm_protocol = RTPROT_UNSPEC;
             /* Low 16 bits of rt_flags are masked out; RTM_F_CLONED is always set. */
2852         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2853         if (rt->rt_flags & RTCF_NOTIFY)
2854                 r->rtm_flags |= RTM_F_NOTIFY;
2855         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2856                 r->rtm_flags |= RTCF_DOREDIRECT;
2857
2858         if (nla_put_in_addr(skb, RTA_DST, dst))
2859                 goto nla_put_failure;
2860         if (src) {
2861                 r->rtm_src_len = 32;
2862                 if (nla_put_in_addr(skb, RTA_SRC, src))
2863                         goto nla_put_failure;
2864         }
2865         if (rt->dst.dev &&
2866             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2867                 goto nla_put_failure;
2868 #ifdef CONFIG_IP_ROUTE_CLASSID
2869         if (rt->dst.tclassid &&
2870             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2871                 goto nla_put_failure;
2872 #endif
             /* PREFSRC only for output routes whose flow source differs from @src. */
2873         if (fl4 && !rt_is_input_route(rt) &&
2874             fl4->saddr != src) {
2875                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2876                         goto nla_put_failure;
2877         }
2878         if (rt->rt_uses_gateway) {
2879                 if (rt->rt_gw_family == AF_INET &&
2880                     nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2881                         goto nla_put_failure;
2882                 } else if (rt->rt_gw_family == AF_INET6) {
                             /* IPv6 gateway is encoded as RTA_VIA: 2 bytes + address. */
2883                         int alen = sizeof(struct in6_addr);
2884                         struct nlattr *nla;
2885                         struct rtvia *via;
2886
2887                         nla = nla_reserve(skb, RTA_VIA, alen + 2);
2888                         if (!nla)
2889                                 goto nla_put_failure;
2890
2891                         via = nla_data(nla);
2892                         via->rtvia_family = AF_INET6;
2893                         memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2894                 }
2895         }
2896
             /* Convert the absolute expiry into time remaining; 0 once it has passed. */
2897         expires = rt->dst.expires;
2898         if (expires) {
2899                 unsigned long now = jiffies;
2900
2901                 if (time_before(now, expires))
2902                         expires -= now;
2903                 else
2904                         expires = 0;
2905         }
2906
             /* Work on a copy so the PMTU overrides below do not touch the dst's metrics. */
2907         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2908         if (rt->rt_pmtu && expires)
2909                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2910         if (rt->rt_mtu_locked && expires)
2911                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2912         if (rtnetlink_put_metrics(skb, metrics) < 0)
2913                 goto nla_put_failure;
2914
2915         if (fl4) {
2916                 if (fl4->flowi4_mark &&
2917                     nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2918                         goto nla_put_failure;
2919
2920                 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2921                     nla_put_u32(skb, RTA_UID,
2922                                 from_kuid_munged(current_user_ns(),
2923                                                  fl4->flowi4_uid)))
2924                         goto nla_put_failure;
2925
2926                 if (rt_is_input_route(rt)) {
2927 #ifdef CONFIG_IP_MROUTE
2928                         if (ipv4_is_multicast(dst) &&
2929                             !ipv4_is_local_multicast(dst) &&
2930                             IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2931                                 int err = ipmr_get_route(net, skb,
2932                                                          fl4->saddr, fl4->daddr,
2933                                                          r, portid);
2934
                                     /*
                                      * 0 returns at once, skipping nlmsg_end();
                                      * negative cancels the message; positive
                                      * continues below.
                                      */
2935                                 if (err <= 0) {
2936                                         if (err == 0)
2937                                                 return 0;
2938                                         goto nla_put_failure;
2939                                 }
2940                         } else
2941 #endif
2942                                 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2943                                         goto nla_put_failure;
2944                 }
2945         }
2946
2947         error = rt->dst.error;
2948
2949         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2950                 goto nla_put_failure;
2951
2952         nlmsg_end(skb, nlh);
2953         return 0;
2954
2955 nla_put_failure:
2956         nlmsg_cancel(skb, nlh);
2957         return -EMSGSIZE;
2958 }
2959
/*
 * fnhe_dump_bucket - dump the next-hop exceptions of one exception table.
 *
 * @net:      namespace passed to rt_fill_info()
 * @skb:      netlink dump buffer
 * @cb:       dump callback state, source of port id and sequence number
 * @table_id: table id reported for every entry
 * @bucket:   array of FNHE_HASH_SIZE hash buckets to walk
 * @genid:    only exceptions carrying this generation id are dumped
 * @fa_index: running entry counter, advanced for every exception visited
 * @fa_start: entries whose counter value is below this are skipped
 * @flags:    netlink message flags for rt_fill_info()
 *
 * The chains are walked with rcu_dereference().  Returns 0 when all buckets
 * were walked, or the first non-zero value returned by rt_fill_info().
 */
2960 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2961                             struct netlink_callback *cb, u32 table_id,
2962                             struct fnhe_hash_bucket *bucket, int genid,
2963                             int *fa_index, int fa_start, unsigned int flags)
2964 {
2965         int i;
2966
2967         for (i = 0; i < FNHE_HASH_SIZE; i++) {
2968                 struct fib_nh_exception *fnhe;
2969
2970                 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2971                      fnhe = rcu_dereference(fnhe->fnhe_next)) {
2972                         struct rtable *rt;
2973                         int err;
2974
2975                         if (*fa_index < fa_start)
2976                                 goto next;
2977
2978                         if (fnhe->fnhe_genid != genid)
2979                                 goto next;
2980
                             /* A zero fnhe_expires is never treated as expired. */
2981                         if (fnhe->fnhe_expires &&
2982                             time_after(jiffies, fnhe->fnhe_expires))
2983                                 goto next;
2984
                             /* Prefer the input route, fall back to the output route. */
2985                         rt = rcu_dereference(fnhe->fnhe_rth_input);
2986                         if (!rt)
2987                                 rt = rcu_dereference(fnhe->fnhe_rth_output);
2988                         if (!rt)
2989                                 goto next;
2990
2991                         err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2992                                            table_id, NULL, skb,
2993                                            NETLINK_CB(cb->skb).portid,
2994                                            cb->nlh->nlmsg_seq, flags);
                             /* On error *fa_index is left pointing at this entry. */
2995                         if (err)
2996                                 return err;
2997 next:
2998                         (*fa_index)++;
2999                 }
3000         }
3001
3002         return 0;
3003 }
3004
/*
 * fib_dump_info_fnhe - dump the next-hop exceptions of every path of @fi.
 *
 * @skb:      netlink dump buffer
 * @cb:       dump callback state; the namespace is taken from cb->skb->sk
 * @table_id: table id reported for every entry
 * @fi:       fib_info whose paths are walked
 * @fa_index: running entry counter shared across paths
 * @fa_start: entries whose counter value is below this are skipped
 * @flags:    netlink message flags
 *
 * Paths flagged RTNH_F_DEAD are skipped.  Each path's exception table is
 * read and dumped under its own rcu_read_lock() section.
 *
 * Returns 0, or the first non-zero value from fnhe_dump_bucket().
 */
3005 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3006                        u32 table_id, struct fib_info *fi,
3007                        int *fa_index, int fa_start, unsigned int flags)
3008 {
3009         struct net *net = sock_net(cb->skb->sk);
             /* Generation id is sampled once and used for all paths. */
3010         int nhsel, genid = fnhe_genid(net);
3011
3012         for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3013                 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3014                 struct fnhe_hash_bucket *bucket;
3015                 int err;
3016
3017                 if (nhc->nhc_flags & RTNH_F_DEAD)
3018                         continue;
3019
3020                 rcu_read_lock();
3021                 bucket = rcu_dereference(nhc->nhc_exceptions);
3022                 err = 0;
3023                 if (bucket)
3024                         err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3025                                                genid, fa_index, fa_start,
3026                                                flags);
3027                 rcu_read_unlock();
3028                 if (err)
3029                         return err;
3030         }
3031
3032         return 0;
3033 }
3034
/*
 * inet_rtm_getroute_build_skb - build a dummy packet for a route query.
 *
 * @src:      source address written into the IP header
 * @dst:      destination address written into the IP header
 * @ip_proto: IP protocol; UDP, TCP and ICMP get a minimal transport header,
 *            any other value gets none
 * @sport:    source port (network byte order) for UDP/TCP
 * @dport:    destination port (network byte order) for UDP/TCP
 *
 * Allocates an skb of NLMSG_GOODSIZE with GFP_KERNEL and fills in just
 * enough of the IPv4 and transport headers for the routing code to look at.
 * Only the IP header fields assigned below are written.
 *
 * Returns the skb, or NULL if the allocation failed.
 */
3035 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3036                                                    u8 ip_proto, __be16 sport,
3037                                                    __be16 dport)
3038 {
3039         struct sk_buff *skb;
3040         struct iphdr *iph;
3041
3042         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3043         if (!skb)
3044                 return NULL;
3045
3046         /* Reserve room for dummy headers, this skb can pass
3047          * through good chunk of routing engine.
3048          */
3049         skb_reset_mac_header(skb);
3050         skb_reset_network_header(skb);
3051         skb->protocol = htons(ETH_P_IP);
3052         iph = skb_put(skb, sizeof(struct iphdr));
3053         iph->protocol = ip_proto;
3054         iph->saddr = src;
3055         iph->daddr = dst;
3056         iph->version = 0x4;
3057         iph->frag_off = 0;
3058         iph->ihl = 0x5;
             /* The transport header starts right after the IP header just added. */
3059         skb_set_transport_header(skb, skb->len);
3060
3061         switch (iph->protocol) {
3062         case IPPROTO_UDP: {
3063                 struct udphdr *udph;
3064
3065                 udph = skb_put_zero(skb, sizeof(struct udphdr));
3066                 udph->source = sport;
3067                 udph->dest = dport;
                     /* The UDP length is a big-endian wire field: convert from host order. */
3068                 udph->len = htons(sizeof(struct udphdr));
3069                 udph->check = 0;
3070                 break;
3071         }
3072         case IPPROTO_TCP: {
3073                 struct tcphdr *tcph;
3074
3075                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3076                 tcph->source    = sport;
3077                 tcph->dest      = dport;
                     /* doff counts 32-bit words. */
3078                 tcph->doff      = sizeof(struct tcphdr) / 4;
3079                 tcph->rst = 1;
3080                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3081                                             src, dst, 0);
3082                 break;
3083         }
3084         case IPPROTO_ICMP: {
3085                 struct icmphdr *icmph;
3086
3087                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3088                 icmph->type = ICMP_ECHO;
3089                 icmph->code = 0;
3090         }
3091         }
3092
3093         return skb;
3094 }
3095
/*
 * inet_rtm_valid_getroute_req - validate and parse a route get request.
 *
 * @skb:    request skb, used for the strict-checking decision
 * @nlh:    netlink header of the request
 * @tb:     out: parsed attributes, indexed by type (0..RTA_MAX)
 * @extack: extended ack for error messages
 *
 * A message too short for struct rtmsg is always rejected.  When
 * netlink_strict_get_check() is false, the attributes are parsed with the
 * non-strict parser and no further checks are made.  Otherwise the header
 * fields, rtm_flags and the set of attributes present are checked as well.
 *
 * Returns 0 on success, -EINVAL for a rejected request, or the error
 * returned by the attribute parser.
 */
3096 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3097                                        const struct nlmsghdr *nlh,
3098                                        struct nlattr **tb,
3099                                        struct netlink_ext_ack *extack)
3100 {
3101         struct rtmsg *rtm;
3102         int i, err;
3103
3104         if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3105                 NL_SET_ERR_MSG(extack,
3106                                "ipv4: Invalid header for route get request");
3107                 return -EINVAL;
3108         }
3109
3110         if (!netlink_strict_get_check(skb))
3111                 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3112                                               rtm_ipv4_policy, extack);
3113
3114         rtm = nlmsg_data(nlh);
             /*
              * Prefix lengths must be 0 or 32; table, protocol, scope and type
              * must all be zero.
              */
3115         if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3116             (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3117             rtm->rtm_table || rtm->rtm_protocol ||
3118             rtm->rtm_scope || rtm->rtm_type) {
3119                 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3120                 return -EINVAL;
3121         }
3122
3123         if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3124                                RTM_F_LOOKUP_TABLE |
3125                                RTM_F_FIB_MATCH)) {
3126                 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3127                 return -EINVAL;
3128         }
3129
3130         err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3131                                             rtm_ipv4_policy, extack);
3132         if (err)
3133                 return err;
3134
             /* An address attribute needs a non-zero (hence 32) header length. */
3135         if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3136             (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3137                 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3138                 return -EINVAL;
3139         }
3140
             /* Reject any attribute outside the list accepted below. */
3141         for (i = 0; i <= RTA_MAX; i++) {
3142                 if (!tb[i])
3143                         continue;
3144
3145                 switch (i) {
3146                 case RTA_IIF:
3147                 case RTA_OIF:
3148                 case RTA_SRC:
3149                 case RTA_DST:
3150                 case RTA_IP_PROTO:
3151                 case RTA_SPORT:
3152                 case RTA_DPORT:
3153                 case RTA_MARK:
3154                 case RTA_UID:
3155                         break;
3156                 default:
3157                         NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3158                         return -EINVAL;
3159                 }
3160         }
3161
3162         return 0;
3163 }
3164
3165 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3166                              struct netlink_ext_ack *extack)
3167 {
3168         struct net *net = sock_net(in_skb->sk);
3169         struct nlattr *tb[RTA_MAX+1];
3170         u32 table_id = RT_TABLE_MAIN;
3171         __be16 sport = 0, dport = 0;
3172         struct fib_result res = {};
3173         u8 ip_proto = IPPROTO_UDP;
3174         struct rtable *rt = NULL;
3175         struct sk_buff *skb;
3176         struct rtmsg *rtm;
3177         struct flowi4 fl4 = {};
3178         __be32 dst = 0;
3179         __be32 src = 0;
3180         kuid_t uid;
3181         u32 iif;
3182         int err;
3183         int mark;
3184
3185         err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3186         if (err < 0)
3187                 return err;
3188
3189         rtm = nlmsg_data(nlh);
3190         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3191         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3192         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3193         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3194         if (tb[RTA_UID])
3195                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3196         else
3197                 uid = (iif ? INVALID_UID : current_uid());
3198
3199         if (tb[RTA_IP_PROTO]) {
3200                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3201                                                   &ip_proto, AF_INET, extack);
3202                 if (err)
3203                         return err;
3204         }
3205
3206         if (tb[RTA_SPORT])
3207                 sport = nla_get_be16(tb[RTA_SPORT]);
3208
3209         if (tb[RTA_DPORT])
3210                 dport = nla_get_be16(tb[RTA_DPORT]);
3211
3212         skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3213         if (!skb)
3214                 return -ENOBUFS;
3215
3216         fl4.daddr = dst;
3217         fl4.saddr = src;
3218         fl4.flowi4_tos = rtm->rtm_tos;
3219         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3220         fl4.flowi4_mark = mark;
3221         fl4.flowi4_uid = uid;
3222         if (sport)
3223                 fl4.fl4_sport = sport;
3224         if (dport)
3225                 fl4.fl4_dport = dport;
3226         fl4.flowi4_proto = ip_proto;
3227
3228         rcu_read_lock();
3229
3230         if (iif) {
3231                 struct net_device *dev;
3232
3233                 dev = dev_get_by_index_rcu(net, iif);
3234                 if (!dev) {
3235                         err = -ENODEV;
3236                         goto errout_rcu;
3237                 }
3238
3239                 fl4.flowi4_iif = iif; /* for rt_fill_info */
3240                 skb->dev        = dev;
3241                 skb->mark       = mark;
3242                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3243                                          dev, &res);
3244
3245                 rt = skb_rtable(skb);
3246                 if (err == 0 && rt->dst.error)
3247                         err = -rt->dst.error;
3248         } else {
3249                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3250                 skb->dev = net->loopback_dev;
3251                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3252                 err = 0;
3253                 if (IS_ERR(rt))
3254                         err = PTR_ERR(rt);
3255                 else
3256                         skb_dst_set(skb, &rt->dst);
3257         }
3258
3259         if (err)
3260                 goto errout_rcu;
3261
3262         if (rtm->rtm_flags & RTM_F_NOTIFY)
3263                 rt->rt_flags |= RTCF_NOTIFY;
3264
3265         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3266                 table_id = res.table ? res.table->tb_id : 0;
3267
3268         /* reset skb for netlink reply msg */
3269         skb_trim(skb, 0);
3270         skb_reset_network_header(skb);
3271         skb_reset_transport_header(skb);
3272         skb_reset_mac_header(skb);
3273
3274         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3275                 struct fib_rt_info fri;
3276
3277                 if (!res.fi) {
3278                         err = fib_props[res.type].error;
3279                         if (!err)
3280                                 err = -EHOSTUNREACH;
3281                         goto errout_rcu;
3282                 }
3283                 fri.fi = res.fi;
3284                 fri.tb_id = table_id;
3285                 fri.dst = res.prefix;
3286                 fri.dst_len = res.prefixlen;
3287                 fri.tos = fl4.flowi4_tos;
3288                 fri.type = rt->rt_type;
3289                 fri.offload = 0;
3290                 fri.trap = 0;
3291                 if (res.fa_head) {
3292                         struct fib_alias *fa;
3293
3294                         hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3295                                 u8 slen = 32 - fri.dst_len;
3296
3297                                 if (fa->fa_slen == slen &&
3298                                     fa->tb_id == fri.tb_id &&
3299                                     fa->fa_tos == fri.tos &&
3300                                     fa->fa_info == res.fi &&
3301                                     fa->fa_type == fri.type) {
3302                                         fri.offload = fa->offload;
3303                                         fri.trap = fa->trap;
3304                                         break;
3305                                 }
3306                         }
3307                 }
3308                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3309                                     nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3310         } else {
3311                 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3312                                    NETLINK_CB(in_skb).portid,
3313                                    nlh->nlmsg_seq, 0);
3314         }
3315         if (err < 0)
3316                 goto errout_rcu;
3317
3318         rcu_read_unlock();
3319
3320         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3321
3322 errout_free:
3323         return err;
3324 errout_rcu:
3325         rcu_read_unlock();
3326         kfree_skb(skb);
3327         goto errout_free;
3328 }
3329
3330 void ip_rt_multicast_event(struct in_device *in_dev)
3331 {
3332         rt_cache_flush(dev_net(in_dev->dev));
3333 }
3334
3335 #ifdef CONFIG_SYSCTL
3336 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3337 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
3338 static int ip_rt_gc_elasticity __read_mostly    = 8;
3339 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
3340
3341 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3342                 void *buffer, size_t *lenp, loff_t *ppos)
3343 {
3344         struct net *net = (struct net *)__ctl->extra1;
3345
3346         if (write) {
3347                 rt_cache_flush(net);
3348                 fnhe_genid_bump(net);
3349                 return 0;
3350         }
3351
3352         return -EINVAL;
3353 }
3354
3355 static struct ctl_table ipv4_route_table[] = {
3356         {
3357                 .procname       = "gc_thresh",
3358                 .data           = &ipv4_dst_ops.gc_thresh,
3359                 .maxlen         = sizeof(int),
3360                 .mode           = 0644,
3361                 .proc_handler   = proc_dointvec,
3362         },
3363         {
3364                 .procname       = "max_size",
3365                 .data           = &ip_rt_max_size,
3366                 .maxlen         = sizeof(int),
3367                 .mode           = 0644,
3368                 .proc_handler   = proc_dointvec,
3369         },
3370         {
3371                 /*  Deprecated. Use gc_min_interval_ms */
3372
3373                 .procname       = "gc_min_interval",
3374                 .data           = &ip_rt_gc_min_interval,
3375                 .maxlen         = sizeof(int),
3376                 .mode           = 0644,
3377                 .proc_handler   = proc_dointvec_jiffies,
3378         },
3379         {
3380                 .procname       = "gc_min_interval_ms",
3381                 .data           = &ip_rt_gc_min_interval,
3382                 .maxlen         = sizeof(int),
3383                 .mode           = 0644,
3384                 .proc_handler   = proc_dointvec_ms_jiffies,
3385         },
3386         {
3387                 .procname       = "gc_timeout",
3388                 .data           = &ip_rt_gc_timeout,
3389                 .maxlen         = sizeof(int),
3390                 .mode           = 0644,
3391                 .proc_handler   = proc_dointvec_jiffies,
3392         },
3393         {
3394                 .procname       = "gc_interval",
3395                 .data           = &ip_rt_gc_interval,
3396                 .maxlen         = sizeof(int),
3397                 .mode           = 0644,
3398                 .proc_handler   = proc_dointvec_jiffies,
3399         },
3400         {
3401                 .procname       = "redirect_load",
3402                 .data           = &ip_rt_redirect_load,
3403                 .maxlen         = sizeof(int),
3404                 .mode           = 0644,
3405                 .proc_handler   = proc_dointvec,
3406         },
3407         {
3408                 .procname       = "redirect_number",
3409                 .data           = &ip_rt_redirect_number,
3410                 .maxlen         = sizeof(int),
3411                 .mode           = 0644,
3412                 .proc_handler   = proc_dointvec,
3413         },
3414         {
3415                 .procname       = "redirect_silence",
3416                 .data           = &ip_rt_redirect_silence,
3417                 .maxlen         = sizeof(int),
3418                 .mode           = 0644,
3419                 .proc_handler   = proc_dointvec,
3420         },
3421         {
3422                 .procname       = "error_cost",
3423                 .data           = &ip_rt_error_cost,
3424                 .maxlen         = sizeof(int),
3425                 .mode           = 0644,
3426                 .proc_handler   = proc_dointvec,
3427         },
3428         {
3429                 .procname       = "error_burst",
3430                 .data           = &ip_rt_error_burst,
3431                 .maxlen         = sizeof(int),
3432                 .mode           = 0644,
3433                 .proc_handler   = proc_dointvec,
3434         },
3435         {
3436                 .procname       = "gc_elasticity",
3437                 .data           = &ip_rt_gc_elasticity,
3438                 .maxlen         = sizeof(int),
3439                 .mode           = 0644,
3440                 .proc_handler   = proc_dointvec,
3441         },
3442         {
3443                 .procname       = "mtu_expires",
3444                 .data           = &ip_rt_mtu_expires,
3445                 .maxlen         = sizeof(int),
3446                 .mode           = 0644,
3447                 .proc_handler   = proc_dointvec_jiffies,
3448         },
3449         {
3450                 .procname       = "min_pmtu",
3451                 .data           = &ip_rt_min_pmtu,
3452                 .maxlen         = sizeof(int),
3453                 .mode           = 0644,
3454                 .proc_handler   = proc_dointvec_minmax,
3455                 .extra1         = &ip_min_valid_pmtu,
3456         },
3457         {
3458                 .procname       = "min_adv_mss",
3459                 .data           = &ip_rt_min_advmss,
3460                 .maxlen         = sizeof(int),
3461                 .mode           = 0644,
3462                 .proc_handler   = proc_dointvec,
3463         },
3464         { }
3465 };
3466
3467 static const char ipv4_route_flush_procname[] = "flush";
3468
3469 static struct ctl_table ipv4_route_flush_table[] = {
3470         {
3471                 .procname       = ipv4_route_flush_procname,
3472                 .maxlen         = sizeof(int),
3473                 .mode           = 0200,
3474                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3475         },
3476         { },
3477 };
3478
3479 static __net_init int sysctl_route_net_init(struct net *net)
3480 {
3481         struct ctl_table *tbl;
3482
3483         tbl = ipv4_route_flush_table;
3484         if (!net_eq(net, &init_net)) {
3485                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3486                 if (!tbl)
3487                         goto err_dup;
3488
3489                 /* Don't export non-whitelisted sysctls to unprivileged users */
3490                 if (net->user_ns != &init_user_ns) {
3491                         if (tbl[0].procname != ipv4_route_flush_procname)
3492                                 tbl[0].procname = NULL;
3493                 }
3494         }
3495         tbl[0].extra1 = net;
3496
3497         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3498         if (!net->ipv4.route_hdr)
3499                 goto err_reg;
3500         return 0;
3501
3502 err_reg:
3503         if (tbl != ipv4_route_flush_table)
3504                 kfree(tbl);
3505 err_dup:
3506         return -ENOMEM;
3507 }
3508
3509 static __net_exit void sysctl_route_net_exit(struct net *net)
3510 {
3511         struct ctl_table *tbl;
3512
3513         tbl = net->ipv4.route_hdr->ctl_table_arg;
3514         unregister_net_sysctl_table(net->ipv4.route_hdr);
3515         BUG_ON(tbl == ipv4_route_flush_table);
3516         kfree(tbl);
3517 }
3518
3519 static __net_initdata struct pernet_operations sysctl_route_ops = {
3520         .init = sysctl_route_net_init,
3521         .exit = sysctl_route_net_exit,
3522 };
3523 #endif
3524
3525 static __net_init int rt_genid_init(struct net *net)
3526 {
3527         atomic_set(&net->ipv4.rt_genid, 0);
3528         atomic_set(&net->fnhe_genid, 0);
3529         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3530         return 0;
3531 }
3532
3533 static __net_initdata struct pernet_operations rt_genid_ops = {
3534         .init = rt_genid_init,
3535 };
3536
3537 static int __net_init ipv4_inetpeer_init(struct net *net)
3538 {
3539         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3540
3541         if (!bp)
3542                 return -ENOMEM;
3543         inet_peer_base_init(bp);
3544         net->ipv4.peers = bp;
3545         return 0;
3546 }
3547
3548 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3549 {
3550         struct inet_peer_base *bp = net->ipv4.peers;
3551
3552         net->ipv4.peers = NULL;
3553         inetpeer_invalidate_tree(bp);
3554         kfree(bp);
3555 }
3556
3557 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3558         .init   =       ipv4_inetpeer_init,
3559         .exit   =       ipv4_inetpeer_exit,
3560 };
3561
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU array of struct ip_rt_acct; ip_rt_init() allocates room for
 * 256 entries per CPU.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3565
3566 int __init ip_rt_init(void)
3567 {
3568         int cpu;
3569
3570         ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3571                                   GFP_KERNEL);
3572         if (!ip_idents)
3573                 panic("IP: failed to allocate ip_idents\n");
3574
3575         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3576
3577         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3578         if (!ip_tstamps)
3579                 panic("IP: failed to allocate ip_tstamps\n");
3580
3581         for_each_possible_cpu(cpu) {
3582                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3583
3584                 INIT_LIST_HEAD(&ul->head);
3585                 spin_lock_init(&ul->lock);
3586         }
3587 #ifdef CONFIG_IP_ROUTE_CLASSID
3588         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3589         if (!ip_rt_acct)
3590                 panic("IP: failed to allocate ip_rt_acct\n");
3591 #endif
3592
3593         ipv4_dst_ops.kmem_cachep =
3594                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3595                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3596
3597         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3598
3599         if (dst_entries_init(&ipv4_dst_ops) < 0)
3600                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3601
3602         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3603                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3604
3605         ipv4_dst_ops.gc_thresh = ~0;
3606         ip_rt_max_size = INT_MAX;
3607
3608         devinet_init();
3609         ip_fib_init();
3610
3611         if (ip_rt_proc_init())
3612                 pr_err("Unable to create route proc files\n");
3613 #ifdef CONFIG_XFRM
3614         xfrm_init();
3615         xfrm4_init();
3616 #endif
3617         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3618                       RTNL_FLAG_DOIT_UNLOCKED);
3619
3620 #ifdef CONFIG_SYSCTL
3621         register_pernet_subsys(&sysctl_route_ops);
3622 #endif
3623         register_pernet_subsys(&rt_genid_ops);
3624         register_pernet_subsys(&ipv4_inetpeer_ops);
3625         return 0;
3626 }
3627
#ifdef CONFIG_SYSCTL
/*
 * Register ipv4_route_table under "net/ipv4/route" for init_net.
 *
 * This is a separate entry point only because of the current ipv4
 * initialisation order; once that order is sanitized, this function
 * can go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif /* CONFIG_SYSCTL */