/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *	Alan Cox	:	Verify area fixes.
 *	Alan Cox	:	cli() protects routing changes
 *	Rui Oliveira	:	ICMP routing table updates
 *	(rco@di.uminho.pt)	Routing table insertion and update
 *	Linus Torvalds	:	Rewrote bits to be sensible
 *	Alan Cox	:	Added BSD route gw semantics
 *	Alan Cox	:	Super /proc >4K
 *	Alan Cox	:	MTU in route table
 *	Alan Cox	:	MSS actually. Also added the window clamper.
 *	Sam Lantinga	:	Fixed route matching in rt_del()
 *	Alan Cox	:	Routing cache support.
 *	Alan Cox	:	Removed compatibility cruft.
 *	Alan Cox	:	RTF_REJECT support.
 *	Alan Cox	:	TCP irtt support.
 *	Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *	Alan Cox	:	Use __u32 properly
 *	Alan Cox	:	Aligned routing errors more closely with BSD;
 *				our system is still very different.
 *	Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *				routing caches and better behaviour.
 *	Olaf Erb	:	irtt wasn't being copied right.
 *	Bjorn Ekwall	:	Kerneld route support.
 *	Alan Cox	:	Multicast fixed (I hope)
 *	Pavel Krauz	:	Limited broadcast fixed
 *	Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *				route.c and rewritten from scratch.
 *	Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *	Marc Boucher	:	routing by fwmark
 *	Robert Olsson	:	Added rt_cache statistics
 *	Arnaldo C. Melo	:	Convert proc stuff to seq_file
 *	Eric Dumazet	:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov	:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov	:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

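/* RT_FL_TOS keeps the legacy RTO_ONLINK flag alongside the routable TOS
 * bits; ip_route_output_key_hash() below masks the flag back out of
 * flowi4_tos and uses it only to choose RT_SCOPE_LINK instead of
 * RT_SCOPE_UNIVERSE.  (Added annotation, not in the original file.)
 */
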
#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		 ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void		 ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
	.confirm_neigh =	ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

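/* Worked example (added annotation, not in the original file): callers
 * index this table via rt_tos2priority() in include/net/route.h, i.e.
 * ip_tos2prio[IPTOS_TOS(tos) >> 1], so IPTOS_LOWDELAY (0x10) gives
 * index 8 and maps to TC_PRIO_INTERACTIVE, while a TOS of 0 maps to
 * TC_PRIO_BESTEFFORT.
 */
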
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,
		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;

	rt = (const struct rtable *)dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *)&rt->rt_gateway;
	else if (!daddr ||
		 (rt->rt_flags &
		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
		return;

	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = READ_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 new, delta = 0;

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
	do {
		old = (u32)atomic_read(p_id);
		new = old + delta + segs;
	} while (atomic_cmpxchg(p_id, old, new) != old);

	return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

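/* Usage sketch (illustrative only, not part of the original file): a
 * caller reserving identifiers for a three-segment burst gets back the
 * first of three consecutive values, which is exactly how
 * __ip_select_ident() below uses the helper for a packet's segments.
 * The hash argument is a stand-in value here.
 */
static inline void example_reserve_three_ids(u32 hash, u16 ids[3])
{
	u32 first = ip_idents_reserve(hash, 3);	/* bumps the bucket by 3 */
	int i;

	for (i = 0; i < 3; i++)
		ids[i] = (u16)(first + i);	/* consecutive 16-bit IDs */
}
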
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	static u32 ip_idents_hashrnd __read_mostly;
	u32 hash, id;

	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

	hash = jhash_3words((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol ^ net_hash_mix(net),
			    ip_idents_hashrnd);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_gateway = fnhe->fnhe_gw;
		rt->rt_uses_gateway = 1;
	}
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, bool lock, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nh->nh_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nh->nh_exceptions);
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nh->nh_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		}
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = expires;

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nh->nh_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}

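/* Added annotation (not in the original file): the two producers of
 * exceptions are both visible below.  __ip_do_redirect() records a
 * learned gateway via
 *	update_or_create_fnhe(nh, fl4->daddr, new_gw, 0, false,
 *			      jiffies + ip_rt_gc_timeout);
 * and __ip_rt_update_pmtu() records a learned path MTU via
 *	update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
 *			      jiffies + ip_rt_mtu_expires);
 * find_exception() later matches entries by daddr at lookup time.
 */
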
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, false,
						      jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

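/* Back-off sketch (illustrative only, not part of the original file):
 * redirect number k (0-based) becomes eligible no earlier than
 * rate_last + (ip_rt_redirect_load << k), and after
 * ip_rt_redirect_number sends we stay silent until the peer has been
 * quiet for ip_rt_redirect_silence.  ip_rt_send_redirect() below applies
 * this check to the inet_peer rate fields.
 */
static inline bool example_redirect_may_send(unsigned long now,
					     unsigned long rate_last,
					     unsigned int rate_tokens)
{
	if (rate_tokens >= ip_rt_redirect_number)
		return false;	/* the host seems to ignore our redirects */
	return rate_tokens == 0 ||
	       time_after(now, rate_last +
			       (ip_rt_redirect_load << rate_tokens));
}
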
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything,
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

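/* Added annotation (not in the original file): the peer handling above
 * is a token bucket.  Tokens accrue at one per jiffy up to a ceiling of
 * ip_rt_error_burst (5 * HZ) and each ICMP error sent costs
 * ip_rt_error_cost (HZ), so the defaults allow a burst of five errors
 * and a sustained rate of roughly one error per second per source.
 */
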
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;
	bool lock = false;

	if (ip_mtu_locked(dst))
		return;

	if (ipv4_mtu(dst) < mtu)
		return;

	if (mtu < ip_rt_min_pmtu) {
		lock = true;
		mtu = ip_rt_min_pmtu;
	}

	if (rt->rt_pmtu == mtu &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	if (!mark)
		mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct net *net = sock_net(sk);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD by dst_free().
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so that it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be not aligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
				    ip_rt_min_advmss);

	return min(advmss, IPV4_MAX_PMTU - header_size);
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		goto out;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(ip_mtu_locked(dst))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

out:
	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

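/* Worked example (added annotation, not in the original file): with a
 * learned rt_pmtu of 1400 on a 1500-MTU device, ipv4_mtu() reports 1400
 * until dst.expires passes and then falls back to the RTAX_MTU metric
 * or the device MTU; a locked route via a gateway is conservatively
 * clamped down to 576.
 */
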
static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			return fnhe;
	}
	return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		if (do_cache) {
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			dst_dev_put(&orig->dst);
			dst_release(&orig->dst);
		}
	} else {
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt_del_uncached_list(struct rtable *rt)
{
	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
	struct rtable *rt = (struct rtable *)dst;

	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
		kfree(p);

	rt_del_uncached_list(rt);
}

void rt_flush_dev(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}

static bool rt_cache_valid(const struct rtable *rt)
{
	return	rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
		if (fi->fib_metrics != &dst_default_metrics) {
			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
			refcount_inc(&fi->fib_metrics->refcnt);
		}
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}

struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm, bool will_cache)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (will_cache ? 0 : DST_HOST) |
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_mtu_locked = 0;
		rt->rt_gateway = 0;
		rt->rt_uses_gateway = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);

/* called in rcu_read_lock() section */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
{
	int err;

	/* Primary sanity checks. */
	if (!in_dev)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		return -EINVAL;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return -EINVAL;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			return -EINVAL;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, itag);
		if (err < 0)
			return err;
	}
	return 0;
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	struct rtable *rth;
	u32 itag = 0;
	int err;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
	if (err)
		return err;

	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;
}

static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}

static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nh->nh_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		IPCB(skb)->flags |= IPSKB_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
	if (do_cache) {
		if (fnhe) {
			rth = rcu_dereference(fnhe->fnhe_rth_input);
			if (rth && rth->dst.expires &&
			    time_after(jiffies, rth->dst.expires)) {
				ip_del_fnhe(&FIB_RES_NH(*res), daddr);
				fnhe = NULL;
			} else {
				goto rt_cache;
			}
		}

		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);

rt_cache:
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	lwtunnel_set_redirect(&rth->dst);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
cleanup:
	return err;
}

1768 /* To make ICMP packets follow the right flow, the multipath hash is
1769 * calculated from the inner IP addresses.
1771 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1772 struct flow_keys *hash_keys)
1774 const struct iphdr *outer_iph = ip_hdr(skb);
1775 const struct iphdr *key_iph = outer_iph;
1776 const struct iphdr *inner_iph;
1777 const struct icmphdr *icmph;
1778 struct iphdr _inner_iph;
1779 struct icmphdr _icmph;
1781 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1784 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1787 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1792 if (icmph->type != ICMP_DEST_UNREACH &&
1793 icmph->type != ICMP_REDIRECT &&
1794 icmph->type != ICMP_TIME_EXCEEDED &&
1795 icmph->type != ICMP_PARAMETERPROB)
1798 inner_iph = skb_header_pointer(skb,
1799 outer_iph->ihl * 4 + sizeof(_icmph),
1800 sizeof(_inner_iph), &_inner_iph);
1804 key_iph = inner_iph;
1806 hash_keys->addrs.v4addrs.src = key_iph->saddr;
1807 hash_keys->addrs.v4addrs.dst = key_iph->daddr;
/* if skb is set it will be used and fl4 can be NULL */
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (skb) {
			ip_multipath_l3_keys(skb, &hash_keys);
		} else {
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	case 1:
		/* skb is currently provided only when forwarding */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}

			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
			hash_keys.ports.src = fl4->fl4_sport;
			hash_keys.ports.dst = fl4->fl4_dport;
			hash_keys.basic.ip_proto = fl4->flowi4_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */

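/* Worked example for ip_multipath_l3_keys() above (added annotation,
 * not in the original file): when a router emits a TTL-exceeded error
 * for a forwarded flow A -> B, the outer header carries router -> A but
 * the quoted inner header still carries A -> B, so hashing the inner
 * addresses steers the ICMP error onto the same nexthop as the flow it
 * refers to.
 */
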
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos,
			    struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1) {
		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);

		fib_select_multipath(res, h);
	}
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

/*
 * NOTE. We drop all packets that have a local source
 * address, because every properly looped back packet
 * must have the correct destination already attached by the output routine.
 *
 * Such an approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 * called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev,
			       struct fib_result *res)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flow_keys *flkeys = NULL, _flkeys;
	struct net *net = dev_net(dev);
	struct ip_tunnel_info *tun_info;
	int err = -EINVAL;
	unsigned int flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	struct flowi4 fl4;
	bool do_cache;

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	 * by fib_lookup.
	 */

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
	else
		fl4.flowi4_tun_key.tun_id = 0;
	skb_dst_drop(skb);

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res->fi = NULL;
	res->table = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I am not even sure whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and calls it at most once if daddr and/or saddr are loopback addresses
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_uid = sock_net_uid(net, NULL);

	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys))
		flkeys = &_flkeys;

	err = fib_lookup(net, &fl4, res, 0);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res->type == RTN_BROADCAST)
		goto brd_input;

	if (res->type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  0, dev, in_dev, &itag);
		if (err < 0)
			goto martian_source;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res->type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source;
	}
	flags |= RTCF_BROADCAST;
	res->type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	do_cache = false;
	if (res->fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
			   flags | RTCF_LOCAL, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_is_input = 1;

	RT_CACHE_STAT_INC(in_slow_tot);
	if (res->type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}

	if (do_cache) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
			WARN_ON(rth->dst.input == lwtunnel_input);
			rth->dst.lwtstate->orig_input = rth->dst.input;
			rth->dst.input = lwtunnel_input;
		}

		if (unlikely(!rt_cache_route(nh, rth)))
			rt_add_uncached_list(rth);
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res->type = RTN_UNREACHABLE;
	res->fi = NULL;
	res->table = NULL;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	struct fib_result res;
	int err;

	tos &= IPTOS_RT_MASK;
	rcu_read_lock();
	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
	rcu_read_unlock();

	return err;
}
EXPORT_SYMBOL(ip_route_input_noref);

/* called with rcu_read_lock held */
int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		       u8 tos, struct net_device *dev, struct fib_result *res)
{
	/* Multicast recognition logic is moved from route cache to here.
	 * The problem was that too many Ethernet cards have broken/missing
	 * hardware multicast filters :-( As a result, a host on a multicast
	 * network acquires a lot of useless route cache entries, sort of
	 * SDR messages from all over the world. Now we try to get rid of them.
	 * Really, provided the software IP multicast filter is organized
	 * reasonably (at least, hashed), it does not result in a slowdown
	 * compared with route cache reject entries.
	 * Note that multicast routers are not affected, because a
	 * route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);
		int our = 0;
		int err = -EINVAL;

		if (in_dev)
			our = ip_check_mc_rcu(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);

		/* check l3 master if no match yet */
		if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
			struct in_device *l3_in_dev;

			l3_in_dev = __in_dev_get_rcu(skb->dev);
			if (l3_in_dev)
				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
						      ip_hdr(skb)->protocol);
		}

		if (our
#ifdef CONFIG_IP_MROUTE
			||
		    (!ipv4_is_local_multicast(daddr) &&
		     IN_DEV_MFORWARD(in_dev))
#endif
		   ) {
			err = ip_route_input_mc(skb, daddr, saddr,
						tos, dev, our);
		}
		return err;
	}

	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
}

/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) &&
		    !(dev_out->flags & IFF_LOOPBACK) &&
		    !netif_is_l3_master(dev_out))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If a multicast route does not exist, use
		 * the default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface, the end result being that if the
		 * intended recipient is waiting on that interface for the
		 * packet he won't receive it because it will be delivered on
		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
		do_cache = false;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (fi) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		fnhe = find_exception(nh, fl4->daddr);
		if (!do_cache)
			goto add;
		if (fnhe) {
			prth = &fnhe->fnhe_rth_output;
			rth = rcu_dereference(*prth);
			if (rth && rth->dst.expires &&
			    time_after(jiffies, rth->dst.expires)) {
				ip_del_fnhe(nh, fl4->daddr);
				fnhe = NULL;
			} else {
				goto rt_cache;
			}
		}

		if (unlikely(fl4->flowi4_flags &
			     FLOWI_FLAG_KNOWN_NH &&
			     !(nh->nh_gw &&
			       nh->nh_scope == RT_SCOPE_LINK))) {
			do_cache = false;
			goto add;
		}
		prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
		rth = rcu_dereference(*prth);

rt_cache:
		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
			return rth;
	}

add:
	rth = rt_dst_alloc(dev_out, flags, type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->rt_iif = orig_oif;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
	lwtunnel_set_redirect(&rth->dst);

	return rth;
}

/*
 * Major route resolver routine.
 */

struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
					const struct sk_buff *skb)
{
	__u8 tos = RT_FL_TOS(fl4);
	struct fib_result res;
	struct rtable *rth;

	res.tclassid	= 0;
	res.fi		= NULL;
	res.table	= NULL;

	fl4->flowi4_iif = LOOPBACK_IFINDEX;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			     RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
	rcu_read_unlock();

	return rth;
}
EXPORT_SYMBOL_GPL(ip_route_output_key_hash);

struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
					    struct fib_result *res,
					    const struct sk_buff *skb)
{
	struct net_device *dev_out = NULL;
	int orig_oif = fl4->flowi4_oif;
	unsigned int flags = 0;
	struct rtable *rth;
	int err = -ENETUNREACH;

	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed check for oif == dev_out->oif here.
		 * It was wrong for two reasons:
		 * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		 *    is assigned to multiple interfaces.
		 * 2. Moreover, we are allowed to send packets with saddr
		 *    of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (!dev_out)
				goto out;

			/* Special hack: user can direct multicasts
			 * and limited broadcast via necessary interface
			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			 * This hack is not just for fun, it allows
			 * vic,vat and friends to work.
			 * They bind socket to loopback, set ttl to zero
			 * and expect that it will work.
			 * From the viewpoint of routing cache they are broken,
			 * because we are not allowed to build multicast path
			 * with loopback source addr (look, routing cache
			 * cannot know, that ttl is zero, so that packet
			 * will not leave this host and route is valid).
			 * Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (!dev_out)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr) ||
		    fl4->flowi4_proto == IPPROTO_IGMP) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res->type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	err = fib_lookup(net, fl4, res, 0);
	if (err) {
		res->fi = NULL;
		res->table = NULL;
		if (fl4->flowi4_oif &&
		    (ipv4_is_multicast(fl4->daddr) ||
		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
			/* Apparently, routing tables are wrong. Assume,
			 * that the destination is on link.
			 *
			 * Because we are allowed to send to iface
			 * even if it has NO routes and NO assigned
			 * addresses. When oif is specified, routing
			 * tables are looked up with only one purpose:
			 * to catch if destination is gatewayed, rather than
			 * direct. Moreover, if MSG_DONTROUTE is set,
			 * we send packet, ignoring both routing tables
			 * and ifaddr state. --ANK
			 *
			 * We could make it even if oif is unknown,
			 * likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res->type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(err);
		goto out;
	}

	if (res->type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res->fi->fib_prefsrc)
				fl4->saddr = res->fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}

		/* L3 master device is the loopback for that domain */
		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
			net->loopback_dev;

		/* make sure orig_oif points to fib result device even
		 * though packet rx/tx happens over loopback or l3mdev
		 */
		orig_oif = FIB_RES_OIF(*res);

		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	fib_select_path(net, res, fl4, skb);

	dev_out = FIB_RES_DEV(*res);
	fl4->flowi4_oif = dev_out->ifindex;

make_route:
	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);

out:
	return rth;
}

2491 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2496 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2498 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2500 return mtu ? : dst->dev->mtu;
2503 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2504 struct sk_buff *skb, u32 mtu)
2508 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2509 struct sk_buff *skb)
2510 {
2511 }
2513 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2514 unsigned long old)
2515 {
2516 return NULL;
2517 }
2519 static struct dst_ops ipv4_dst_blackhole_ops = {
2520 .family = AF_INET,
2521 .check = ipv4_blackhole_dst_check,
2522 .mtu = ipv4_blackhole_mtu,
2523 .default_advmss = ipv4_default_advmss,
2524 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2525 .redirect = ipv4_rt_blackhole_redirect,
2526 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2527 .neigh_lookup = ipv4_neigh_lookup,
2528 };
2530 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2531 {
2532 struct rtable *ort = (struct rtable *) dst_orig;
2533 struct rtable *rt;
2535 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2536 if (rt) {
2537 struct dst_entry *new = &rt->dst;
2539 new->__use = 1;
2540 new->input = dst_discard;
2541 new->output = dst_discard_out;
2543 new->dev = net->loopback_dev;
2544 if (new->dev)
2545 dev_hold(new->dev);
2547 rt->rt_is_input = ort->rt_is_input;
2548 rt->rt_iif = ort->rt_iif;
2549 rt->rt_pmtu = ort->rt_pmtu;
2550 rt->rt_mtu_locked = ort->rt_mtu_locked;
2552 rt->rt_genid = rt_genid_ipv4(net);
2553 rt->rt_flags = ort->rt_flags;
2554 rt->rt_type = ort->rt_type;
2555 rt->rt_gateway = ort->rt_gateway;
2556 rt->rt_uses_gateway = ort->rt_uses_gateway;
2558 INIT_LIST_HEAD(&rt->rt_uncached);
2559 }
2561 dst_release(dst_orig);
2563 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2564 }
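/*
 * Editorial aside: the main consumer of ipv4_blackhole_route() is the
 * xfrm layer, which swaps a real route for this discard-everything
 * clone while e.g. IPsec state is still being negotiated. A minimal
 * sketch (the wrapper below is invented for illustration):
 *
 *	static struct dst_entry *hold_traffic(struct net *net,
 *					      struct dst_entry *dst)
 *	{
 *		// Consumes the reference on @dst; the clone that comes
 *		// back drops packets via dst_discard/dst_discard_out.
 *		return ipv4_blackhole_route(net, dst);
 *	}
 */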
2566 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2567 const struct sock *sk)
2568 {
2569 struct rtable *rt = __ip_route_output_key(net, flp4);
2571 if (IS_ERR(rt))
2572 return rt;
2574 if (flp4->flowi4_proto)
2575 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2576 flowi4_to_flowi(flp4),
2577 sk, 0);
2579 return rt;
2580 }
2581 EXPORT_SYMBOL_GPL(ip_route_output_flow);
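/*
 * Editorial aside, a sketch of a typical in-kernel caller (function
 * name and destination are invented for illustration): fill a flowi4
 * key, look up the route, drop the reference when done.
 *
 *	static int example_route_lookup(struct net *net)
 *	{
 *		struct flowi4 fl4 = {
 *			.daddr = htonl(0xc0000201),	// 192.0.2.1, TEST-NET-1
 *			.flowi4_proto = IPPROTO_UDP,	// non-zero: xfrm lookup runs
 *		};
 *		struct rtable *rt;
 *
 *		rt = ip_route_output_flow(net, &fl4, NULL);
 *		if (IS_ERR(rt))
 *			return PTR_ERR(rt);
 *		// ... use rt->dst.dev, rt->rt_gateway, ...
 *		ip_rt_put(rt);
 *		return 0;
 *	}
 */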
2583 /* called with rcu_read_lock held */
2584 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2585 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2586 u32 seq)
2587 {
2588 struct rtable *rt = skb_rtable(skb);
2589 struct rtmsg *r;
2590 struct nlmsghdr *nlh;
2591 unsigned long expires = 0;
2592 u32 error;
2593 u32 metrics[RTAX_MAX];
2595 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2596 if (!nlh)
2597 return -EMSGSIZE;
2599 r = nlmsg_data(nlh);
2600 r->rtm_family = AF_INET;
2601 r->rtm_dst_len = 32;
2602 r->rtm_src_len = 0;
2603 r->rtm_tos = fl4->flowi4_tos;
2604 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2605 if (nla_put_u32(skb, RTA_TABLE, table_id))
2606 goto nla_put_failure;
2607 r->rtm_type = rt->rt_type;
2608 r->rtm_scope = RT_SCOPE_UNIVERSE;
2609 r->rtm_protocol = RTPROT_UNSPEC;
2610 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2611 if (rt->rt_flags & RTCF_NOTIFY)
2612 r->rtm_flags |= RTM_F_NOTIFY;
2613 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2614 r->rtm_flags |= RTCF_DOREDIRECT;
2616 if (nla_put_in_addr(skb, RTA_DST, dst))
2617 goto nla_put_failure;
2618 if (src) {
2619 r->rtm_src_len = 32;
2620 if (nla_put_in_addr(skb, RTA_SRC, src))
2621 goto nla_put_failure;
2622 }
2623 if (rt->dst.dev &&
2624 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2625 goto nla_put_failure;
2626 #ifdef CONFIG_IP_ROUTE_CLASSID
2627 if (rt->dst.tclassid &&
2628 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2629 goto nla_put_failure;
2630 #endif
2631 if (!rt_is_input_route(rt) &&
2632 fl4->saddr != src) {
2633 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2634 goto nla_put_failure;
2635 }
2636 if (rt->rt_uses_gateway &&
2637 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2638 goto nla_put_failure;
2640 expires = rt->dst.expires;
2641 if (expires) {
2642 unsigned long now = jiffies;
2644 if (time_before(now, expires))
2645 expires -= now;
2646 else
2647 expires = 0;
2648 }
2650 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2651 if (rt->rt_pmtu && expires)
2652 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2653 if (rt->rt_mtu_locked && expires)
2654 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2655 if (rtnetlink_put_metrics(skb, metrics) < 0)
2656 goto nla_put_failure;
2658 if (fl4->flowi4_mark &&
2659 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2660 goto nla_put_failure;
2662 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2663 nla_put_u32(skb, RTA_UID,
2664 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2665 goto nla_put_failure;
2667 error = rt->dst.error;
2669 if (rt_is_input_route(rt)) {
2670 #ifdef CONFIG_IP_MROUTE
2671 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2672 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2673 int err = ipmr_get_route(net, skb,
2674 fl4->saddr, fl4->daddr,
2675 r, portid);
2677 if (err <= 0) {
2678 if (err == 0)
2679 return 0;
2680 goto nla_put_failure;
2681 }
2682 } else
2683 #endif
2684 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2685 goto nla_put_failure;
2686 }
2688 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2689 goto nla_put_failure;
2691 nlmsg_end(skb, nlh);
2692 return 0;
2694 nla_put_failure:
2695 nlmsg_cancel(skb, nlh);
2696 return -EMSGSIZE;
2697 }
2699 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2700 struct netlink_ext_ack *extack)
2701 {
2702 struct net *net = sock_net(in_skb->sk);
2703 struct rtmsg *rtm;
2704 struct nlattr *tb[RTA_MAX+1];
2705 struct fib_result res = {};
2706 struct rtable *rt = NULL;
2707 struct flowi4 fl4;
2708 __be32 dst = 0;
2709 __be32 src = 0;
2710 u32 iif;
2711 int err;
2712 int mark;
2713 struct sk_buff *skb;
2714 u32 table_id = RT_TABLE_MAIN;
2715 kuid_t uid;
2717 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2718 extack);
2719 if (err < 0)
2720 goto errout;
2722 rtm = nlmsg_data(nlh);
2724 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2725 if (!skb) {
2726 err = -ENOBUFS;
2727 goto errout;
2728 }
2730 /* Reserve room for dummy headers, this skb can pass
2731 through a good chunk of the routing engine.
2732 */
2733 skb_reset_mac_header(skb);
2734 skb_reset_network_header(skb);
2736 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2737 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2738 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2739 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2740 if (tb[RTA_UID])
2741 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2742 else
2743 uid = (iif ? INVALID_UID : current_uid());
2745 /* Bugfix: need to give ip_route_input enough of an IP header to
2746 * not gag.
2747 */
2748 ip_hdr(skb)->protocol = IPPROTO_UDP;
2749 ip_hdr(skb)->saddr = src;
2750 ip_hdr(skb)->daddr = dst;
2752 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2754 memset(&fl4, 0, sizeof(fl4));
2755 fl4.daddr = dst;
2756 fl4.saddr = src;
2757 fl4.flowi4_tos = rtm->rtm_tos;
2758 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2759 fl4.flowi4_mark = mark;
2760 fl4.flowi4_uid = uid;
2762 rcu_read_lock();
2764 if (iif) {
2765 struct net_device *dev;
2767 dev = dev_get_by_index_rcu(net, iif);
2768 if (!dev) {
2769 err = -ENODEV;
2770 goto errout_free;
2771 }
2773 skb->protocol = htons(ETH_P_IP);
2774 skb->dev = dev;
2775 skb->mark = mark;
2776 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2777 dev, &res);
2779 rt = skb_rtable(skb);
2780 if (err == 0 && rt->dst.error)
2781 err = -rt->dst.error;
2782 } else {
2783 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2784 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2785 err = 0;
2786 if (IS_ERR(rt))
2787 err = PTR_ERR(rt);
2788 else
2789 skb_dst_set(skb, &rt->dst);
2790 }
2792 if (err)
2793 goto errout_free;
2795 if (rtm->rtm_flags & RTM_F_NOTIFY)
2796 rt->rt_flags |= RTCF_NOTIFY;
2798 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2799 table_id = res.table ? res.table->tb_id : 0;
2801 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2802 if (!res.fi) {
2803 err = fib_props[res.type].error;
2804 if (!err)
2805 err = -EHOSTUNREACH;
2806 goto errout_free;
2807 }
2808 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2809 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2810 rt->rt_type, res.prefix, res.prefixlen,
2811 fl4.flowi4_tos, res.fi, 0);
2812 } else {
2813 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2814 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2815 }
2816 if (err < 0)
2817 goto errout_free;
2819 rcu_read_unlock();
2821 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2822 errout:
2823 return err;
2825 errout_free:
2826 rcu_read_unlock();
2827 kfree_skb(skb);
2828 goto errout;
2829 }
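/*
 * Editorial aside, a hedged userspace sketch of the request this
 * handler answers, roughly what "ip route get 192.0.2.1" sends
 * (assumes the usual <linux/rtnetlink.h> and <arpa/inet.h> includes;
 * the aggregate relies on the standard 4-byte netlink alignment):
 *
 *	struct {
 *		struct nlmsghdr nlh;
 *		struct rtmsg rtm;
 *		struct rtattr rta;
 *		struct in_addr dst;
 *	} req = {
 *		.nlh.nlmsg_len = sizeof(req),
 *		.nlh.nlmsg_type = RTM_GETROUTE,
 *		.nlh.nlmsg_flags = NLM_F_REQUEST,
 *		.rtm.rtm_family = AF_INET,
 *		.rtm.rtm_dst_len = 32,
 *		.rta.rta_type = RTA_DST,
 *		.rta.rta_len = RTA_LENGTH(sizeof(struct in_addr)),
 *	};
 *	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *
 *	inet_pton(AF_INET, "192.0.2.1", &req.dst);
 *	send(fd, &req, sizeof(req), 0);
 *	// recv() then yields the RTM_NEWROUTE that rt_fill_info() (or
 *	// fib_dump_info() under RTM_F_FIB_MATCH) constructed above.
 */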
2831 void ip_rt_multicast_event(struct in_device *in_dev)
2832 {
2833 rt_cache_flush(dev_net(in_dev->dev));
2834 }
2836 #ifdef CONFIG_SYSCTL
2837 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2838 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2839 static int ip_rt_gc_elasticity __read_mostly = 8;
2840 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
2842 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2843 void __user *buffer,
2844 size_t *lenp, loff_t *ppos)
2845 {
2846 struct net *net = (struct net *)__ctl->extra1;
2848 if (write) {
2849 rt_cache_flush(net);
2850 fnhe_genid_bump(net);
2851 return 0;
2852 }
2854 return -EINVAL;
2855 }
2857 static struct ctl_table ipv4_route_table[] = {
2858 {
2859 .procname = "gc_thresh",
2860 .data = &ipv4_dst_ops.gc_thresh,
2861 .maxlen = sizeof(int),
2862 .mode = 0644,
2863 .proc_handler = proc_dointvec,
2864 },
2865 {
2866 .procname = "max_size",
2867 .data = &ip_rt_max_size,
2868 .maxlen = sizeof(int),
2869 .mode = 0644,
2870 .proc_handler = proc_dointvec,
2871 },
2872 {
2873 /* Deprecated. Use gc_min_interval_ms */
2875 .procname = "gc_min_interval",
2876 .data = &ip_rt_gc_min_interval,
2877 .maxlen = sizeof(int),
2878 .mode = 0644,
2879 .proc_handler = proc_dointvec_jiffies,
2880 },
2881 {
2882 .procname = "gc_min_interval_ms",
2883 .data = &ip_rt_gc_min_interval,
2884 .maxlen = sizeof(int),
2885 .mode = 0644,
2886 .proc_handler = proc_dointvec_ms_jiffies,
2887 },
2888 {
2889 .procname = "gc_timeout",
2890 .data = &ip_rt_gc_timeout,
2891 .maxlen = sizeof(int),
2892 .mode = 0644,
2893 .proc_handler = proc_dointvec_jiffies,
2894 },
2895 {
2896 .procname = "gc_interval",
2897 .data = &ip_rt_gc_interval,
2898 .maxlen = sizeof(int),
2899 .mode = 0644,
2900 .proc_handler = proc_dointvec_jiffies,
2901 },
2902 {
2903 .procname = "redirect_load",
2904 .data = &ip_rt_redirect_load,
2905 .maxlen = sizeof(int),
2906 .mode = 0644,
2907 .proc_handler = proc_dointvec,
2908 },
2909 {
2910 .procname = "redirect_number",
2911 .data = &ip_rt_redirect_number,
2912 .maxlen = sizeof(int),
2913 .mode = 0644,
2914 .proc_handler = proc_dointvec,
2915 },
2916 {
2917 .procname = "redirect_silence",
2918 .data = &ip_rt_redirect_silence,
2919 .maxlen = sizeof(int),
2920 .mode = 0644,
2921 .proc_handler = proc_dointvec,
2922 },
2923 {
2924 .procname = "error_cost",
2925 .data = &ip_rt_error_cost,
2926 .maxlen = sizeof(int),
2927 .mode = 0644,
2928 .proc_handler = proc_dointvec,
2929 },
2930 {
2931 .procname = "error_burst",
2932 .data = &ip_rt_error_burst,
2933 .maxlen = sizeof(int),
2934 .mode = 0644,
2935 .proc_handler = proc_dointvec,
2936 },
2937 {
2938 .procname = "gc_elasticity",
2939 .data = &ip_rt_gc_elasticity,
2940 .maxlen = sizeof(int),
2941 .mode = 0644,
2942 .proc_handler = proc_dointvec,
2943 },
2944 {
2945 .procname = "mtu_expires",
2946 .data = &ip_rt_mtu_expires,
2947 .maxlen = sizeof(int),
2948 .mode = 0644,
2949 .proc_handler = proc_dointvec_jiffies,
2950 },
2951 {
2952 .procname = "min_pmtu",
2953 .data = &ip_rt_min_pmtu,
2954 .maxlen = sizeof(int),
2955 .mode = 0644,
2956 .proc_handler = proc_dointvec_minmax,
2957 .extra1 = &ip_min_valid_pmtu,
2958 },
2959 {
2960 .procname = "min_adv_mss",
2961 .data = &ip_rt_min_advmss,
2962 .maxlen = sizeof(int),
2963 .mode = 0644,
2964 .proc_handler = proc_dointvec,
2965 },
2966 { },
2967 };
2969 static struct ctl_table ipv4_route_flush_table[] = {
2970 {
2971 .procname = "flush",
2972 .maxlen = sizeof(int),
2973 .mode = 0200,
2974 .proc_handler = ipv4_sysctl_rtcache_flush,
2975 },
2976 { },
2977 };
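/*
 * Editorial aside on how the two tables above look from userspace:
 * the ipv4_route_table entries are mode 0644 and readable under
 * /proc/sys/net/ipv4/route/, while "flush" is mode 0200 (write-only);
 * any write to it reaches ipv4_sysctl_rtcache_flush() and bumps the
 * rt/fnhe generation counters. A minimal sketch:
 *
 *	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);
 *
 *	if (fd >= 0) {
 *		write(fd, "1\n", 2);	// same effect as sysctl -w
 *		close(fd);
 *	}
 */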
2979 static __net_init int sysctl_route_net_init(struct net *net)
2980 {
2981 struct ctl_table *tbl;
2983 tbl = ipv4_route_flush_table;
2984 if (!net_eq(net, &init_net)) {
2985 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2986 if (!tbl)
2987 goto err_dup;
2989 /* Don't export sysctls to unprivileged users */
2990 if (net->user_ns != &init_user_ns)
2991 tbl[0].procname = NULL;
2992 }
2993 tbl[0].extra1 = net;
2995 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2996 if (!net->ipv4.route_hdr)
2997 goto err_reg;
2998 return 0;
3000 err_reg:
3001 if (tbl != ipv4_route_flush_table)
3002 kfree(tbl);
3003 err_dup:
3004 return -ENOMEM;
3005 }
3007 static __net_exit void sysctl_route_net_exit(struct net *net)
3008 {
3009 struct ctl_table *tbl;
3011 tbl = net->ipv4.route_hdr->ctl_table_arg;
3012 unregister_net_sysctl_table(net->ipv4.route_hdr);
3013 BUG_ON(tbl == ipv4_route_flush_table);
3014 kfree(tbl);
3015 }
3017 static __net_initdata struct pernet_operations sysctl_route_ops = {
3018 .init = sysctl_route_net_init,
3019 .exit = sysctl_route_net_exit,
3020 };
3022 #endif
3024 static __net_init int rt_genid_init(struct net *net)
3025 {
3026 atomic_set(&net->ipv4.rt_genid, 0);
3027 atomic_set(&net->fnhe_genid, 0);
3028 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3029 return 0;
3030 }
3032 static __net_initdata struct pernet_operations rt_genid_ops = {
3033 .init = rt_genid_init,
3034 };
3037 static int __net_init ipv4_inetpeer_init(struct net *net)
3038 {
3039 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3041 if (!bp)
3042 return -ENOMEM;
3043 inet_peer_base_init(bp);
3044 net->ipv4.peers = bp;
3045 return 0;
3046 }
3048 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3049 {
3050 struct inet_peer_base *bp = net->ipv4.peers;
3052 net->ipv4.peers = NULL;
3053 inetpeer_invalidate_tree(bp);
3054 kfree(bp);
3055 }
3057 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3058 .init = ipv4_inetpeer_init,
3059 .exit = ipv4_inetpeer_exit,
3060 };
3063 #ifdef CONFIG_IP_ROUTE_CLASSID
3064 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3065 #endif /* CONFIG_IP_ROUTE_CLASSID */
3067 int __init ip_rt_init(void)
3068 {
3069 int cpu;
3071 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3072 if (!ip_idents)
3073 panic("IP: failed to allocate ip_idents\n");
3075 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3077 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3078 if (!ip_tstamps)
3079 panic("IP: failed to allocate ip_tstamps\n");
3081 for_each_possible_cpu(cpu) {
3082 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3084 INIT_LIST_HEAD(&ul->head);
3085 spin_lock_init(&ul->lock);
3086 }
3087 #ifdef CONFIG_IP_ROUTE_CLASSID
3088 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3089 if (!ip_rt_acct)
3090 panic("IP: failed to allocate ip_rt_acct\n");
3091 #endif
3093 ipv4_dst_ops.kmem_cachep =
3094 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3095 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3097 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3099 if (dst_entries_init(&ipv4_dst_ops) < 0)
3100 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3102 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3103 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3105 ipv4_dst_ops.gc_thresh = ~0;
3106 ip_rt_max_size = INT_MAX;
3108 devinet_init();
3109 ip_fib_init();
3111 if (ip_rt_proc_init())
3112 pr_err("Unable to create route proc files\n");
3113 #ifdef CONFIG_XFRM
3114 xfrm_init();
3115 xfrm4_init();
3116 #endif
3117 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3118 RTNL_FLAG_DOIT_UNLOCKED);
3120 #ifdef CONFIG_SYSCTL
3121 register_pernet_subsys(&sysctl_route_ops);
3122 #endif
3123 register_pernet_subsys(&rt_genid_ops);
3124 register_pernet_subsys(&ipv4_inetpeer_ops);
3125 return 0;
3126 }
3128 #ifdef CONFIG_SYSCTL
3129 /*
3130 * We really need to sanitize the damn ipv4 init order, then all
3131 * this nonsense will go away.
3132 */
3133 void __init ip_static_sysctl_init(void)
3134 {
3135 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3136 }
3137 #endif