1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * IPv6 output functions
4 * Linux INET6 implementation
7 * Pedro Roque <roque@di.fc.ul.pt>
9 * Based on linux/net/ipv4/ip_output.c
12 * A.N.Kuznetsov : airthmetics in fragmentation.
13 * extension headers are implemented.
14 * route changes now work.
15 * ip6_forward does not confuse sniffers.
18 * H. von Brand : Added missing #include <linux/string.h>
19 * Imran Patel : frag id should be in NBO
20 * Kazunori MIYAZAWA @USAGI
21 * : add ip6_append_data and related functions
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
/*
 * Final IPv6 transmit step: resolve the neighbour entry for the route's
 * nexthop and hand the skb to the link layer via neigh_output().
 * Multicast destinations get looped back to local listeners through a
 * clone pushed into NF_INET_POST_ROUTING, and zero-hop-limit multicast
 * is discarded; node-local-scope multicast is checked against loopback.
 * NOTE(review): this listing is elided (embedded line numbers jump), so
 * some statements, return paths and closing braces are not visible here.
 */
59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 struct dst_entry *dst = skb_dst(skb);
62 struct net_device *dev = dst->dev;
63 const struct in6_addr *nexthop;
64 struct neighbour *neigh;
/* Multicast: possibly loop a copy back to local listeners. */
67 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
68 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
70 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
71 ((mroute6_is_socket(net, skb) &&
72 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
73 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
74 &ipv6_hdr(skb)->saddr))) {
75 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
77 /* Do not check for IFF_ALLMULTI; multicast routing
78 is not supported in any case.
81 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
82 net, sk, newskb, NULL, newskb->dev,
/* RFC 8200 forbids forwarding with hop_limit 0; count as discard. */
85 if (ipv6_hdr(skb)->hop_limit == 0) {
86 IP6_INC_STATS(net, idev,
87 IPSTATS_MIB_OUTDISCARDS);
93 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
95 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
96 IPV6_ADDR_SCOPE_NODELOCAL &&
97 !(dev->flags & IFF_LOOPBACK)) {
/* Lightweight-tunnel dst may take over the transmit entirely. */
103 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
104 int res = lwtunnel_xmit(skb);
106 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
/* Unicast path: look up (or create) the nexthop neighbour under RCU
 * (the rcu_read_lock_bh() pairing the unlocks below is in an elided
 * line — confirm against the full source). */
111 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
112 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
113 if (unlikely(!neigh))
114 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
115 if (!IS_ERR(neigh)) {
116 sock_confirm_neigh(skb, neigh);
117 ret = neigh_output(neigh, skb, false);
118 rcu_read_unlock_bh();
121 rcu_read_unlock_bh();
/* No usable neighbour: account the failure as OUTNOROUTES. */
123 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
/*
 * A GSO packet whose segments do not fit the path MTU: segment it in
 * software (all GSO features masked off), then push each resulting
 * segment through ip6_fragment() so it is either fragmented or rejected
 * with the appropriate ICMP error.
 * NOTE(review): listing elided — the error accumulation/return lines are
 * not visible here.
 */
129 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
130 struct sk_buff *skb, unsigned int mtu)
132 struct sk_buff *segs, *nskb;
133 netdev_features_t features;
136 /* Please see corresponding comment in ip_finish_output_gso
137 * describing the cases where GSO segment length exceeds the
140 features = netif_skb_features(skb);
141 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
142 if (IS_ERR_OR_NULL(segs)) {
/* Detach each segment from the list before fragmenting it. */
149 skb_list_walk_safe(segs, segs, nskb) {
152 skb_mark_not_on_list(segs);
153 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
/*
 * Post-netfilter output dispatch: if SNAT attached a new xfrm policy to
 * the dst, mark the skb rerouted and restart via dst_output(); otherwise
 * compare the packet against the dst MTU and fragment (or software-
 * segment GSO) when needed, finally handing off to ip6_finish_output2().
 */
161 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
165 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
166 /* Policy lookup after SNAT yielded a new policy */
167 if (skb_dst(skb)->xfrm) {
168 IPCB(skb)->flags |= IPSKB_REROUTED;
169 return dst_output(net, sk, skb);
173 mtu = ip6_skb_dst_mtu(skb);
/* GSO packet whose network-layer segments exceed the MTU: slow path. */
174 if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
175 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
/* Fragment for oversize non-GSO, allfrag dsts, or a conntrack-recorded
 * frag_max_size smaller than the packet. */
177 if ((skb->len > mtu && !skb_is_gso(skb)) ||
178 dst_allfrag(skb_dst(skb)) ||
179 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
180 return ip6_fragment(net, sk, skb, ip6_finish_output2);
182 return ip6_finish_output2(net, sk, skb);
/*
 * Run the cgroup BPF egress program first; on NET_XMIT_SUCCESS continue
 * the normal output path, and in the visible fallthrough case still
 * transmit but propagate the BPF verdict when output itself succeeds
 * (the `?:` keeps a nonzero output error, else returns `ret`).
 * NOTE(review): the switch statement's other cases are elided.
 */
185 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
189 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
191 case NET_XMIT_SUCCESS:
192 return __ip6_finish_output(net, sk, skb);
194 return __ip6_finish_output(net, sk, skb) ? : ret;
/*
 * ip6_output - entry point that feeds locally generated (or rerouted)
 * packets into NF_INET_POST_ROUTING.  Packets are discarded (and
 * OUTDISCARDS counted) when IPv6 is administratively disabled on the
 * egress device; the hook invocation is conditional so skbs already
 * flagged IP6SKB_REROUTED skip straight to the okfn (elided here).
 */
201 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
203 struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
204 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
206 skb->protocol = htons(ETH_P_IPV6);
209 if (unlikely(idev->cnf.disable_ipv6)) {
210 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
215 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
216 net, sk, skb, indev, dev,
218 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
220 EXPORT_SYMBOL(ip6_output);
/*
 * Whether flow labels should be auto-generated for this socket: honour
 * an explicitly-set per-socket value, otherwise fall back to the
 * per-netns default.
 */
222 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
224 if (!np->autoflowlabel_set)
225 return ip6_default_np_autolabel(net);
227 return np->autoflowlabel;
/*
 * ip6_xmit - build the IPv6 header (plus any extension headers from
 * @opt) on an skb and send it through NF_INET_LOCAL_OUT.  Reallocates
 * headroom when the caller left too little, applies the path-MTU check
 * and reports EMSGSIZE locally when the packet is too big.
 * NOTE(review): listing elided — some declarations, braces and error
 * labels are not visible here.
 */
231 * xmit an sk_buff (used by TCP, SCTP and DCCP)
232 * Note : socket lock is not held for SYNACK packets, but might be modified
233 * by calls to skb_set_owner_w() and ipv6_local_error(),
234 * which are using proper atomic operations or spinlocks.
236 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
237 __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
239 struct net *net = sock_net(sk);
240 const struct ipv6_pinfo *np = inet6_sk(sk);
241 struct in6_addr *first_hop = &fl6->daddr;
242 struct dst_entry *dst = skb_dst(skb);
243 unsigned int head_room;
245 u8 proto = fl6->flowi6_proto;
246 int seg_len = skb->len;
/* Room needed in front of the payload: IPv6 header + link-layer header
 * + any extension headers carried in @opt. */
250 head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
252 head_room += opt->opt_nflen + opt->opt_flen;
254 if (unlikely(skb_headroom(skb) < head_room)) {
255 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
257 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
258 IPSTATS_MIB_OUTDISCARDS);
/* Preserve socket ownership on the reallocated copy. */
263 skb_set_owner_w(skb2, skb->sk);
269 seg_len += opt->opt_nflen + opt->opt_flen;
/* Push fragmentable then non-fragmentable extension headers; the
 * latter may rewrite first_hop (e.g. routing header). */
272 ipv6_push_frag_opts(skb, opt, &proto);
275 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
279 skb_push(skb, sizeof(struct ipv6hdr));
280 skb_reset_network_header(skb);
284 * Fill in the IPv6 header
/* Hop limit: per-socket value when set, else the route's default. */
287 hlimit = np->hop_limit;
289 hlimit = ip6_dst_hoplimit(dst);
291 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
292 ip6_autoflowlabel(net, np), fl6));
294 hdr->payload_len = htons(seg_len);
295 hdr->nexthdr = proto;
296 hdr->hop_limit = hlimit;
298 hdr->saddr = fl6->saddr;
299 hdr->daddr = *first_hop;
301 skb->protocol = htons(ETH_P_IPV6);
302 skb->priority = priority;
/* Packet fits the MTU (or the caller ignores DF / GSO will segment):
 * account it and hand to netfilter LOCAL_OUT. */
306 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
307 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
308 IPSTATS_MIB_OUT, skb->len);
310 /* if egress device is enslaved to an L3 master device pass the
311 * skb to its handler for processing
313 skb = l3mdev_ip6_out((struct sock *)sk, skb);
317 /* hooks should never assume socket lock is held.
318 * we promote our socket to non const
320 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
321 net, (struct sock *)sk, skb, NULL, dst->dev,
/* Too big and not allowed to fragment: tell the local sender. */
326 /* ipv6_local_error() does not require socket lock,
327 * we promote our socket to non const
329 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
331 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
335 EXPORT_SYMBOL(ip6_xmit);
/*
 * Deliver a Router Alert packet to every raw socket registered on the
 * matching RA selector in ip6_ra_chain (traversed under read lock).
 * Each matching receiver except the last gets a clone; the final one
 * consumes the original skb.  SO_BINDTODEVICE and the per-socket
 * rtalert_isolate netns restriction are honoured.
 * NOTE(review): listing elided — return values and some braces are not
 * visible here.
 */
337 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
339 struct ip6_ra_chain *ra;
340 struct sock *last = NULL;
342 read_lock(&ip6_ra_lock);
343 for (ra = ip6_ra_chain; ra; ra = ra->next) {
344 struct sock *sk = ra->sk;
345 if (sk && ra->sel == sel &&
346 (!sk->sk_bound_dev_if ||
347 sk->sk_bound_dev_if == skb->dev->ifindex)) {
348 struct ipv6_pinfo *np = inet6_sk(sk);
/* Skip sockets isolated to a different netns than the packet. */
350 if (np && np->rtalert_isolate &&
351 !net_eq(sock_net(sk), dev_net(skb->dev))) {
/* Earlier match gets a clone so the original can go to a later one. */
355 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
357 rawv6_rcv(last, skb2);
/* Last matching socket consumes the original skb. */
364 rawv6_rcv(last, skb);
365 read_unlock(&ip6_ra_lock);
368 read_unlock(&ip6_ra_lock);
/*
 * Classify a packet destined to a proxied (pneigh) address: skip past
 * any extension headers, inspect ICMPv6 NDISC message types so that
 * neighbour discovery aimed at the proxied address can be handled
 * locally, and reject link-local destinations with a link failure
 * because the proxying router cannot forward those.
 * Return value is consumed by ip6_forward(): >0 means deliver locally,
 * <0 means drop (inferred from the caller — confirm against full source).
 */
372 static int ip6_forward_proxy_check(struct sk_buff *skb)
374 struct ipv6hdr *hdr = ipv6_hdr(skb);
375 u8 nexthdr = hdr->nexthdr;
379 if (ipv6_ext_hdr(nexthdr)) {
380 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
384 offset = sizeof(struct ipv6hdr);
386 if (nexthdr == IPPROTO_ICMPV6) {
387 struct icmp6hdr *icmp6;
/* Make sure at least the ICMPv6 type byte is in the linear area. */
389 if (!pskb_may_pull(skb, (skb_network_header(skb) +
390 offset + 1 - skb->data)))
393 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
395 switch (icmp6->icmp6_type) {
396 case NDISC_ROUTER_SOLICITATION:
397 case NDISC_ROUTER_ADVERTISEMENT:
398 case NDISC_NEIGHBOUR_SOLICITATION:
399 case NDISC_NEIGHBOUR_ADVERTISEMENT:
401 /* For reaction involving unicast neighbor discovery
402 * message destined to the proxied address, pass it to
412 * The proxying router can't forward traffic sent to a link-local
413 * address, so signal the sender and discard the packet. This
414 * behavior is clarified by the MIPv6 specification.
416 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
417 dst_link_failure(skb);
/*
 * Last step of the forwarding path after NF_INET_FORWARD: account
 * OUTFORWDATAGRAMS/OUTOCTETS, treat hardware-forwarded skbs
 * (switchdev offload_l3_fwd_mark) specially, then hand the packet to
 * dst_output().
 */
424 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
427 struct dst_entry *dst = skb_dst(skb);
429 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
430 __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
432 #ifdef CONFIG_NET_SWITCHDEV
433 if (skb->offload_l3_fwd_mark) {
440 return dst_output(net, sk, skb);
/*
 * MTU admission check for forwarding.  A packet reassembled by
 * conntrack defrag is judged by its recorded frag_max_size; a GSO
 * packet passes when its network-layer segment length validates
 * against @mtu.  Remaining cases are in elided lines.
 */
443 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
448 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
449 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
455 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
/*
 * ip6_forward - forward one received IPv6 packet.
 * Order of checks visible here: forwarding enabled, PACKET_HOST only,
 * no socket-owned skbs, no LRO-merged skbs, xfrm FWD policy, router
 * alert delivery, hop-limit decrement (ICMP TIME_EXCEED at <= 1),
 * proxy-NDP local delivery, xfrm route check, ICMP redirect generation
 * when in == out interface, source-address sanity (security critical),
 * and the path-MTU check (ICMP PKT_TOOBIG) before cow + queueing
 * through NF_INET_FORWARD towards ip6_forward_finish().
 * NOTE(review): listing elided — error labels, kfree_skb calls and some
 * braces are not visible here.
 */
461 int ip6_forward(struct sk_buff *skb)
463 struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
464 struct dst_entry *dst = skb_dst(skb);
465 struct ipv6hdr *hdr = ipv6_hdr(skb);
466 struct inet6_skb_parm *opt = IP6CB(skb);
467 struct net *net = dev_net(dst->dev);
470 if (net->ipv6.devconf_all->forwarding == 0)
473 if (skb->pkt_type != PACKET_HOST)
476 if (unlikely(skb->sk))
479 if (skb_warn_if_lro(skb))
482 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
483 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
487 skb_forward_csum(skb);
490 * We DO NOT make any processing on
491 * RA packets, pushing them to user level AS IS
492 * without ane WARRANTY that application will be able
493 * to interpret them. The reason is that we
494 * cannot make anything clever here.
496 * We are not end-node, so that if packet contains
497 * AH/ESP, we cannot make anything.
498 * Defragmentation also would be mistake, RA packets
499 * cannot be fragmented, because there is no warranty
500 * that different fragments will go along one path. --ANK
502 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
503 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
508 * check and decrement ttl
510 if (hdr->hop_limit <= 1) {
511 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
512 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
518 /* XXX: idev->cnf.proxy_ndp? */
519 if (net->ipv6.devconf_all->proxy_ndp &&
520 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
521 int proxied = ip6_forward_proxy_check(skb);
523 return ip6_input(skb);
524 else if (proxied < 0) {
525 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
530 if (!xfrm6_route_forward(skb)) {
531 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
536 /* IPv6 specs say nothing about it, but it is clear that we cannot
537 send redirects to source routed frames.
538 We don't send redirects to frames decapsulated from IPsec.
540 if (IP6CB(skb)->iif == dst->dev->ifindex &&
541 opt->srcrt == 0 && !skb_sec_path(skb)) {
542 struct in6_addr *target = NULL;
543 struct inet_peer *peer;
547 * incoming and outgoing devices are the same
551 rt = (struct rt6_info *) dst;
552 if (rt->rt6i_flags & RTF_GATEWAY)
553 target = &rt->rt6i_gateway;
555 target = &hdr->daddr;
557 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
559 /* Limit redirects both by destination (here)
560 and by source (inside ndisc_send_redirect)
562 if (inet_peer_xrlim_allow(peer, 1*HZ))
563 ndisc_send_redirect(skb, target);
567 int addrtype = ipv6_addr_type(&hdr->saddr);
569 /* This check is security critical. */
570 if (addrtype == IPV6_ADDR_ANY ||
571 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
573 if (addrtype & IPV6_ADDR_LINKLOCAL) {
574 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
575 ICMPV6_NOT_NEIGHBOUR, 0);
/* Forwarding MTU is taken from the dst, floored at IPV6_MIN_MTU. */
580 mtu = ip6_dst_mtu_forward(dst);
581 if (mtu < IPV6_MIN_MTU)
584 if (ip6_pkt_too_big(skb, mtu)) {
585 /* Again, force OUTPUT device used as source address */
587 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
588 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
589 __IP6_INC_STATS(net, ip6_dst_idev(dst),
590 IPSTATS_MIB_FRAGFAILS);
/* Need a private copy of the header before decrementing hop_limit. */
595 if (skb_cow(skb, dst->dev->hard_header_len)) {
596 __IP6_INC_STATS(net, ip6_dst_idev(dst),
597 IPSTATS_MIB_OUTDISCARDS);
603 /* Mangling hops number delayed to point after skb COW */
607 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
608 net, NULL, skb, skb->dev, dst->dev,
612 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
/*
 * Copy per-packet metadata (type, priority, protocol, dst reference,
 * mark, hash, tc index, skb extensions, secmark) from @from to @to.
 * Used when building fragments so each one inherits the original
 * packet's attributes.
 */
618 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
620 to->pkt_type = from->pkt_type;
621 to->priority = from->priority;
622 to->protocol = from->protocol;
624 skb_dst_set(to, dst_clone(skb_dst(from)));
626 to->mark = from->mark;
628 skb_copy_hash(to, from);
630 #ifdef CONFIG_NET_SCHED
631 to->tc_index = from->tc_index;
634 skb_ext_copy(to, from);
635 skb_copy_secmark(to, from);
/*
 * ip6_fraglist_init - set up fast-path fragmentation over an existing
 * frag_list.  Saves a copy of the unfragmentable headers in
 * iter->tmp_hdr, detaches the frag list from the head skb, inserts a
 * fragment header (MF set, offset 0) into the head and trims the head
 * to its page-resident length.
 * NOTE(review): listing elided — the kmemdup failure return and some
 * iter field assignments are not visible here.
 */
638 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
639 u8 nexthdr, __be32 frag_id,
640 struct ip6_fraglist_iter *iter)
642 unsigned int first_len;
/* Patch the previous header's nexthdr to point at the fragment hdr. */
646 *prevhdr = NEXTHDR_FRAGMENT;
647 iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
651 iter->frag = skb_shinfo(skb)->frag_list;
652 skb_frag_list_init(skb);
656 iter->frag_id = frag_id;
657 iter->nexthdr = nexthdr;
/* Open a gap for the fragment header between the unfragmentable part
 * and the payload, then restore the saved headers in front of it. */
659 __skb_pull(skb, hlen);
660 fh = __skb_push(skb, sizeof(struct frag_hdr));
661 __skb_push(skb, hlen);
662 skb_reset_network_header(skb);
663 memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
665 fh->nexthdr = nexthdr;
667 fh->frag_off = htons(IP6_MF);
668 fh->identification = frag_id;
/* Head skb now carries only its own (page) data; fix up lengths. */
670 first_len = skb_pagelen(skb);
671 skb->data_len = first_len - skb_headlen(skb);
672 skb->len = first_len;
673 ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
677 EXPORT_SYMBOL(ip6_fraglist_init);
/*
 * ip6_fraglist_prepare - turn the next frag_list member into a complete
 * IPv6 fragment: rebuild its network headers from the saved copy,
 * fill in the fragment header (running offset, MF while more fragments
 * follow, shared id) and copy the head skb's metadata onto it.
 */
679 void ip6_fraglist_prepare(struct sk_buff *skb,
680 struct ip6_fraglist_iter *iter)
682 struct sk_buff *frag = iter->frag;
683 unsigned int hlen = iter->hlen;
686 frag->ip_summed = CHECKSUM_NONE;
687 skb_reset_transport_header(frag);
688 fh = __skb_push(frag, sizeof(struct frag_hdr));
689 __skb_push(frag, hlen);
690 skb_reset_network_header(frag);
691 memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
/* Advance the payload offset by the previous fragment's data length. */
692 iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
693 fh->nexthdr = iter->nexthdr;
695 fh->frag_off = htons(iter->offset);
697 fh->frag_off |= htons(IP6_MF);
698 fh->identification = iter->frag_id;
699 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
700 ip6_copy_metadata(frag, skb);
/*
 * ip6_frag_init - initialise slow-path fragmentation state: bytes left
 * to send, read pointer placed just past the unfragmentable part, and
 * the head/tail room each allocated fragment will need.
 */
704 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
705 unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
706 u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
708 state->prevhdr = prevhdr;
709 state->nexthdr = nexthdr;
710 state->frag_id = frag_id;
715 state->left = skb->len - hlen; /* Space per frame */
716 state->ptr = hlen; /* Where to start from */
718 state->hroom = hdr_room;
719 state->troom = needed_tailroom;
723 EXPORT_SYMBOL(ip6_frag_init);
/*
 * ip6_frag_next - allocate and fill the next slow-path fragment.
 * Sizes the fragment to the MTU (aligned to 8 bytes unless it is the
 * final one), copies the unfragmentable header block plus a slice of
 * payload from the original skb, patches the previous nexthdr byte to
 * NEXTHDR_FRAGMENT and writes the fragment header.  Returns the new
 * skb or ERR_PTR(-ENOMEM) on allocation failure.
 * NOTE(review): listing elided — `len` computation and some assignments
 * are not visible here.
 */
725 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
727 u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
728 struct sk_buff *frag;
733 /* IF: it doesn't fit, use 'mtu' - the data space left */
734 if (len > state->mtu)
736 /* IF: we are not sending up to and including the packet end
737 then align the next start on an eight byte boundary */
738 if (len < state->left)
741 /* Allocate buffer */
742 frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
743 state->hroom + state->troom, GFP_ATOMIC);
745 return ERR_PTR(-ENOMEM);
748 * Set up data on packet
751 ip6_copy_metadata(frag, skb);
752 skb_reserve(frag, state->hroom);
753 skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
754 skb_reset_network_header(frag);
755 fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
756 frag->transport_header = (frag->network_header + state->hlen +
757 sizeof(struct frag_hdr));
760 * Charge the memory for the fragment to any owner
764 skb_set_owner_w(frag, skb->sk);
767 * Copy the packet header into the new buffer.
769 skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
/* Redirect the saved "previous header" byte to the fragment header. */
771 fragnexthdr_offset = skb_network_header(frag);
772 fragnexthdr_offset += prevhdr - skb_network_header(skb);
773 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
776 * Build fragment header.
778 fh->nexthdr = state->nexthdr;
780 fh->identification = state->frag_id;
783 * Copy a block of the IP datagram.
785 BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
789 fh->frag_off = htons(state->offset);
791 fh->frag_off |= htons(IP6_MF);
792 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
795 state->offset += len;
799 EXPORT_SYMBOL(ip6_frag_next);
/*
 * ip6_fragment - fragment an IPv6 packet and transmit each piece via
 * @output (stats: FRAGCREATES per fragment, FRAGOKS/FRAGFAILS overall).
 * Tries the fast path over a well-formed frag_list first (geometry,
 * headroom and skb-sharing checks), falling back to the slow path that
 * allocates each fragment with ip6_frag_next().  Packets that must not
 * be fragmented are answered with ICMPV6_PKT_TOOBIG.
 * NOTE(review): listing elided — several goto labels, braces and `hlen`
 * assignment are not visible here.
 */
801 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
802 int (*output)(struct net *, struct sock *, struct sk_buff *))
804 struct sk_buff *frag;
805 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
806 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
807 inet6_sk(skb->sk) : NULL;
808 struct ip6_frag_state state;
809 unsigned int mtu, hlen, nexthdr_offset;
810 ktime_t tstamp = skb->tstamp;
813 u8 *prevhdr, nexthdr = 0;
815 err = ip6_find_1stfragopt(skb, &prevhdr);
/* Remember prevhdr as an offset — skb data may be reallocated below. */
820 nexthdr_offset = prevhdr - skb_network_header(skb);
822 mtu = ip6_skb_dst_mtu(skb);
824 /* We must not fragment if the socket is set to force MTU discovery
825 * or if the skb it not generated by a local socket.
827 if (unlikely(!skb->ignore_df && skb->len > mtu))
830 if (IP6CB(skb)->frag_max_size) {
831 if (IP6CB(skb)->frag_max_size > mtu)
834 /* don't send fragments larger than what we received */
835 mtu = IP6CB(skb)->frag_max_size;
836 if (mtu < IPV6_MIN_MTU)
840 if (np && np->frag_size < mtu) {
/* mtu becomes the per-fragment payload budget from here on. */
844 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
846 mtu -= hlen + sizeof(struct frag_hdr);
848 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
849 &ipv6_hdr(skb)->saddr);
/* Checksum must be finalised before the payload is split up. */
851 if (skb->ip_summed == CHECKSUM_PARTIAL &&
852 (err = skb_checksum_help(skb)))
855 prevhdr = skb_network_header(skb) + nexthdr_offset;
856 hroom = LL_RESERVED_SPACE(rt->dst.dev);
857 if (skb_has_frag_list(skb)) {
858 unsigned int first_len = skb_pagelen(skb);
859 struct ip6_fraglist_iter iter;
860 struct sk_buff *frag2;
862 if (first_len - hlen > mtu ||
863 ((first_len - hlen) & 7) ||
865 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
868 skb_walk_frags(skb, frag) {
869 /* Correct geometry. */
870 if (frag->len > mtu ||
871 ((frag->len & 7) && frag->next) ||
872 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
873 goto slow_path_clean;
875 /* Partially cloned skb? */
876 if (skb_shared(frag))
877 goto slow_path_clean;
/* Take over truesize accounting for the detached fragments. */
882 frag->destructor = sock_wfree;
884 skb->truesize -= frag->truesize;
887 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
893 /* Prepare header of the next frame,
894 * before previous one went down. */
896 ip6_fraglist_prepare(skb, &iter);
898 skb->tstamp = tstamp;
899 err = output(net, sk, skb);
901 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
902 IPSTATS_MIB_FRAGCREATES);
904 if (err || !iter.frag)
907 skb = ip6_fraglist_next(&iter);
913 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
914 IPSTATS_MIB_FRAGOKS);
918 kfree_skb_list(iter.frag);
920 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
921 IPSTATS_MIB_FRAGFAILS);
/* slow_path_clean: undo the truesize/destructor takeover above. */
925 skb_walk_frags(skb, frag2) {
929 frag2->destructor = NULL;
930 skb->truesize += frag2->truesize;
936 * Fragment the datagram.
939 ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
940 LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
944 * Keep copying data until we run out.
947 while (state.left > 0) {
948 frag = ip6_frag_next(skb, &state);
955 * Put this fragment into the sending queue.
957 frag->tstamp = tstamp;
958 err = output(net, sk, frag);
962 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
963 IPSTATS_MIB_FRAGCREATES);
965 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
966 IPSTATS_MIB_FRAGOKS);
/* Cannot fragment: disable GSO on allfrag sockets, send PKT_TOOBIG. */
971 if (skb->sk && dst_allfrag(skb_dst(skb)))
972 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
974 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
978 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
979 IPSTATS_MIB_FRAGFAILS);
/*
 * True when a cached route can no longer be trusted for @fl_addr: it is
 * not a /128 host route for exactly that address, AND the last-used
 * address cache (if any) does not match either.
 */
984 static inline int ip6_rt_check(const struct rt6key *rt_key,
985 const struct in6_addr *fl_addr,
986 const struct in6_addr *addr_cache)
988 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
989 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
/*
 * Validate a socket's cached dst against the current flow.  The cache
 * is invalidated when the dst is not IPv6 at all, when the host-route /
 * daddr_cache check fails (plus saddr under CONFIG_IPV6_SUBTREES), or
 * when the flow's oif does not match the cached device.
 * NOTE(review): listing elided — the release/return statements are not
 * visible here.
 */
992 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
993 struct dst_entry *dst,
994 const struct flowi6 *fl6)
996 struct ipv6_pinfo *np = inet6_sk(sk);
/* A non-IPv6 dst (e.g. from a mapped-v4 path) can never be reused. */
1002 if (dst->ops->family != AF_INET6) {
1007 rt = (struct rt6_info *)dst;
1008 /* Yes, checking route validity in not connected
1009 * case is not very simple. Take into account,
1010 * that we do not support routing by source, TOS,
1011 * and MSG_DONTROUTE --ANK (980726)
1013 * 1. ip6_rt_check(): If route was host route,
1014 * check that cached destination is current.
1015 * If it is network route, we still may
1016 * check its validity using saved pointer
1017 * to the last used address: daddr_cache.
1018 * We do not want to save whole address now,
1019 * (because main consumer of this service
1020 * is tcp, which has not this problem),
1021 * so that the last trick works only on connected
1023 * 2. oif also should be the same.
1025 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1026 #ifdef CONFIG_IPV6_SUBTREES
1027 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1029 (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1030 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
/*
 * Core route lookup behind ip6_dst_lookup*().  When the flow has no
 * source address yet, performs an initial ip6_route_output(), derives a
 * saddr from the result, and (to support source-specific routing)
 * retries the lookup when the first attempt errored.  Rejects a
 * v4-mapped saddr paired with a non-v4-mapped daddr.  Under
 * CONFIG_IPV6_OPTIMISTIC_DAD, if the nexthop neighbour is not yet
 * NUD_VALID and the chosen source address is optimistic, the lookup is
 * redirected to the default router (daddr zeroed).
 * NOTE(review): listing elided — several error paths, `flags` init and
 * unlock/release lines are not visible here.
 */
1039 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1040 struct dst_entry **dst, struct flowi6 *fl6)
1042 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1043 struct neighbour *n;
1044 struct rt6_info *rt;
1049 /* The correct way to handle this would be to do
1050 * ip6_route_get_saddr, and then ip6_route_output; however,
1051 * the route-specific preferred source forces the
1052 * ip6_route_output call _before_ ip6_route_get_saddr.
1054 * In source specific routing (no src=any default route),
1055 * ip6_route_output will fail given src=any saddr, though, so
1056 * that's why we try it again later.
1058 if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
1059 struct fib6_info *from;
1060 struct rt6_info *rt;
1061 bool had_dst = *dst != NULL;
1064 *dst = ip6_route_output(net, sk, fl6);
1065 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1068 from = rt ? rcu_dereference(rt->from) : NULL;
1069 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1070 sk ? inet6_sk(sk)->srcprefs : 0,
1075 goto out_err_release;
1077 /* If we had an erroneous initial result, pretend it
1078 * never existed and let the SA-enabled version take
1081 if (!had_dst && (*dst)->error) {
1086 if (fl6->flowi6_oif)
1087 flags |= RT6_LOOKUP_F_IFACE;
1091 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1093 err = (*dst)->error;
1095 goto out_err_release;
1097 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1099 * Here if the dst entry we've looked up
1100 * has a neighbour entry that is in the INCOMPLETE
1101 * state and the src address from the flow is
1102 * marked as OPTIMISTIC, we release the found
1103 * dst entry and replace it instead with the
1104 * dst entry of the nexthop router
1106 rt = (struct rt6_info *) *dst;
1108 n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1109 rt6_nexthop(rt, &fl6->daddr));
1110 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1111 rcu_read_unlock_bh();
1114 struct inet6_ifaddr *ifp;
1115 struct flowi6 fl_gw6;
1118 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1121 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1127 * We need to get the dst entry for the
1128 * default router instead
1131 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1132 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1133 *dst = ip6_route_output(net, sk, &fl_gw6);
1134 err = (*dst)->error;
1136 goto out_err_release;
/* v4-mapped source with a native-IPv6 destination is not routable. */
1140 if (ipv6_addr_v4mapped(&fl6->saddr) &&
1141 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1142 err = -EAFNOSUPPORT;
1143 goto out_err_release;
1152 if (err == -ENETUNREACH)
1153 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
/* Thin public wrapper: all work is delegated to ip6_dst_lookup_tail(). */
1158 * ip6_dst_lookup - perform route lookup on flow
1159 * @net: Network namespace to perform lookup in
1160 * @sk: socket which provides route info
1161 * @dst: pointer to dst_entry * for result
1162 * @fl6: flow to lookup
1164 * This function performs a route lookup on the given flow.
1166 * It returns zero on success, or a standard errno code on error.
1168 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1172 return ip6_dst_lookup_tail(net, sk, dst, fl6);
1174 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
/* Like ip6_dst_lookup() but rewrites fl6->daddr to @final_dst before
 * resolving the xfrm (IPsec) route, and returns the dst (or ERR_PTR)
 * instead of an errno. */
1177 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1178 * @net: Network namespace to perform lookup in
1179 * @sk: socket which provides route info
1180 * @fl6: flow to lookup
1181 * @final_dst: final destination address for ipsec lookup
1183 * This function performs a route lookup on the given flow.
1185 * It returns a valid dst pointer on success, or a pointer encoded
1188 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1189 const struct in6_addr *final_dst)
1191 struct dst_entry *dst = NULL;
1194 err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1196 return ERR_PTR(err);
1198 fl6->daddr = *final_dst;
1200 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1202 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
/* Checks the socket's cached dst first (ip6_sk_dst_check); on a miss
 * falls back to a fresh ip6_dst_lookup_flow() and, for connected
 * sockets, stores a clone of the new dst back into the socket. */
1205 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1206 * @sk: socket which provides the dst cache and route info
1207 * @fl6: flow to lookup
1208 * @final_dst: final destination address for ipsec lookup
1209 * @connected: whether @sk is connected or not
1211 * This function performs a route lookup on the given flow with the
1212 * possibility of using the cached route in the socket if it is valid.
1213 * It will take the socket dst lock when operating on the dst cache.
1214 * As a result, this function can only be used in process context.
1216 * In addition, for a connected socket, cache the dst in the socket
1217 * if the current cache is not valid.
1219 * It returns a valid dst pointer on success, or a pointer encoded
1222 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1223 const struct in6_addr *final_dst,
1226 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1228 dst = ip6_sk_dst_check(sk, dst, fl6);
1232 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1233 if (connected && !IS_ERR(dst))
1234 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1238 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
/* Builds a flowi6 from the tunnel key (daddr/saddr, mark, tos→flowinfo),
 * consults the per-tunnel dst cache when enabled, resolves via the
 * ipv6_stub lookup, and rejects routes that loop back to the tunnel
 * device itself (-ELOOP) or are unreachable (-ENETUNREACH).
 * NOTE(review): listing elided — cache-hit early return and some flow
 * setup lines are not visible here. */
1241 * ip6_dst_lookup_tunnel - perform route lookup on tunnel
1242 * @skb: Packet for which lookup is done
1243 * @dev: Tunnel device
1244 * @net: Network namespace of tunnel device
1245 * @sock: Socket which provides route info
1246 * @saddr: Memory to store the src ip address
1247 * @info: Tunnel information
1248 * @protocol: IP protocol
1249 * @use_cache: Flag to enable cache usage
1250 * This function performs a route lookup on a tunnel
1252 * It returns a valid dst pointer and stores src address to be used in
1253 * tunnel in param saddr on success, else a pointer encoded error code.
1256 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1257 struct net_device *dev,
1259 struct socket *sock,
1260 struct in6_addr *saddr,
1261 const struct ip_tunnel_info *info,
1265 struct dst_entry *dst = NULL;
1266 #ifdef CONFIG_DST_CACHE
1267 struct dst_cache *dst_cache;
1272 #ifdef CONFIG_DST_CACHE
/* Cast away const: dst_cache mutates its internal per-CPU state. */
1273 dst_cache = (struct dst_cache *)&info->dst_cache;
1275 dst = dst_cache_get_ip6(dst_cache, saddr);
1280 memset(&fl6, 0, sizeof(fl6));
1281 fl6.flowi6_mark = skb->mark;
1282 fl6.flowi6_proto = protocol;
1283 fl6.daddr = info->key.u.ipv6.dst;
1284 fl6.saddr = info->key.u.ipv6.src;
1285 prio = info->key.tos;
1286 fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
1289 dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1292 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1293 return ERR_PTR(-ENETUNREACH);
1295 if (dst->dev == dev) { /* is this necessary? */
1296 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1298 return ERR_PTR(-ELOOP);
1300 #ifdef CONFIG_DST_CACHE
1302 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1307 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
/* Duplicate an IPv6 option header; option length is (hdrlen + 1) * 8
 * octets per the extension-header encoding.  NULL in, NULL out. */
1309 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1312 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/* Duplicate a routing header, sized the same way as ip6_opt_dup():
 * (hdrlen + 1) * 8 octets.  NULL in, NULL out. */
1315 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1318 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * Recompute *mtu and *maxfraglen while appending data.  Outside an
 * XFRM tunnel dst, the first fragment reserves the dst's header_len;
 * later fragments treat that space as payload.  maxfraglen is the
 * largest 8-byte-aligned fragment payload boundary minus the fragment
 * header.
 */
1321 static void ip6_append_data_mtu(unsigned int *mtu,
1323 unsigned int fragheaderlen,
1324 struct sk_buff *skb,
1325 struct rt6_info *rt,
1326 unsigned int orig_mtu)
1328 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1330 /* first fragment, reserve header_len */
1331 *mtu = orig_mtu - rt->dst.header_len;
1335 * this fragment is not first, the headers
1336 * space is regarded as data space.
1340 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1341 + fragheaderlen - sizeof(struct frag_hdr);
/*
 * ip6_setup_cork - capture per-corked-send state.  Deep-copies the tx
 * options (each sub-option duplicated separately so failures unwind),
 * records dst/flow/hop-limit/tclass, derives the effective fragment
 * size from the pmtudisc mode, path MTU and np->frag_size (floored at
 * IPV6_MIN_MTU), and latches gso_size, mark, timestamp flags, the
 * ALLFRAG flag and the requested transmit time.
 * NOTE(review): listing elided — error returns and some clamping lines
 * are not visible here.
 */
1345 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1346 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1347 struct rt6_info *rt, struct flowi6 *fl6)
1349 struct ipv6_pinfo *np = inet6_sk(sk);
1351 struct ipv6_txoptions *opt = ipc6->opt;
/* Options must only be set up once per cork cycle. */
1357 if (WARN_ON(v6_cork->opt))
1360 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1361 if (unlikely(!v6_cork->opt))
1364 v6_cork->opt->tot_len = sizeof(*opt);
1365 v6_cork->opt->opt_flen = opt->opt_flen;
1366 v6_cork->opt->opt_nflen = opt->opt_nflen;
1368 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1370 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1373 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1375 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1378 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1380 if (opt->hopopt && !v6_cork->opt->hopopt)
1383 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1385 if (opt->srcrt && !v6_cork->opt->srcrt)
1388 /* need source address above miyazawa*/
1391 cork->base.dst = &rt->dst;
1392 cork->fl.u.ip6 = *fl6;
1393 v6_cork->hop_limit = ipc6->hlimit;
1394 v6_cork->tclass = ipc6->tclass;
/* Fragment size: device MTU when probing PMTU, else the dst path MTU
 * (inner-path MTU for xfrm-tunneled dsts). */
1395 if (rt->dst.flags & DST_XFRM_TUNNEL)
1396 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1397 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1399 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1400 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1401 if (np->frag_size < mtu) {
1403 mtu = np->frag_size;
1405 if (mtu < IPV6_MIN_MTU)
1407 cork->base.fragsize = mtu;
1408 cork->base.gso_size = ipc6->gso_size;
1409 cork->base.tx_flags = 0;
1410 cork->base.mark = ipc6->sockc.mark;
1411 sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1413 if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1414 cork->base.flags |= IPCORK_ALLFRAG;
1415 cork->base.length = 0;
1417 cork->base.transmit_time = ipc6->sockc.transmit_time;
/* __ip6_append_data - core engine that appends user data to the cork queue.
 *
 * Builds (and grows) a chain of sk_buffs on @queue sized so that the
 * final packet can be trivially split at fragment boundaries. Handles
 * extension-header space, GSO, zerocopy, timestamping and the
 * IPCORK_ALLFRAG case. Data is pulled from @from via @getfrag.
 *
 * NOTE(review): heavily elided excerpt — error labels, several
 * assignments and all closing braces are not shown; comments below
 * describe only what the visible lines establish.
 */
1422 static int __ip6_append_data(struct sock *sk,
1424 struct sk_buff_head *queue,
1425 struct inet_cork *cork,
1426 struct inet6_cork *v6_cork,
1427 struct page_frag *pfrag,
1428 int getfrag(void *from, char *to, int offset,
1429 int len, int odd, struct sk_buff *skb),
1430 void *from, int length, int transhdrlen,
1431 unsigned int flags, struct ipcm6_cookie *ipc6)
1433 struct sk_buff *skb, *skb_prev = NULL;
1434 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1435 struct ubuf_info *uarg = NULL;
1437 int dst_exthdrlen = 0;
1443 struct rt6_info *rt = (struct rt6_info *)cork->dst;
1444 struct ipv6_txoptions *opt = v6_cork->opt;
1445 int csummode = CHECKSUM_NONE;
1446 unsigned int maxnonfragsize, headersize;
1447 unsigned int wmem_alloc_delta = 0;
1448 bool paged, extra_uref = false;
/* Resume appending to the last queued skb, if any. */
1450 skb = skb_peek_tail(queue);
1452 exthdrlen = opt ? opt->opt_flen : 0;
1453 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
/* GSO packets are built as one oversized skb (up to IP6_MAX_MTU) and
 * segmented later; otherwise respect the corked fragsize.
 */
1456 paged = !!cork->gso_size;
1457 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
/* Allocate a timestamp key for OPT_ID-style software timestamping. */
1460 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1461 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1462 tskey = sk->sk_tskey++;
1464 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
/* Per-fragment header = IPv6 hdr + nonfrag ext headers; maxfraglen is
 * the largest 8-byte-aligned payload end minus the fragment header.
 */
1466 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1467 (opt ? opt->opt_nflen : 0);
1468 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1469 sizeof(struct frag_hdr);
1471 headersize = sizeof(struct ipv6hdr) +
1472 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1473 (dst_allfrag(&rt->dst) ?
1474 sizeof(struct frag_hdr) : 0) +
1475 rt->rt6i_nfheader_len;
1477 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1478 * the first fragment
/* Reject if the full header chain plus transport header exceeds MTU. */
1480 if (headersize + transhdrlen > mtu)
/* IPV6_DONTFRAG on UDP/RAW: deliver an in-band PMTU notification
 * instead of fragmenting.
 */
1483 if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1484 (sk->sk_protocol == IPPROTO_UDP ||
1485 sk->sk_protocol == IPPROTO_RAW)) {
1486 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1487 sizeof(struct ipv6hdr));
1491 if (ip6_sk_ignore_df(sk))
1492 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1494 maxnonfragsize = mtu;
/* Over the absolute size limit: report EMSGSIZE with the usable PMTU. */
1496 if (cork->length + length > maxnonfragsize - headersize) {
1498 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1499 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1503 /* CHECKSUM_PARTIAL only with no extension headers and when
1504 * we are not going to fragment
1506 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1507 headersize == sizeof(struct ipv6hdr) &&
1508 length <= mtu - headersize &&
1509 (!(flags & MSG_MORE) || cork->gso_size) &&
1510 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1511 csummode = CHECKSUM_PARTIAL;
/* MSG_ZEROCOPY: pin user pages instead of copying; fall back to copy
 * (visible below as the !uarg->zerocopy branch) if the device cannot
 * do SG + partial checksum.
 */
1513 if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1514 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1517 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
1518 if (rt->dst.dev->features & NETIF_F_SG &&
1519 csummode == CHECKSUM_PARTIAL) {
1523 skb_zcopy_set(skb, uarg, &extra_uref);
1528 * Let's try using as much space as possible.
1529 * Use MTU if total length of the message fits into the MTU.
1530 * Otherwise, we need to reserve fragment header and
1531 * fragment alignment (= 8-15 octects, in total).
1533 * Note that we may need to "move" the data from the tail
1534 * of the buffer to the new fragment when we split
1537 * FIXME: It may be fragmented into multiple chunks
1538 * at once if non-fragmentable extension headers
1543 cork->length += length;
/* Main append loop: fill the tail skb, allocating new skbs at
 * fragment boundaries until all of @length is consumed.
 */
1547 while (length > 0) {
1548 /* Check if the remaining data fits into current packet. */
1549 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1551 copy = maxfraglen - skb->len;
1555 unsigned int datalen;
1556 unsigned int fraglen;
1557 unsigned int fraggap;
1558 unsigned int fraggap;
1557 unsigned int fraggap;
1558 unsigned int alloclen;
1559 unsigned int pagedlen;
1561 /* There's no room in the current skb */
/* Bytes past the fragment boundary in the previous skb must be
 * moved into the new fragment (fraggap).
 */
1563 fraggap = skb->len - maxfraglen;
1566 /* update mtu and maxfraglen if necessary */
1567 if (!skb || !skb_prev)
1568 ip6_append_data_mtu(&mtu, &maxfraglen,
1569 fragheaderlen, skb, rt,
1575 * If remaining data exceeds the mtu,
1576 * we know we need more fragment(s).
1578 datalen = length + fraggap;
1580 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1581 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1582 fraglen = datalen + fragheaderlen;
/* Without SG support and with more data coming, allocate room for
 * a full-size fragment up front.
 */
1585 if ((flags & MSG_MORE) &&
1586 !(rt->dst.dev->features&NETIF_F_SG))
/* Paged (GSO) path: only headers live in the linear area, the rest
 * goes into page frags (pagedlen).
 */
1591 alloclen = min_t(int, fraglen, MAX_HEADER);
1592 pagedlen = fraglen - alloclen;
1595 alloclen += dst_exthdrlen;
1597 if (datalen != length + fraggap) {
1599 * this is not the last fragment, the trailer
1600 * space is regarded as data space.
1602 datalen += rt->dst.trailer_len;
1605 alloclen += rt->dst.trailer_len;
1606 fraglen = datalen + fragheaderlen;
1609 * We just reserve space for fragment header.
1610 * Note: this may be overallocation if the message
1611 * (without MSG_MORE) fits into the MTU.
1613 alloclen += sizeof(struct frag_hdr);
1615 copy = datalen - transhdrlen - fraggap - pagedlen;
/* First skb: charge the socket via sock_alloc_send_skb(); later
 * skbs are charged in bulk through wmem_alloc_delta.
 */
1621 skb = sock_alloc_send_skb(sk,
1623 (flags & MSG_DONTWAIT), &err);
1626 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1628 skb = alloc_skb(alloclen + hh_len,
1636 * Fill in the control structures
1638 skb->protocol = htons(ETH_P_IPV6);
1639 skb->ip_summed = csummode;
1641 /* reserve for fragmentation and ipsec header */
1642 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1646 * Find where to start putting bytes
1648 data = skb_put(skb, fraglen - pagedlen);
1649 skb_set_network_header(skb, exthdrlen);
1650 data += fragheaderlen;
1651 skb->transport_header = (skb->network_header +
/* Move the fraggap bytes from the previous skb's tail into this
 * fragment, fixing up both checksums, then trim the previous skb
 * back to the fragment boundary.
 */
1654 skb->csum = skb_copy_and_csum_bits(
1655 skb_prev, maxfraglen,
1656 data + transhdrlen, fraggap);
1657 skb_prev->csum = csum_sub(skb_prev->csum,
1660 pskb_trim_unique(skb_prev, maxfraglen);
1663 getfrag(from, data + transhdrlen, offset,
1664 copy, fraggap, skb) < 0) {
1671 length -= copy + transhdrlen;
1676 /* Only the initial fragment is time stamped */
1677 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1679 skb_shinfo(skb)->tskey = tskey;
1681 skb_zcopy_set(skb, uarg, &extra_uref);
1683 if ((flags & MSG_CONFIRM) && !skb_prev)
1684 skb_set_dst_pending_confirm(skb, 1);
1687 * Put the packet on the pending queue
1689 if (!skb->destructor) {
1690 skb->destructor = sock_wfree;
1692 wmem_alloc_delta += skb->truesize;
1694 __skb_queue_tail(queue, skb);
/* Still room in the current skb: copy into tailroom if the device
 * has no SG support, otherwise into shared page frags below.
 */
1701 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1702 skb_tailroom(skb) >= copy) {
1706 if (getfrag(from, skb_put(skb, copy),
1707 offset, copy, off, skb) < 0) {
1708 __skb_trim(skb, off);
/* Non-zerocopy (or zerocopy-fallback) path: copy into pfrag pages
 * and attach them as skb frags, coalescing when possible.
 */
1712 } else if (!uarg || !uarg->zerocopy) {
1713 int i = skb_shinfo(skb)->nr_frags;
1716 if (!sk_page_frag_refill(sk, pfrag))
1719 if (!skb_can_coalesce(skb, i, pfrag->page,
1722 if (i == MAX_SKB_FRAGS)
1725 __skb_fill_page_desc(skb, i, pfrag->page,
1727 skb_shinfo(skb)->nr_frags = ++i;
1728 get_page(pfrag->page);
1730 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1732 page_address(pfrag->page) + pfrag->offset,
1733 offset, copy, skb->len, skb) < 0)
1736 pfrag->offset += copy;
1737 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1739 skb->data_len += copy;
1740 skb->truesize += copy;
1741 wmem_alloc_delta += copy;
/* True zerocopy: reference the user pages directly. */
1743 err = skb_zerocopy_iter_dgram(skb, from, copy);
/* Success: commit the accumulated write-memory charge once. */
1751 if (wmem_alloc_delta)
1752 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
/* Error path: drop the zerocopy ref taken above, undo the length
 * accounting, bump OUTDISCARDS and still commit the wmem charge for
 * skbs already queued.
 */
1758 net_zcopy_put_abort(uarg, extra_uref);
1759 cork->length -= length;
1760 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1761 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
/* ip6_append_data - public entry point for corked IPv6 sends.
 *
 * On the first call for an uncorked socket (empty write queue) it sets
 * up the cork from @ipc6/@rt/@fl6; on subsequent calls it reuses the
 * corked flow and simply appends more data. Per-packet (flen) extension
 * header space is accounted into both length and transhdrlen for the
 * first call only. Returns the result of __ip6_append_data().
 */
1765 int ip6_append_data(struct sock *sk,
1766 int getfrag(void *from, char *to, int offset, int len,
1767 int odd, struct sk_buff *skb),
1768 void *from, int length, int transhdrlen,
1769 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1770 struct rt6_info *rt, unsigned int flags)
1772 struct inet_sock *inet = inet_sk(sk);
1773 struct ipv6_pinfo *np = inet6_sk(sk);
/* MSG_PROBE only probes the path, nothing is queued. */
1777 if (flags&MSG_PROBE)
1779 if (skb_queue_empty(&sk->sk_write_queue)) {
/* First call: latch options, route, flow and MTU into the cork. */
1783 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1788 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1789 length += exthdrlen;
1790 transhdrlen += exthdrlen;
/* Subsequent calls must use the flow stored in the cork, not the
 * caller's fl6.
 */
1792 fl6 = &inet->cork.fl.u.ip6;
1796 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1797 &np->cork, sk_page_frag(sk), getfrag,
1798 from, length, transhdrlen, flags, ipc6);
1800 EXPORT_SYMBOL_GPL(ip6_append_data);
/* ip6_cork_release - free all state held by a cork.
 *
 * Frees the duplicated tx options (each extension header was allocated
 * separately by ip6_setup_cork), releases the dst reference, clears
 * IPCORK_ALLFRAG and wipes the stored flow.
 *
 * NOTE(review): the kfree calls are presumably guarded by a
 * v6_cork->opt NULL check elided from this excerpt — confirm upstream.
 */
1802 static void ip6_cork_release(struct inet_cork_full *cork,
1803 struct inet6_cork *v6_cork)
1806 kfree(v6_cork->opt->dst0opt);
1807 kfree(v6_cork->opt->dst1opt);
1808 kfree(v6_cork->opt->hopopt);
1809 kfree(v6_cork->opt->srcrt);
1810 kfree(v6_cork->opt);
1811 v6_cork->opt = NULL;
1814 if (cork->base.dst) {
1815 dst_release(cork->base.dst);
1816 cork->base.dst = NULL;
1817 cork->base.flags &= ~IPCORK_ALLFRAG;
1819 memset(&cork->fl, 0, sizeof(cork->fl));
/* __ip6_make_skb - collapse the cork queue into one transmittable skb.
 *
 * Dequeues every pending skb, chains the followers onto the head skb's
 * frag_list, pushes the queued extension headers and the IPv6 header,
 * attaches the route, updates MIB counters and releases the cork.
 * Returns the assembled skb (return statement elided from this excerpt).
 */
1822 struct sk_buff *__ip6_make_skb(struct sock *sk,
1823 struct sk_buff_head *queue,
1824 struct inet_cork_full *cork,
1825 struct inet6_cork *v6_cork)
1827 struct sk_buff *skb, *tmp_skb;
1828 struct sk_buff **tail_skb;
1829 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1830 struct ipv6_pinfo *np = inet6_sk(sk);
1831 struct net *net = sock_net(sk);
1832 struct ipv6hdr *hdr;
1833 struct ipv6_txoptions *opt = v6_cork->opt;
1834 struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1835 struct flowi6 *fl6 = &cork->fl.u.ip6;
1836 unsigned char proto = fl6->flowi6_proto;
1838 skb = __skb_dequeue(queue);
1841 tail_skb = &(skb_shinfo(skb)->frag_list);
1843 /* move skb->data to ip header from ext header */
1844 if (skb->data < skb_network_header(skb))
1845 __skb_pull(skb, skb_network_offset(skb));
/* Link the remaining queued skbs onto the head skb's frag_list and
 * fold their sizes into the head; clearing the destructor hands the
 * memory accounting to the head skb.
 */
1846 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1847 __skb_pull(tmp_skb, skb_network_header_len(skb));
1848 *tail_skb = tmp_skb;
1849 tail_skb = &(tmp_skb->next);
1850 skb->len += tmp_skb->len;
1851 skb->data_len += tmp_skb->len;
1852 skb->truesize += tmp_skb->truesize;
1853 tmp_skb->destructor = NULL;
1857 /* Allow local fragmentation. */
1858 skb->ignore_df = ip6_sk_ignore_df(sk);
1860 *final_dst = fl6->daddr;
1861 __skb_pull(skb, skb_network_header_len(skb));
/* Push fragmentable then non-fragmentable extension headers; the
 * nfrag push may rewrite final_dst for a routing header.
 */
1862 if (opt && opt->opt_flen)
1863 ipv6_push_frag_opts(skb, opt, &proto);
1864 if (opt && opt->opt_nflen)
1865 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
/* Finally prepend and fill the IPv6 header itself. */
1867 skb_push(skb, sizeof(struct ipv6hdr));
1868 skb_reset_network_header(skb);
1869 hdr = ipv6_hdr(skb);
1871 ip6_flow_hdr(hdr, v6_cork->tclass,
1872 ip6_make_flowlabel(net, skb, fl6->flowlabel,
1873 ip6_autoflowlabel(net, np), fl6));
1874 hdr->hop_limit = v6_cork->hop_limit;
1875 hdr->nexthdr = proto;
1876 hdr->saddr = fl6->saddr;
1877 hdr->daddr = *final_dst;
1879 skb->priority = sk->sk_priority;
1880 skb->mark = cork->base.mark;
1882 skb->tstamp = cork->base.transmit_time;
1884 skb_dst_set(skb, dst_clone(&rt->dst));
1885 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
/* ICMPv6 keeps its own per-type output counters. */
1886 if (proto == IPPROTO_ICMPV6) {
1887 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1889 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1890 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1893 ip6_cork_release(cork, v6_cork);
/* ip6_send_skb - hand a fully built skb to the IPv6 local output path.
 *
 * Transmits via ip6_local_out(), maps the qdisc return code to an errno
 * with net_xmit_errno() and accounts a failed send as OUTDISCARDS.
 * (Conditionals around the error handling are elided in this excerpt.)
 */
1898 int ip6_send_skb(struct sk_buff *skb)
1900 struct net *net = sock_net(skb->sk);
1901 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1904 err = ip6_local_out(net, skb->sk, skb);
1907 err = net_xmit_errno(err);
1909 IP6_INC_STATS(net, rt->rt6i_idev,
1910 IPSTATS_MIB_OUTDISCARDS);
/* ip6_push_pending_frames - finalise and transmit the socket's corked data.
 *
 * Builds the pending write queue into one skb via ip6_finish_skb() and
 * sends it with ip6_send_skb(). (The NULL check on the built skb is
 * elided in this excerpt.)
 */
1916 int ip6_push_pending_frames(struct sock *sk)
1918 struct sk_buff *skb;
1920 skb = ip6_finish_skb(sk);
1924 return ip6_send_skb(skb);
1926 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
/* __ip6_flush_pending_frames - abort a corked send.
 *
 * Drops every queued skb (counting each as OUTDISCARDS) and releases
 * the cork state without transmitting anything.
 */
1928 static void __ip6_flush_pending_frames(struct sock *sk,
1929 struct sk_buff_head *queue,
1930 struct inet_cork_full *cork,
1931 struct inet6_cork *v6_cork)
1933 struct sk_buff *skb;
1935 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1937 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1938 IPSTATS_MIB_OUTDISCARDS);
1942 ip6_cork_release(cork, v6_cork);
/* ip6_flush_pending_frames - public wrapper to abort the socket's
 * corked send, using the socket's own write queue and cork state.
 */
1945 void ip6_flush_pending_frames(struct sock *sk)
1947 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1948 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1950 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
/* ip6_make_skb - build a complete IPv6 skb in one shot (no socket cork).
 *
 * Uses a private, on-stack queue and the caller-supplied @cork so the
 * socket's own cork/write queue are untouched. Sets up the cork,
 * appends all data, and on success assembles the final skb with
 * __ip6_make_skb(); on any error the private queue is flushed and an
 * ERR_PTR is returned.
 */
1952 struct sk_buff *ip6_make_skb(struct sock *sk,
1953 int getfrag(void *from, char *to, int offset,
1954 int len, int odd, struct sk_buff *skb),
1955 void *from, int length, int transhdrlen,
1956 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1957 struct rt6_info *rt, unsigned int flags,
1958 struct inet_cork_full *cork)
1960 struct inet6_cork v6_cork;
1961 struct sk_buff_head queue;
1962 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
/* MSG_PROBE: path probe only, build nothing. */
1965 if (flags & MSG_PROBE)
1968 __skb_queue_head_init(&queue);
/* Start from a clean cork; it owns no dst until ip6_setup_cork(). */
1970 cork->base.flags = 0;
1971 cork->base.addr = 0;
1972 cork->base.opt = NULL;
1973 cork->base.dst = NULL;
1975 err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1977 ip6_cork_release(cork, &v6_cork);
1978 return ERR_PTR(err);
/* Unset dontfrag inherits the socket's per-socket setting. */
1980 if (ipc6->dontfrag < 0)
1981 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1983 err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1984 &current->task_frag, getfrag, from,
1985 length + exthdrlen, transhdrlen + exthdrlen,
1988 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1989 return ERR_PTR(err);
1992 return __ip6_make_skb(sk, &queue, cork, &v6_cork);