1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * IPv6 output functions
4 * Linux INET6 implementation
7 * Pedro Roque <roque@di.fc.ul.pt>
9 * Based on linux/net/ipv4/ip_output.c
12 * A.N.Kuznetsov : arithmetics in fragmentation.
13 * extension headers are implemented.
14 * route changes now work.
15 * ip6_forward does not confuse sniffers.
18 * H. von Brand : Added missing #include <linux/string.h>
19 * Imran Patel : frag id should be in NBO
20 * Kazunori MIYAZAWA @USAGI
21 * : add ip6_append_data and related functions
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
/* ip6_finish_output2 - final transmit step: resolve the neighbour for the
 * route's nexthop and hand the skb to the link layer via neigh_output().
 * Multicast destinations are optionally looped back to local listeners
 * (cloned and re-injected through NF_INET_POST_ROUTING) and packets with
 * hop_limit == 0 are discarded with an OUTDISCARDS counter bump.
 * NOTE(review): this excerpt elides several lines (braces, returns); the
 * neighbour lookup below runs under rcu_read_lock_bh() taken in elided code.
 */
59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 struct dst_entry *dst = skb_dst(skb);
62 struct net_device *dev = dst->dev;
63 const struct in6_addr *nexthop;
64 struct neighbour *neigh;
/* Multicast handling: decide whether a copy must be delivered locally. */
67 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
68 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
/* Loop a copy back when the socket wants multicast loopback and either a
 * multicast-router socket is listening (and the skb was not already
 * forwarded) or a local interface has joined the group. */
70 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
71 ((mroute6_is_socket(net, skb) &&
72 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
73 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
74 &ipv6_hdr(skb)->saddr))) {
75 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
77 /* Do not check for IFF_ALLMULTI; multicast routing
78 is not supported in any case.
81 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
82 net, sk, newskb, NULL, newskb->dev,
/* A hop limit of zero must never go on the wire. */
85 if (ipv6_hdr(skb)->hop_limit == 0) {
86 IP6_INC_STATS(net, idev,
87 IPSTATS_MIB_OUTDISCARDS);
93 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
/* Node-local-scope multicast must not leave the node. */
95 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
96 IPV6_ADDR_SCOPE_NODELOCAL &&
97 !(dev->flags & IFF_LOOPBACK)) {
/* Let a lightweight tunnel take over transmission if one is attached. */
103 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
104 int res = lwtunnel_xmit(skb);
106 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
111 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
112 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
113 if (unlikely(!neigh))
114 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
115 if (!IS_ERR(neigh)) {
116 sock_confirm_neigh(skb, neigh);
117 ret = neigh_output(neigh, skb, false);
118 rcu_read_unlock_bh();
121 rcu_read_unlock_bh();
/* Neighbour creation failed: no usable route to the nexthop. */
123 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
/* ip6_finish_output_gso_slowpath_drop - software-segment a GSO skb whose
 * segments would exceed @mtu, then fragment each resulting segment through
 * ip6_fragment(). Used when GSO validation against the path MTU failed.
 * NOTE(review): error handling and the return path are elided in this excerpt.
 */
129 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
130 struct sk_buff *skb, unsigned int mtu)
132 struct sk_buff *segs, *nskb;
133 netdev_features_t features;
136 /* Please see corresponding comment in ip_finish_output_gso
137 * describing the cases where GSO segment length exceeds the
/* Strip GSO features so skb_gso_segment() performs software segmentation. */
140 features = netif_skb_features(skb);
141 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
142 if (IS_ERR_OR_NULL(segs)) {
/* Walk the segment list, detaching and fragmenting each one. */
149 skb_list_walk_safe(segs, segs, nskb) {
152 skb_mark_not_on_list(segs);
153 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
/* __ip6_finish_output - route a finished packet to the right transmit path:
 * re-run dst_output() if SNAT gave the skb a new xfrm policy, fragment or
 * GSO-slowpath oversized packets, otherwise go straight to
 * ip6_finish_output2().
 */
161 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
165 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
166 /* Policy lookup after SNAT yielded a new policy */
167 if (skb_dst(skb)->xfrm) {
168 IPCB(skb)->flags |= IPSKB_REROUTED;
169 return dst_output(net, sk, skb);
173 mtu = ip6_skb_dst_mtu(skb);
/* GSO packet whose segments would not fit the MTU: segment + fragment. */
174 if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
175 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
/* Fragment when too big, when the route demands all-frag, or when conntrack
 * defrag recorded a smaller maximum fragment size. */
177 if ((skb->len > mtu && !skb_is_gso(skb)) ||
178 dst_allfrag(skb_dst(skb)) ||
179 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
180 return ip6_fragment(net, sk, skb, ip6_finish_output2);
182 return ip6_finish_output2(net, sk, skb);
/* ip6_finish_output - run the cgroup BPF egress program before
 * __ip6_finish_output(); on NET_XMIT_SUCCESS transmit normally, otherwise
 * (in the elided case arm) still transmit but propagate the BPF verdict
 * unless __ip6_finish_output() itself fails.
 */
185 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
189 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
191 case NET_XMIT_SUCCESS:
192 return __ip6_finish_output(net, sk, skb);
194 return __ip6_finish_output(net, sk, skb) ? : ret;
/* ip6_output - entry point for locally generated IPv6 output. Stamps the
 * protocol, drops everything when IPv6 is administratively disabled on the
 * egress device, and otherwise passes the skb through the
 * NF_INET_POST_ROUTING hook (skipped for IP6SKB_REROUTED packets) on its
 * way to ip6_finish_output().
 */
201 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
203 struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
204 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
206 skb->protocol = htons(ETH_P_IPV6);
/* disable_ipv6 sysctl: count the discard; the drop itself is elided here. */
209 if (unlikely(idev->cnf.disable_ipv6)) {
210 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
215 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
216 net, sk, skb, indev, dev,
218 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
/* ip6_autoflowlabel - whether to auto-generate flow labels for this socket:
 * the per-socket setting if one was made, else the net namespace default.
 */
221 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
223 if (!np->autoflowlabel_set)
224 return ip6_default_np_autolabel(net);
226 return np->autoflowlabel;
230 * xmit an sk_buff (used by TCP, SCTP and DCCP)
231 * Note : socket lock is not held for SYNACK packets, but might be modified
232 * by calls to skb_set_owner_w() and ipv6_local_error(),
233 * which are using proper atomic operations or spinlocks.
/* Builds the IPv6 header (flow label, hop limit, extension headers from
 * @opt), charges the skb to the socket, and submits it through the
 * NF_INET_LOCAL_OUT hook. Oversized non-GSO packets that may not be
 * fragmented get an EMSGSIZE local error instead of transmission.
 * NOTE(review): mtu computation and several returns are elided here.
 */
235 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
236 __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
238 struct net *net = sock_net(sk);
239 const struct ipv6_pinfo *np = inet6_sk(sk);
240 struct in6_addr *first_hop = &fl6->daddr;
241 struct dst_entry *dst = skb_dst(skb);
242 unsigned int head_room;
244 u8 proto = fl6->flowi6_proto;
245 int seg_len = skb->len;
/* Reserve room for the IPv6 header, link-layer header and any ext headers. */
249 head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
251 head_room += opt->opt_nflen + opt->opt_flen;
/* Not enough headroom: reallocate (skb2) and transfer socket ownership. */
253 if (unlikely(skb_headroom(skb) < head_room)) {
254 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
256 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
257 IPSTATS_MIB_OUTDISCARDS);
262 skb_set_owner_w(skb2, skb->sk);
268 seg_len += opt->opt_nflen + opt->opt_flen;
/* Push fragmentable then non-fragmentable extension headers; these update
 * @proto to chain the next-header values. */
271 ipv6_push_frag_opts(skb, opt, &proto);
274 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
278 skb_push(skb, sizeof(struct ipv6hdr));
279 skb_reset_network_header(skb);
283 * Fill in the IPv6 header
/* Hop limit: per-socket value if set, otherwise derived from the route. */
286 hlimit = np->hop_limit;
288 hlimit = ip6_dst_hoplimit(dst);
290 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
291 ip6_autoflowlabel(net, np), fl6));
293 hdr->payload_len = htons(seg_len);
294 hdr->nexthdr = proto;
295 hdr->hop_limit = hlimit;
297 hdr->saddr = fl6->saddr;
298 hdr->daddr = *first_hop;
300 skb->protocol = htons(ETH_P_IPV6);
301 skb->priority = priority;
/* Packet fits (or may ignore DF / is GSO): account and submit. */
305 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
306 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
307 IPSTATS_MIB_OUT, skb->len);
309 /* if egress device is enslaved to an L3 master device pass the
310 * skb to its handler for processing
312 skb = l3mdev_ip6_out((struct sock *)sk, skb);
316 /* hooks should never assume socket lock is held.
317 * we promote our socket to non const
319 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
320 net, (struct sock *)sk, skb, NULL, dst->dev,
/* Too big and not allowed to fragment: report EMSGSIZE to the sender. */
325 /* ipv6_local_error() does not require socket lock,
326 * we promote our socket to non const
328 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
330 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
334 EXPORT_SYMBOL(ip6_xmit);
/* ip6_call_ra_chain - deliver a Router Alert packet to every raw socket
 * registered in ip6_ra_chain with a matching RA selector @sel. Each
 * intermediate match gets a clone; the last match consumes @skb itself.
 * Runs under ip6_ra_lock (read side). Sockets isolated from foreign
 * namespaces (rtalert_isolate) are skipped for cross-netns packets.
 */
336 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
338 struct ip6_ra_chain *ra;
339 struct sock *last = NULL;
341 read_lock(&ip6_ra_lock);
342 for (ra = ip6_ra_chain; ra; ra = ra->next) {
343 struct sock *sk = ra->sk;
/* Match the selector and honour any device binding on the socket. */
344 if (sk && ra->sel == sel &&
345 (!sk->sk_bound_dev_if ||
346 sk->sk_bound_dev_if == skb->dev->ifindex)) {
347 struct ipv6_pinfo *np = inet6_sk(sk);
349 if (np && np->rtalert_isolate &&
350 !net_eq(sock_net(sk), dev_net(skb->dev))) {
/* A previous match exists: give it a clone, keep @skb for the last one. */
354 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
356 rawv6_rcv(last, skb2);
/* Final matching socket receives the original skb. */
363 rawv6_rcv(last, skb);
364 read_unlock(&ip6_ra_lock);
367 read_unlock(&ip6_ra_lock);
/* ip6_forward_proxy_check - classify a packet destined to a proxied (NDP
 * proxy) address: ICMPv6 neighbour-discovery messages are passed up for
 * local processing, link-local destinations are rejected with a link
 * failure, and (in elided code) other traffic is forwarded.
 */
371 static int ip6_forward_proxy_check(struct sk_buff *skb)
373 struct ipv6hdr *hdr = ipv6_hdr(skb);
374 u8 nexthdr = hdr->nexthdr;
/* Skip any extension headers to find the transport protocol. */
378 if (ipv6_ext_hdr(nexthdr)) {
379 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
383 offset = sizeof(struct ipv6hdr);
385 if (nexthdr == IPPROTO_ICMPV6) {
386 struct icmp6hdr *icmp6;
/* Make sure at least the ICMPv6 type byte is in the linear area. */
388 if (!pskb_may_pull(skb, (skb_network_header(skb) +
389 offset + 1 - skb->data)))
392 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
394 switch (icmp6->icmp6_type) {
395 case NDISC_ROUTER_SOLICITATION:
396 case NDISC_ROUTER_ADVERTISEMENT:
397 case NDISC_NEIGHBOUR_SOLICITATION:
398 case NDISC_NEIGHBOUR_ADVERTISEMENT:
400 /* For reaction involving unicast neighbor discovery
401 * message destined to the proxied address, pass it to
411 * The proxying router can't forward traffic sent to a link-local
412 * address, so signal the sender and discard the packet. This
413 * behavior is clarified by the MIPv6 specification.
415 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
416 dst_link_failure(skb);
/* ip6_forward_finish - last step of forwarding: bump forwarding counters
 * and hand the skb to dst_output(). Packets already forwarded in hardware
 * (offload_l3_fwd_mark) are handled in the elided switchdev branch.
 */
423 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
426 struct dst_entry *dst = skb_dst(skb);
428 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
429 __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
431 #ifdef CONFIG_NET_SWITCHDEV
432 if (skb->offload_l3_fwd_mark) {
439 return dst_output(net, sk, skb);
/* ip6_pkt_too_big - decide whether a forwarded packet exceeds @mtu, taking
 * into account a conntrack-defrag recorded max fragment size and GSO
 * packets whose segments individually fit the MTU.
 */
442 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
447 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
448 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
/* GSO skb whose network-layer segments fit @mtu is not "too big". */
454 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
/* ip6_forward - forward a received IPv6 packet. Performs the full checks:
 * forwarding enabled, host-addressed packet, xfrm policy, Router Alert
 * delivery, hop-limit expiry, NDP proxying, redirect generation, source
 * address sanity, MTU, and finally submits via NF_INET_FORWARD ->
 * ip6_forward_finish(). Several drop/return paths are elided in this
 * excerpt.
 */
460 int ip6_forward(struct sk_buff *skb)
462 struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
463 struct dst_entry *dst = skb_dst(skb);
464 struct ipv6hdr *hdr = ipv6_hdr(skb);
465 struct inet6_skb_parm *opt = IP6CB(skb);
466 struct net *net = dev_net(dst->dev);
469 if (net->ipv6.devconf_all->forwarding == 0)
472 if (skb->pkt_type != PACKET_HOST)
475 if (unlikely(skb->sk))
478 if (skb_warn_if_lro(skb))
481 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
482 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
486 skb_forward_csum(skb);
489 * We DO NOT make any processing on
490 * RA packets, pushing them to user level AS IS
491 * without any WARRANTY that application will be able
492 * to interpret them. The reason is that we
493 * cannot make anything clever here.
495 * We are not end-node, so that if packet contains
496 * AH/ESP, we cannot make anything.
497 * Defragmentation also would be mistake, RA packets
498 * cannot be fragmented, because there is no warranty
499 * that different fragments will go along one path. --ANK
501 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
502 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
507 * check and decrement ttl
509 if (hdr->hop_limit <= 1) {
510 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
511 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
517 /* XXX: idev->cnf.proxy_ndp? */
/* NDP proxy: deliver proxied ND traffic locally, drop bad cases. */
518 if (net->ipv6.devconf_all->proxy_ndp &&
519 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
520 int proxied = ip6_forward_proxy_check(skb);
522 return ip6_input(skb);
523 else if (proxied < 0) {
524 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
529 if (!xfrm6_route_forward(skb)) {
530 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
535 /* IPv6 specs say nothing about it, but it is clear that we cannot
536 send redirects to source routed frames.
537 We don't send redirects to frames decapsulated from IPsec.
/* Same in/out interface, no source routing, no IPsec: candidate for an
 * ICMPv6 Redirect back to the sender (rate-limited per destination). */
539 if (IP6CB(skb)->iif == dst->dev->ifindex &&
540 opt->srcrt == 0 && !skb_sec_path(skb)) {
541 struct in6_addr *target = NULL;
542 struct inet_peer *peer;
546 * incoming and outgoing devices are the same
550 rt = (struct rt6_info *) dst;
551 if (rt->rt6i_flags & RTF_GATEWAY)
552 target = &rt->rt6i_gateway;
554 target = &hdr->daddr;
556 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
558 /* Limit redirects both by destination (here)
559 and by source (inside ndisc_send_redirect)
561 if (inet_peer_xrlim_allow(peer, 1*HZ))
562 ndisc_send_redirect(skb, target);
566 int addrtype = ipv6_addr_type(&hdr->saddr);
568 /* This check is security critical. */
569 if (addrtype == IPV6_ADDR_ANY ||
570 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
572 if (addrtype & IPV6_ADDR_LINKLOCAL) {
573 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
574 ICMPV6_NOT_NEIGHBOUR, 0);
579 mtu = ip6_dst_mtu_forward(dst);
580 if (mtu < IPV6_MIN_MTU)
/* Forwarders never fragment: send Packet Too Big instead (RFC 8200). */
583 if (ip6_pkt_too_big(skb, mtu)) {
584 /* Again, force OUTPUT device used as source address */
586 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
587 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
588 __IP6_INC_STATS(net, ip6_dst_idev(dst),
589 IPSTATS_MIB_FRAGFAILS);
/* Ensure a private, writable header before decrementing hop_limit. */
594 if (skb_cow(skb, dst->dev->hard_header_len)) {
595 __IP6_INC_STATS(net, ip6_dst_idev(dst),
596 IPSTATS_MIB_OUTDISCARDS);
602 /* Mangling hops number delayed to point after skb COW */
606 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
607 net, NULL, skb, skb->dev, dst->dev,
611 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
/* ip6_copy_metadata - copy per-packet metadata (type, priority, protocol,
 * dst reference, mark, hash, tc index, extensions, secmark) from @from to
 * @to. Used to make each fragment carry the parent packet's metadata.
 */
617 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
619 to->pkt_type = from->pkt_type;
620 to->priority = from->priority;
621 to->protocol = from->protocol;
623 skb_dst_set(to, dst_clone(skb_dst(from)));
625 to->mark = from->mark;
627 skb_copy_hash(to, from);
629 #ifdef CONFIG_NET_SCHED
630 to->tc_index = from->tc_index;
633 skb_ext_copy(to, from);
634 skb_copy_secmark(to, from);
/* ip6_fraglist_init - set up fast-path fragmentation over an existing
 * frag_list: save a copy of the header chain in @iter, detach the frag
 * list from @skb, and turn @skb itself into the first fragment by
 * inserting a Fragment header (MF set, offset 0) after the unfragmentable
 * part of length @hlen.
 */
637 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
638 u8 nexthdr, __be32 frag_id,
639 struct ip6_fraglist_iter *iter)
641 unsigned int first_len;
/* Patch the previous Next Header field to point at the Fragment header. */
645 *prevhdr = NEXTHDR_FRAGMENT;
646 iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
/* Take ownership of the frag list; @skb keeps only its own data. */
650 iter->frag = skb_shinfo(skb)->frag_list;
651 skb_frag_list_init(skb);
655 iter->frag_id = frag_id;
656 iter->nexthdr = nexthdr;
/* Open a gap for the Fragment header, then restore the saved headers. */
658 __skb_pull(skb, hlen);
659 fh = __skb_push(skb, sizeof(struct frag_hdr));
660 __skb_push(skb, hlen);
661 skb_reset_network_header(skb);
662 memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
664 fh->nexthdr = nexthdr;
666 fh->frag_off = htons(IP6_MF);
667 fh->identification = frag_id;
/* Trim the first fragment to its own (page) data and fix payload_len. */
669 first_len = skb_pagelen(skb);
670 skb->data_len = first_len - skb_headlen(skb);
671 skb->len = first_len;
672 ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
676 EXPORT_SYMBOL(ip6_fraglist_init);
/* ip6_fraglist_prepare - turn the next skb on the iterator's frag list into
 * a standalone fragment: prepend the saved header chain plus a Fragment
 * header carrying the running offset, set MF when more fragments follow
 * (elided condition), and copy the parent's metadata.
 */
678 void ip6_fraglist_prepare(struct sk_buff *skb,
679 struct ip6_fraglist_iter *iter)
681 struct sk_buff *frag = iter->frag;
682 unsigned int hlen = iter->hlen;
685 frag->ip_summed = CHECKSUM_NONE;
686 skb_reset_transport_header(frag);
687 fh = __skb_push(frag, sizeof(struct frag_hdr));
688 __skb_push(frag, hlen);
689 skb_reset_network_header(frag);
690 memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
/* Advance the offset by the payload of the fragment just completed. */
691 iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
692 fh->nexthdr = iter->nexthdr;
694 fh->frag_off = htons(iter->offset);
696 fh->frag_off |= htons(IP6_MF);
697 fh->identification = iter->frag_id;
698 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
699 ip6_copy_metadata(frag, skb);
701 EXPORT_SYMBOL(ip6_fraglist_prepare);
/* ip6_frag_init - initialise slow-path fragmentation @state: remember the
 * header chain parameters, how much payload is left to send, where to read
 * from, and the head/tail room each new fragment skb must reserve.
 */
703 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
704 unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
705 u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
707 state->prevhdr = prevhdr;
708 state->nexthdr = nexthdr;
709 state->frag_id = frag_id;
714 state->left = skb->len - hlen; /* Space per frame */
715 state->ptr = hlen; /* Where to start from */
717 state->hroom = hdr_room;
718 state->troom = needed_tailroom;
722 EXPORT_SYMBOL(ip6_frag_init);
/* ip6_frag_next - slow-path fragmentation: allocate and build the next
 * fragment skb from @skb according to @state. Copies the header chain,
 * rewrites the previous Next Header to FRAGMENT, fills the Fragment
 * header (offset, MF, id) and copies one MTU-sized block of payload.
 * Returns the new fragment or ERR_PTR(-ENOMEM).
 */
724 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
726 u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
727 struct sk_buff *frag;
732 /* IF: it doesn't fit, use 'mtu' - the data space left */
733 if (len > state->mtu)
735 /* IF: we are not sending up to and including the packet end
736 then align the next start on an eight byte boundary */
737 if (len < state->left)
740 /* Allocate buffer */
741 frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
742 state->hroom + state->troom, GFP_ATOMIC);
744 return ERR_PTR(-ENOMEM);
747 * Set up data on packet
750 ip6_copy_metadata(frag, skb);
751 skb_reserve(frag, state->hroom);
752 skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
753 skb_reset_network_header(frag);
754 fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
755 frag->transport_header = (frag->network_header + state->hlen +
756 sizeof(struct frag_hdr));
759 * Charge the memory for the fragment to any owner
763 skb_set_owner_w(frag, skb->sk);
766 * Copy the packet header into the new buffer.
768 skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
/* Point the copied header chain's Next Header at the Fragment header. */
770 fragnexthdr_offset = skb_network_header(frag);
771 fragnexthdr_offset += prevhdr - skb_network_header(skb);
772 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
775 * Build fragment header.
777 fh->nexthdr = state->nexthdr;
779 fh->identification = state->frag_id;
782 * Copy a block of the IP datagram.
784 BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
788 fh->frag_off = htons(state->offset);
790 fh->frag_off |= htons(IP6_MF);
791 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
794 state->offset += len;
798 EXPORT_SYMBOL(ip6_frag_next);
/* ip6_fragment - fragment @skb to fit the path MTU and transmit each piece
 * through @output. Uses the fast path (reusing an existing frag_list via
 * ip6_fraglist_*) when the geometry allows, otherwise the slow path that
 * allocates fresh fragments via ip6_frag_*. On a non-fragmentable
 * oversized packet an ICMPv6 Packet Too Big is sent (elided goto target).
 */
800 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
801 int (*output)(struct net *, struct sock *, struct sk_buff *))
803 struct sk_buff *frag;
804 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
805 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
806 inet6_sk(skb->sk) : NULL;
807 struct ip6_frag_state state;
808 unsigned int mtu, hlen, nexthdr_offset;
809 ktime_t tstamp = skb->tstamp;
812 u8 *prevhdr, nexthdr = 0;
/* Locate the insertion point for the Fragment header. */
814 err = ip6_find_1stfragopt(skb, &prevhdr);
819 nexthdr_offset = prevhdr - skb_network_header(skb);
821 mtu = ip6_skb_dst_mtu(skb);
823 /* We must not fragment if the socket is set to force MTU discovery
824 * or if the skb it not generated by a local socket.
826 if (unlikely(!skb->ignore_df && skb->len > mtu))
829 if (IP6CB(skb)->frag_max_size) {
830 if (IP6CB(skb)->frag_max_size > mtu)
833 /* don't send fragments larger than what we received */
834 mtu = IP6CB(skb)->frag_max_size;
835 if (mtu < IPV6_MIN_MTU)
/* Honour a smaller per-socket fragment size if configured. */
839 if (np && np->frag_size < mtu) {
843 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
/* mtu now holds the payload budget per fragment. */
845 mtu -= hlen + sizeof(struct frag_hdr);
847 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
848 &ipv6_hdr(skb)->saddr);
/* Checksum must be finalised before the payload is split up. */
850 if (skb->ip_summed == CHECKSUM_PARTIAL &&
851 (err = skb_checksum_help(skb)))
/* Re-derive prevhdr: skb_checksum_help() may have reallocated the head. */
854 prevhdr = skb_network_header(skb) + nexthdr_offset;
855 hroom = LL_RESERVED_SPACE(rt->dst.dev);
856 if (skb_has_frag_list(skb)) {
857 unsigned int first_len = skb_pagelen(skb);
858 struct ip6_fraglist_iter iter;
859 struct sk_buff *frag2;
/* Fast path only if every piece already has fragment-sized geometry. */
861 if (first_len - hlen > mtu ||
862 ((first_len - hlen) & 7) ||
864 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
867 skb_walk_frags(skb, frag) {
868 /* Correct geometry. */
869 if (frag->len > mtu ||
870 ((frag->len & 7) && frag->next) ||
871 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
872 goto slow_path_clean;
874 /* Partially cloned skb? */
875 if (skb_shared(frag))
876 goto slow_path_clean;
/* Move socket memory accounting from the parent to each fragment. */
881 frag->destructor = sock_wfree;
883 skb->truesize -= frag->truesize;
886 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
892 /* Prepare header of the next frame,
893 * before previous one went down. */
895 ip6_fraglist_prepare(skb, &iter);
897 skb->tstamp = tstamp;
898 err = output(net, sk, skb);
900 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
901 IPSTATS_MIB_FRAGCREATES);
903 if (err || !iter.frag)
906 skb = ip6_fraglist_next(&iter);
912 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
913 IPSTATS_MIB_FRAGOKS);
/* Fast-path failure: free the remaining fragments and count the failure. */
917 kfree_skb_list(iter.frag);
919 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
920 IPSTATS_MIB_FRAGFAILS);
/* slow_path_clean: undo the truesize/destructor transfer done above. */
924 skb_walk_frags(skb, frag2) {
928 frag2->destructor = NULL;
929 skb->truesize += frag2->truesize;
935 * Fragment the datagram.
938 ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
939 LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
943 * Keep copying data until we run out.
946 while (state.left > 0) {
947 frag = ip6_frag_next(skb, &state);
954 * Put this fragment into the sending queue.
956 frag->tstamp = tstamp;
957 err = output(net, sk, frag);
961 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
962 IPSTATS_MIB_FRAGCREATES);
964 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
965 IPSTATS_MIB_FRAGOKS);
/* Cannot fragment: disable GSO on the socket if all-frag, report PTB. */
970 if (skb->sk && dst_allfrag(skb_dst(skb)))
971 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
973 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
977 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
978 IPSTATS_MIB_FRAGFAILS);
/* ip6_rt_check - return non-zero when a cached route key no longer matches
 * the flow address: true unless the key is an exact /128 match of
 * @fl_addr, or @fl_addr equals the cached destination @addr_cache.
 */
983 static inline int ip6_rt_check(const struct rt6key *rt_key,
984 const struct in6_addr *fl_addr,
985 const struct in6_addr *addr_cache)
987 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
988 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
/* ip6_sk_dst_check - validate a socket's cached dst against flow @fl6.
 * Drops the cache (elided release paths) when the family is not AF_INET6,
 * when source/destination no longer match the cached route, or when the
 * output interface changed; otherwise the cached dst is reused.
 */
991 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
992 struct dst_entry *dst,
993 const struct flowi6 *fl6)
995 struct ipv6_pinfo *np = inet6_sk(sk);
/* IPv4-mapped (xfrm) dsts cannot be validated here. */
1001 if (dst->ops->family != AF_INET6) {
1006 rt = (struct rt6_info *)dst;
1007 /* Yes, checking route validity in not connected
1008 * case is not very simple. Take into account,
1009 * that we do not support routing by source, TOS,
1010 * and MSG_DONTROUTE --ANK (980726)
1012 * 1. ip6_rt_check(): If route was host route,
1013 * check that cached destination is current.
1014 * If it is network route, we still may
1015 * check its validity using saved pointer
1016 * to the last used address: daddr_cache.
1017 * We do not want to save whole address now,
1018 * (because main consumer of this service
1019 * is tcp, which has not this problem),
1020 * so that the last trick works only on connected
1022 * 2. oif also should be the same.
1024 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1025 #ifdef CONFIG_IPV6_SUBTREES
1026 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1028 (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1029 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
/* ip6_dst_lookup_tail - core of the IPv6 route lookup: resolve a source
 * address when the flow has none, perform the route lookup, and (with
 * CONFIG_IPV6_OPTIMISTIC_DAD) re-route via the default router when the
 * chosen source address is still optimistic and the nexthop neighbour is
 * not yet valid. Returns 0 or a negative errno via out_err_release.
 */
1038 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1039 struct dst_entry **dst, struct flowi6 *fl6)
1041 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1042 struct neighbour *n;
1043 struct rt6_info *rt;
1048 /* The correct way to handle this would be to do
1049 * ip6_route_get_saddr, and then ip6_route_output; however,
1050 * the route-specific preferred source forces the
1051 * ip6_route_output call _before_ ip6_route_get_saddr.
1053 * In source specific routing (no src=any default route),
1054 * ip6_route_output will fail given src=any saddr, though, so
1055 * that's why we try it again later.
1057 if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
1058 struct fib6_info *from;
1059 struct rt6_info *rt;
1060 bool had_dst = *dst != NULL;
1063 *dst = ip6_route_output(net, sk, fl6);
1064 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
/* Pick a source address honouring the socket's srcprefs. */
1067 from = rt ? rcu_dereference(rt->from) : NULL;
1068 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1069 sk ? inet6_sk(sk)->srcprefs : 0,
1074 goto out_err_release;
1076 /* If we had an erroneous initial result, pretend it
1077 * never existed and let the SA-enabled version take
1080 if (!had_dst && (*dst)->error) {
1085 if (fl6->flowi6_oif)
1086 flags |= RT6_LOOKUP_F_IFACE;
1090 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1092 err = (*dst)->error;
1094 goto out_err_release;
1096 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1098 * Here if the dst entry we've looked up
1099 * has a neighbour entry that is in the INCOMPLETE
1100 * state and the src address from the flow is
1101 * marked as OPTIMISTIC, we release the found
1102 * dst entry and replace it instead with the
1103 * dst entry of the nexthop router
1105 rt = (struct rt6_info *) *dst;
1107 n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1108 rt6_nexthop(rt, &fl6->daddr));
1109 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1110 rcu_read_unlock_bh();
1113 struct inet6_ifaddr *ifp;
1114 struct flowi6 fl_gw6;
1117 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1120 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1126 * We need to get the dst entry for the
1127 * default router instead
1130 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1131 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1132 *dst = ip6_route_output(net, sk, &fl_gw6);
1133 err = (*dst)->error;
1135 goto out_err_release;
/* A v4-mapped source with a non-v4-mapped destination is unsupported. */
1139 if (ipv6_addr_v4mapped(&fl6->saddr) &&
1140 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1141 err = -EAFNOSUPPORT;
1142 goto out_err_release;
1151 if (err == -ENETUNREACH)
1152 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1157 * ip6_dst_lookup - perform route lookup on flow
1158 * @net: Network namespace to perform lookup in
1159 * @sk: socket which provides route info
1160 * @dst: pointer to dst_entry * for result
1161 * @fl6: flow to lookup
1163 * This function performs a route lookup on the given flow.
1165 * It returns zero on success, or a standard errno code on error.
/* Thin exported wrapper around ip6_dst_lookup_tail(). */
1167 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1171 return ip6_dst_lookup_tail(net, sk, dst, fl6);
1173 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1176 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1177 * @net: Network namespace to perform lookup in
1178 * @sk: socket which provides route info
1179 * @fl6: flow to lookup
1180 * @final_dst: final destination address for ipsec lookup
1182 * This function performs a route lookup on the given flow.
1184 * It returns a valid dst pointer on success, or a pointer encoded
/* Route lookup followed by an xfrm (IPsec) transformation of the result;
 * @final_dst, when given, overrides the flow's destination for the xfrm
 * lookup. */
1187 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1188 const struct in6_addr *final_dst)
1190 struct dst_entry *dst = NULL;
1193 err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1195 return ERR_PTR(err);
1197 fl6->daddr = *final_dst;
1199 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1201 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1204 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1205 * @sk: socket which provides the dst cache and route info
1206 * @fl6: flow to lookup
1207 * @final_dst: final destination address for ipsec lookup
1208 * @connected: whether @sk is connected or not
1210 * This function performs a route lookup on the given flow with the
1211 * possibility of using the cached route in the socket if it is valid.
1212 * It will take the socket dst lock when operating on the dst cache.
1213 * As a result, this function can only be used in process context.
1215 * In addition, for a connected socket, cache the dst in the socket
1216 * if the current cache is not valid.
1218 * It returns a valid dst pointer on success, or a pointer encoded
/* Try the socket's cached dst first; fall back to a full lookup and, for
 * connected sockets, store the fresh dst back into the socket cache. */
1221 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1222 const struct in6_addr *final_dst,
1225 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1227 dst = ip6_sk_dst_check(sk, dst, fl6);
1231 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1232 if (connected && !IS_ERR(dst))
1233 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1237 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1240 * ip6_dst_lookup_tunnel - perform route lookup on tunnel
1241 * @skb: Packet for which lookup is done
1242 * @dev: Tunnel device
1243 * @net: Network namespace of tunnel device
1244 * @sock: Socket which provides route info
1245 * @saddr: Memory to store the src ip address
1246 * @info: Tunnel information
1247 * @protocol: IP protocol
1248 * @use_cache: Flag to enable cache usage
1249 * This function performs a route lookup on a tunnel
1251 * It returns a valid dst pointer and stores src address to be used in
1252 * tunnel in param saddr on success, else a pointer encoded error code.
1255 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1256 struct net_device *dev,
1258 struct socket *sock,
1259 struct in6_addr *saddr,
1260 const struct ip_tunnel_info *info,
1264 struct dst_entry *dst = NULL;
1265 #ifdef CONFIG_DST_CACHE
1266 struct dst_cache *dst_cache;
/* Fast path: reuse the tunnel's cached dst + source address if present. */
1271 #ifdef CONFIG_DST_CACHE
1272 dst_cache = (struct dst_cache *)&info->dst_cache;
1274 dst = dst_cache_get_ip6(dst_cache, saddr);
/* Build the flow from the tunnel key and look the route up. */
1279 memset(&fl6, 0, sizeof(fl6));
1280 fl6.flowi6_mark = skb->mark;
1281 fl6.flowi6_proto = protocol;
1282 fl6.daddr = info->key.u.ipv6.dst;
1283 fl6.saddr = info->key.u.ipv6.src;
1284 prio = info->key.tos;
1285 fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
1288 dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1291 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1292 return ERR_PTR(-ENETUNREACH);
/* Reject routes that loop back into the tunnel device itself. */
1294 if (dst->dev == dev) { /* is this necessary? */
1295 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1297 return ERR_PTR(-ELOOP);
1299 #ifdef CONFIG_DST_CACHE
1301 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1306 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
/* ip6_opt_dup - duplicate an IPv6 option header; hdrlen is in units of
 * 8 octets not counting the first 8 (hence (hdrlen + 1) * 8 bytes).
 * Returns NULL when @src is NULL or allocation fails.
 */
1308 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1311 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/* ip6_rthdr_dup - duplicate a Routing header; same length convention as
 * ip6_opt_dup(). Returns NULL when @src is NULL or allocation fails.
 */
1314 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1317 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/* ip6_append_data_mtu - recompute the per-fragment MTU and *maxfraglen
 * while appending data: for a non-XFRM-tunnel route the first fragment
 * reserves the route's header_len out of @orig_mtu, later fragments treat
 * that space as payload (elided branch).
 */
1320 static void ip6_append_data_mtu(unsigned int *mtu,
1322 unsigned int fragheaderlen,
1323 struct sk_buff *skb,
1324 struct rt6_info *rt,
1325 unsigned int orig_mtu)
1327 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1329 /* first fragment, reserve header_len */
1330 *mtu = orig_mtu - rt->dst.header_len;
1334 * this fragment is not first, the headers
1335 * space is regarded as data space.
/* Largest 8-byte-aligned fragment payload that fits, plus headers. */
1339 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1340 + fragheaderlen - sizeof(struct frag_hdr);
/* ip6_setup_cork - initialise cork state for ip6_append_data(): deep-copy
 * the tx options (each sub-option duplicated so the cork owns them), pin
 * the route and flow, and compute the corked MTU from pmtudisc settings,
 * the per-socket frag_size and the (xfrm) path MTU. Error unwinding of
 * the partial option copies is elided in this excerpt.
 */
1344 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1345 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1346 struct rt6_info *rt, struct flowi6 *fl6)
1348 struct ipv6_pinfo *np = inet6_sk(sk);
1350 struct ipv6_txoptions *opt = ipc6->opt;
/* Options must not already be corked. */
1356 if (WARN_ON(v6_cork->opt))
1359 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1360 if (unlikely(!v6_cork->opt))
1363 v6_cork->opt->tot_len = sizeof(*opt);
1364 v6_cork->opt->opt_flen = opt->opt_flen;
1365 v6_cork->opt->opt_nflen = opt->opt_nflen;
/* Duplicate each option block; bail (elided) if a present source block
 * failed to copy. */
1367 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1369 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1372 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1374 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1377 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1379 if (opt->hopopt && !v6_cork->opt->hopopt)
1382 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1384 if (opt->srcrt && !v6_cork->opt->srcrt)
1387 /* need source address above miyazawa*/
1390 cork->base.dst = &rt->dst;
1391 cork->fl.u.ip6 = *fl6;
1392 v6_cork->hop_limit = ipc6->hlimit;
1393 v6_cork->tclass = ipc6->tclass;
/* MTU: device MTU when probing PMTU, otherwise the (xfrm path) dst MTU. */
1394 if (rt->dst.flags & DST_XFRM_TUNNEL)
1395 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1396 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1398 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1399 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1400 if (np->frag_size < mtu) {
1402 mtu = np->frag_size;
1404 if (mtu < IPV6_MIN_MTU)
1406 cork->base.fragsize = mtu;
1407 cork->base.gso_size = ipc6->gso_size;
1408 cork->base.tx_flags = 0;
1409 cork->base.mark = ipc6->sockc.mark;
1410 sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1412 if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1413 cork->base.flags |= IPCORK_ALLFRAG;
1414 cork->base.length = 0;
1416 cork->base.transmit_time = ipc6->sockc.transmit_time;
/*
 * __ip6_append_data - append user data to the socket's pending send queue,
 * building MTU-sized skbs (with room reserved for a fragment header) as it
 * goes.  Core worker behind ip6_append_data() and ip6_make_skb().
 *
 * @getfrag copies user data into the skb (and may checksum it); @length is
 * the payload length, @transhdrlen the transport header length (non-zero
 * only for the first call on a corked socket).  Supports MSG_MORE corking,
 * UDP GSO (cork->gso_size), and MSG_ZEROCOPY.
 *
 * Returns 0 on success or a negative errno; on error the bytes already
 * accounted in cork->length are rolled back (see tail of function).
 * NOTE(review): many lines (gotos, labels, else-branches, braces) are
 * elided in this extraction — treat the visible code as a partial listing.
 */
1421 static int __ip6_append_data(struct sock *sk,
1423 struct sk_buff_head *queue,
1424 struct inet_cork *cork,
1425 struct inet6_cork *v6_cork,
1426 struct page_frag *pfrag,
1427 int getfrag(void *from, char *to, int offset,
1428 int len, int odd, struct sk_buff *skb),
1429 void *from, int length, int transhdrlen,
1430 unsigned int flags, struct ipcm6_cookie *ipc6)
1432 struct sk_buff *skb, *skb_prev = NULL;
1433 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1434 struct ubuf_info *uarg = NULL;
1436 int dst_exthdrlen = 0;
1442 struct rt6_info *rt = (struct rt6_info *)cork->dst;
1443 struct ipv6_txoptions *opt = v6_cork->opt;
1444 int csummode = CHECKSUM_NONE;
1445 unsigned int maxnonfragsize, headersize;
1446 unsigned int wmem_alloc_delta = 0;
1447 bool paged, extra_uref = false;
/* Resume appending to the last queued skb, if any. */
1449 skb = skb_peek_tail(queue);
1451 exthdrlen = opt ? opt->opt_flen : 0;
1452 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
/* GSO sends build one oversized skb (up to IP6_MAX_MTU) and segment later. */
1455 paged = !!cork->gso_size;
1456 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
/* Allocate a timestamp key for SOF_TIMESTAMPING_OPT_ID users.
 * NOTE(review): plain sk_tskey++ here; newer kernels use atomic_inc —
 * confirm against tree version before changing. */
1459 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1460 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1461 tskey = sk->sk_tskey++;
1463 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
/* Per-fragment header size (IPv6 hdr + non-fragmentable ext hdrs) and the
 * largest 8-byte-aligned fragment payload boundary. */
1465 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1466 (opt ? opt->opt_nflen : 0);
1467 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1468 sizeof(struct frag_hdr);
/* Total header chain length used for the size-limit checks below. */
1470 headersize = sizeof(struct ipv6hdr) +
1471 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1472 (dst_allfrag(&rt->dst) ?
1473 sizeof(struct frag_hdr) : 0) +
1474 rt->rt6i_nfheader_len;
1476 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1477 * the first fragment
1479 if (headersize + transhdrlen > mtu)
/* IPV6_DONTFRAG: report path MTU to the app instead of fragmenting. */
1482 if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1483 (sk->sk_protocol == IPPROTO_UDP ||
1484 sk->sk_protocol == IPPROTO_RAW)) {
1485 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1486 sizeof(struct ipv6hdr));
/* With DF ignored, a datagram may grow to the IPv6 max payload. */
1490 if (ip6_sk_ignore_df(sk))
1491 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1493 maxnonfragsize = mtu;
/* Oversize: raise EMSGSIZE with the usable path MTU (error path elided). */
1495 if (cork->length + length > maxnonfragsize - headersize) {
1497 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1498 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1502 /* CHECKSUM_PARTIAL only with no extension headers and when
1503 * we are not going to fragment
1505 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1506 headersize == sizeof(struct ipv6hdr) &&
1507 length <= mtu - headersize &&
1508 (!(flags & MSG_MORE) || cork->gso_size) &&
1509 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1510 csummode = CHECKSUM_PARTIAL;
/* MSG_ZEROCOPY: pin user pages instead of copying; falls back to copy
 * unless the device supports SG with partial checksum. */
1512 if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1513 uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1516 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
1517 if (rt->dst.dev->features & NETIF_F_SG &&
1518 csummode == CHECKSUM_PARTIAL) {
1522 skb_zcopy_set(skb, uarg, &extra_uref);
1527 * Let's try using as much space as possible.
1528 * Use MTU if total length of the message fits into the MTU.
1529 * Otherwise, we need to reserve fragment header and
1530 * fragment alignment (= 8-15 octects, in total).
1532 * Note that we may need to "move" the data from the tail
1533 * of the buffer to the new fragment when we split
1536 * FIXME: It may be fragmented into multiple chunks
1537 * at once if non-fragmentable extension headers
/* Account the whole request up front; rolled back on error below. */
1542 cork->length += length;
/* Main copy loop: fill the tail skb, allocating a new one each time the
 * current packet boundary (mtu or maxfraglen) is reached. */
1546 while (length > 0) {
1547 /* Check if the remaining data fits into current packet. */
1548 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1550 copy = maxfraglen - skb->len;
1554 unsigned int datalen;
1555 unsigned int fraglen;
1556 unsigned int fraggap;
1557 unsigned int fraggap;
1560 /* There's no room in the current skb */
/* fraggap: bytes past the fragment boundary in the previous skb that must
 * be moved into the new one to keep fragments 8-byte aligned. */
1562 fraggap = skb->len - maxfraglen;
1565 /* update mtu and maxfraglen if necessary */
1566 if (!skb || !skb_prev)
1567 ip6_append_data_mtu(&mtu, &maxfraglen,
1568 fragheaderlen, skb, rt,
1574 * If remaining data exceeds the mtu,
1575 * we know we need more fragment(s).
1577 datalen = length + fraggap;
1579 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1580 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1581 fraglen = datalen + fragheaderlen;
/* Paged (GSO) path: allocate only headers linearly, rest in page frags. */
1584 if ((flags & MSG_MORE) &&
1585 !(rt->dst.dev->features&NETIF_F_SG))
1590 alloclen = min_t(int, fraglen, MAX_HEADER);
1591 pagedlen = fraglen - alloclen;
1594 alloclen += dst_exthdrlen;
1596 if (datalen != length + fraggap) {
1598 * this is not the last fragment, the trailer
1599 * space is regarded as data space.
1601 datalen += rt->dst.trailer_len;
1604 alloclen += rt->dst.trailer_len;
1605 fraglen = datalen + fragheaderlen;
1608 * We just reserve space for fragment header.
1609 * Note: this may be overallocation if the message
1610 * (without MSG_MORE) fits into the MTU.
1612 alloclen += sizeof(struct frag_hdr);
1614 copy = datalen - transhdrlen - fraggap - pagedlen;
/* First skb: blocking allocation charged to the socket; later skbs are
 * allocated directly while under the wmem limit. */
1620 skb = sock_alloc_send_skb(sk,
1622 (flags & MSG_DONTWAIT), &err);
1625 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1627 skb = alloc_skb(alloclen + hh_len,
1635 * Fill in the control structures
1637 skb->protocol = htons(ETH_P_IPV6);
1638 skb->ip_summed = csummode;
1640 /* reserve for fragmentation and ipsec header */
1641 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1645 * Find where to start putting bytes
1647 data = skb_put(skb, fraglen - pagedlen);
1648 skb_set_network_header(skb, exthdrlen);
1649 data += fragheaderlen;
1650 skb->transport_header = (skb->network_header +
/* Move the fraggap bytes from the previous skb's tail into this one,
 * fixing up both skbs' checksums, then trim the previous skb. */
1653 skb->csum = skb_copy_and_csum_bits(
1654 skb_prev, maxfraglen,
1655 data + transhdrlen, fraggap);
1656 skb_prev->csum = csum_sub(skb_prev->csum,
1659 pskb_trim_unique(skb_prev, maxfraglen);
1662 getfrag(from, data + transhdrlen, offset,
1663 copy, fraggap, skb) < 0) {
1670 length -= copy + transhdrlen;
1675 /* Only the initial fragment is time stamped */
1676 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1678 skb_shinfo(skb)->tskey = tskey;
1680 skb_zcopy_set(skb, uarg, &extra_uref);
1682 if ((flags & MSG_CONFIRM) && !skb_prev)
1683 skb_set_dst_pending_confirm(skb, 1);
1686 * Put the packet on the pending queue
1688 if (!skb->destructor) {
1689 skb->destructor = sock_wfree;
/* Batch wmem accounting; committed once after the loop. */
1691 wmem_alloc_delta += skb->truesize;
1693 __skb_queue_tail(queue, skb);
/* Room left in the current skb: copy into the linear tail if the device
 * can't do SG, otherwise append to page frags (or zerocopy pages). */
1700 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1701 skb_tailroom(skb) >= copy) {
1705 if (getfrag(from, skb_put(skb, copy),
1706 offset, copy, off, skb) < 0) {
1707 __skb_trim(skb, off);
1711 } else if (!uarg || !uarg->zerocopy) {
1712 int i = skb_shinfo(skb)->nr_frags;
1715 if (!sk_page_frag_refill(sk, pfrag))
1718 if (!skb_can_coalesce(skb, i, pfrag->page,
1721 if (i == MAX_SKB_FRAGS)
1724 __skb_fill_page_desc(skb, i, pfrag->page,
1726 skb_shinfo(skb)->nr_frags = ++i;
1727 get_page(pfrag->page);
1729 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1731 page_address(pfrag->page) + pfrag->offset,
1732 offset, copy, skb->len, skb) < 0)
1735 pfrag->offset += copy;
1736 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1738 skb->data_len += copy;
1739 skb->truesize += copy;
1740 wmem_alloc_delta += copy;
/* Zerocopy path: link user pages directly, no data copy. */
1742 err = skb_zerocopy_iter_dgram(skb, from, copy);
/* Success: commit the batched wmem charge. */
1750 if (wmem_alloc_delta)
1751 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
/* Error path (label elided): drop zerocopy ref, undo the length
 * accounting, bump OUTDISCARDS, and still commit wmem for queued skbs. */
1758 sock_zerocopy_put_abort(uarg, extra_uref);
1759 cork->length -= length;
1760 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1761 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
/*
 * ip6_append_data - public entry point to append data to the socket's
 * write queue (corked sends: UDP, raw, ICMPv6).
 *
 * On the first call for a cork cycle (write queue empty) it sets up the
 * cork state via ip6_setup_cork() and folds the fragmentable extension
 * header length into both @length and @transhdrlen; on subsequent calls it
 * reuses the flow stored in the cork.  Returns 0 or a negative errno.
 */
1765 int ip6_append_data(struct sock *sk,
1766 int getfrag(void *from, char *to, int offset, int len,
1767 int odd, struct sk_buff *skb),
1768 void *from, int length, int transhdrlen,
1769 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1770 struct rt6_info *rt, unsigned int flags)
1772 struct inet_sock *inet = inet_sk(sk);
1773 struct ipv6_pinfo *np = inet6_sk(sk);
/* MSG_PROBE: path-MTU probe only, no data is queued. */
1777 if (flags&MSG_PROBE)
/* Empty queue => start of a new cork cycle: set up cork state. */
1779 if (skb_queue_empty(&sk->sk_write_queue)) {
1783 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
/* First call carries the fragmentable ext-hdr bytes too. */
1788 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1789 length += exthdrlen;
1790 transhdrlen += exthdrlen;
/* Later calls (else branch elided) use the flow cached in the cork. */
1792 fl6 = &inet->cork.fl.u.ip6;
1796 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1797 &np->cork, sk_page_frag(sk), getfrag,
1798 from, length, transhdrlen, flags, ipc6);
1800 EXPORT_SYMBOL_GPL(ip6_append_data);
/*
 * ip6_cork_release - free everything held by the cork: the duplicated tx
 * options (each sub-option was separately allocated by ip6_setup_cork),
 * the dst reference, and the cached flow.  Safe to call on a partially
 * initialized cork.
 */
1802 static void ip6_cork_release(struct inet_cork_full *cork,
1803 struct inet6_cork *v6_cork)
/* (Guard "if (v6_cork->opt)" elided in this view.) */
1806 kfree(v6_cork->opt->dst0opt);
1807 kfree(v6_cork->opt->dst1opt);
1808 kfree(v6_cork->opt->hopopt);
1809 kfree(v6_cork->opt->srcrt);
1810 kfree(v6_cork->opt);
1811 v6_cork->opt = NULL;
/* Drop the route reference taken in ip6_setup_cork(). */
1814 if (cork->base.dst) {
1815 dst_release(cork->base.dst);
1816 cork->base.dst = NULL;
1817 cork->base.flags &= ~IPCORK_ALLFRAG;
1819 memset(&cork->fl, 0, sizeof(cork->fl));
/*
 * __ip6_make_skb - collapse the queued fragments into one skb with a
 * frag_list, push the extension headers and the IPv6 header, and release
 * the cork.  Returns the ready-to-send skb (return statement elided from
 * this view) or NULL if the queue was empty.
 */
1822 struct sk_buff *__ip6_make_skb(struct sock *sk,
1823 struct sk_buff_head *queue,
1824 struct inet_cork_full *cork,
1825 struct inet6_cork *v6_cork)
1827 struct sk_buff *skb, *tmp_skb;
1828 struct sk_buff **tail_skb;
1829 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1830 struct ipv6_pinfo *np = inet6_sk(sk);
1831 struct net *net = sock_net(sk);
1832 struct ipv6hdr *hdr;
1833 struct ipv6_txoptions *opt = v6_cork->opt;
1834 struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1835 struct flowi6 *fl6 = &cork->fl.u.ip6;
1836 unsigned char proto = fl6->flowi6_proto;
1838 skb = __skb_dequeue(queue);
1841 tail_skb = &(skb_shinfo(skb)->frag_list);
1843 /* move skb->data to ip header from ext header */
1844 if (skb->data < skb_network_header(skb))
1845 __skb_pull(skb, skb_network_offset(skb));
/* Chain the remaining queued skbs onto the head skb's frag_list, merging
 * length/truesize accounting into the head. */
1846 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1847 __skb_pull(tmp_skb, skb_network_header_len(skb));
1848 *tail_skb = tmp_skb;
1849 tail_skb = &(tmp_skb->next);
1850 skb->len += tmp_skb->len;
1851 skb->data_len += tmp_skb->len;
1852 skb->truesize += tmp_skb->truesize;
/* Head skb's destructor keeps the wmem charge; children lose theirs. */
1853 tmp_skb->destructor = NULL;
1857 /* Allow local fragmentation. */
1858 skb->ignore_df = ip6_sk_ignore_df(sk);
/* final_dst may be rewritten by a routing header in the nfrag options. */
1860 *final_dst = fl6->daddr;
1861 __skb_pull(skb, skb_network_header_len(skb));
1862 if (opt && opt->opt_flen)
1863 ipv6_push_frag_opts(skb, opt, &proto);
1864 if (opt && opt->opt_nflen)
1865 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
/* Build the IPv6 header in front of the assembled payload. */
1867 skb_push(skb, sizeof(struct ipv6hdr));
1868 skb_reset_network_header(skb);
1869 hdr = ipv6_hdr(skb);
1871 ip6_flow_hdr(hdr, v6_cork->tclass,
1872 ip6_make_flowlabel(net, skb, fl6->flowlabel,
1873 ip6_autoflowlabel(net, np), fl6));
1874 hdr->hop_limit = v6_cork->hop_limit;
1875 hdr->nexthdr = proto;
1876 hdr->saddr = fl6->saddr;
1877 hdr->daddr = *final_dst;
1879 skb->priority = sk->sk_priority;
1880 skb->mark = cork->base.mark;
1882 skb->tstamp = cork->base.transmit_time;
1884 skb_dst_set(skb, dst_clone(&rt->dst));
1885 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1886 if (proto == IPPROTO_ICMPV6) {
1887 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1889 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1890 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
/* Done with the cork; drops the options copy and the dst reference. */
1893 ip6_cork_release(cork, v6_cork);
/*
 * ip6_send_skb - transmit a fully built skb (from __ip6_make_skb) via
 * ip6_local_out(), mapping qdisc return codes through net_xmit_errno()
 * and counting a discard on failure.  Returns 0 or a negative errno.
 */
1898 int ip6_send_skb(struct sk_buff *skb)
1900 struct net *net = sock_net(skb->sk);
1901 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1904 err = ip6_local_out(net, skb->sk, skb);
/* Positive codes (e.g. NET_XMIT_CN) become 0 or -errno. */
1907 err = net_xmit_errno(err);
1909 IP6_INC_STATS(net, rt->rt6i_idev,
1910 IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_push_pending_frames - finish the current cork cycle: assemble the
 * pending write-queue fragments into one skb and send it.  Returns 0 on
 * success, a negative errno from transmission (or 0 if there was nothing
 * queued — NULL-check line elided from this view).
 */
1916 int ip6_push_pending_frames(struct sock *sk)
1918 struct sk_buff *skb;
1920 skb = ip6_finish_skb(sk);
1924 return ip6_send_skb(skb);
1926 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
/*
 * __ip6_flush_pending_frames - abort a cork cycle: free every queued skb
 * (counting each as an output discard) and release the cork state.
 */
1928 static void __ip6_flush_pending_frames(struct sock *sk,
1929 struct sk_buff_head *queue,
1930 struct inet_cork_full *cork,
1931 struct inet6_cork *v6_cork)
1933 struct sk_buff *skb;
/* Drain from the tail; each dropped skb bumps OUTDISCARDS. */
1935 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1937 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1938 IPSTATS_MIB_OUTDISCARDS);
1942 ip6_cork_release(cork, v6_cork);
/*
 * ip6_flush_pending_frames - public wrapper that aborts the socket's
 * current cork cycle on sk_write_queue using the socket's own cork state.
 */
1945 void ip6_flush_pending_frames(struct sock *sk)
1947 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1948 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1950 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1952 struct sk_buff *ip6_make_skb(struct sock *sk,
1953 int getfrag(void *from, char *to, int offset,
1954 int len, int odd, struct sk_buff *skb),
1955 void *from, int length, int transhdrlen,
1956 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1957 struct rt6_info *rt, unsigned int flags,
1958 struct inet_cork_full *cork)
1960 struct inet6_cork v6_cork;
1961 struct sk_buff_head queue;
1962 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1965 if (flags & MSG_PROBE)
1968 __skb_queue_head_init(&queue);
1970 cork->base.flags = 0;
1971 cork->base.addr = 0;
1972 cork->base.opt = NULL;
1973 cork->base.dst = NULL;
1975 err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1977 ip6_cork_release(cork, &v6_cork);
1978 return ERR_PTR(err);
1980 if (ipc6->dontfrag < 0)
1981 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1983 err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1984 ¤t->task_frag, getfrag, from,
1985 length + exthdrlen, transhdrlen + exthdrlen,
1988 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1989 return ERR_PTR(err);
1992 return __ip6_make_skb(sk, &queue, cork, &v6_cork);