net/ipv6/ip6_output.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  *      IPv6 output functions
   4  *      Linux INET6 implementation
   5  *
   6  *      Authors:
   7  *      Pedro Roque             <roque@di.fc.ul.pt>
   8  *
   9  *      Based on linux/net/ipv4/ip_output.c
  10  *
  11  *      Changes:
  12  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
  13  *                              extension headers are implemented.
  14  *                              route changes now work.
  15  *                              ip6_forward does not confuse sniffers.
  16  *                              etc.
  17  *
  18  *      H. von Brand    :       Added missing #include <linux/string.h>
  19  *      Imran Patel     :       frag id should be in NBO
  20  *      Kazunori MIYAZAWA @USAGI
  21  *                      :       add ip6_append_data and related functions
  22  *                              for datagram xmit
  23  */
  24
  25 #include <linux/errno.h>
  26 #include <linux/kernel.h>
  27 #include <linux/string.h>
  28 #include <linux/socket.h>
  29 #include <linux/net.h>
  30 #include <linux/netdevice.h>
  31 #include <linux/if_arp.h>
  32 #include <linux/in6.h>
  33 #include <linux/tcp.h>
  34 #include <linux/route.h>
  35 #include <linux/module.h>
  36 #include <linux/slab.h>
  37
  38 #include <linux/bpf-cgroup.h>
  39 #include <linux/netfilter.h>
  40 #include <linux/netfilter_ipv6.h>
  41
  42 #include <net/sock.h>
  43 #include <net/snmp.h>
  44
  45 #include <net/ipv6.h>
  46 #include <net/ndisc.h>
  47 #include <net/protocol.h>
  48 #include <net/ip6_route.h>
  49 #include <net/addrconf.h>
  50 #include <net/rawv6.h>
  51 #include <net/icmp.h>
  52 #include <net/xfrm.h>
  53 #include <net/checksum.h>
  54 #include <linux/mroute6.h>
  55 #include <net/l3mdev.h>
  56 #include <net/lwtunnel.h>
  57 #include <net/ip_tunnels.h>
  58
  59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  60 {
  61         struct dst_entry *dst = skb_dst(skb);
  62         struct net_device *dev = dst->dev;
  63         const struct in6_addr *nexthop;
  64         struct neighbour *neigh;
  65         int ret;
  66
  67         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  68                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  69
  70                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  71                     ((mroute6_is_socket(net, skb) &&
  72                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  73                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
  74                                          &ipv6_hdr(skb)->saddr))) {
  75                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  76
  77                         /* Do not check for IFF_ALLMULTI; multicast routing
  78                            is not supported in any case.
  79                          */
  80                         if (newskb)
  81                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  82                                         net, sk, newskb, NULL, newskb->dev,
  83                                         dev_loopback_xmit);
  84
  85                         if (ipv6_hdr(skb)->hop_limit == 0) {
  86                                 IP6_INC_STATS(net, idev,
  87                                               IPSTATS_MIB_OUTDISCARDS);
  88                                 kfree_skb(skb);
  89                                 return 0;
  90                         }
  91                 }
  92
  93                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
  94
  95                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
  96                     IPV6_ADDR_SCOPE_NODELOCAL &&
  97                     !(dev->flags & IFF_LOOPBACK)) {
  98                         kfree_skb(skb);
  99                         return 0;
 100                 }
 101         }
 102
 103         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 104                 int res = lwtunnel_xmit(skb);
 105
 106                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 107                         return res;
 108         }
 109
 110         rcu_read_lock_bh();
 111         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 112         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 113         if (unlikely(!neigh))
 114                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 115         if (!IS_ERR(neigh)) {
 116                 sock_confirm_neigh(skb, neigh);
 117                 ret = neigh_output(neigh, skb, false);
 118                 rcu_read_unlock_bh();
 119                 return ret;
 120         }
 121         rcu_read_unlock_bh();
 122
 123         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 124         kfree_skb(skb);
 125         return -EINVAL;
 126 }
 127
 128 static int
 129 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
 130                                     struct sk_buff *skb, unsigned int mtu)
 131 {
 132         struct sk_buff *segs, *nskb;
 133         netdev_features_t features;
 134         int ret = 0;
 135
 136         /* Please see corresponding comment in ip_finish_output_gso
 137          * describing the cases where GSO segment length exceeds the
 138          * egress MTU.
 139          */
 140         features = netif_skb_features(skb);
 141         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 142         if (IS_ERR_OR_NULL(segs)) {
 143                 kfree_skb(skb);
 144                 return -ENOMEM;
 145         }
 146
 147         consume_skb(skb);
 148
 149         skb_list_walk_safe(segs, segs, nskb) {
 150                 int err;
 151
 152                 skb_mark_not_on_list(segs);
 153                 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
 154                 if (err && ret == 0)
 155                         ret = err;
 156         }
 157
 158         return ret;
 159 }
 160
 161 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 162 {
 163         unsigned int mtu;
 164
 165 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 166         /* Policy lookup after SNAT yielded a new policy */
 167         if (skb_dst(skb)->xfrm) {
 168                 IPCB(skb)->flags |= IPSKB_REROUTED;
 169                 return dst_output(net, sk, skb);
 170         }
 171 #endif
 172
 173         mtu = ip6_skb_dst_mtu(skb);
 174         if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
 175                 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
 176
 177         if ((skb->len > mtu && !skb_is_gso(skb)) ||
 178             dst_allfrag(skb_dst(skb)) ||
 179             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 180                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 181         else
 182                 return ip6_finish_output2(net, sk, skb);
 183 }
 184
 185 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 186 {
 187         int ret;
 188
 189         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 190         switch (ret) {
 191         case NET_XMIT_SUCCESS:
 192                 return __ip6_finish_output(net, sk, skb);
 193         case NET_XMIT_CN:
 194                 return __ip6_finish_output(net, sk, skb) ? : ret;
 195         default:
 196                 kfree_skb(skb);
 197                 return ret;
 198         }
 199 }
 200
 201 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 202 {
 203         struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
 204         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 205
 206         skb->protocol = htons(ETH_P_IPV6);
 207         skb->dev = dev;
 208
 209         if (unlikely(idev->cnf.disable_ipv6)) {
 210                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 211                 kfree_skb(skb);
 212                 return 0;
 213         }
 214
 215         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 216                             net, sk, skb, indev, dev,
 217                             ip6_finish_output,
 218                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 219 }
 220
 221 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
 222 {
 223         if (!np->autoflowlabel_set)
 224                 return ip6_default_np_autolabel(net);
 225         else
 226                 return np->autoflowlabel;
 227 }
 228
 229 /*
 230  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 231  * Note : socket lock is not held for SYNACK packets, but might be modified
 232  * by calls to skb_set_owner_w() and ipv6_local_error(),
 233  * which are using proper atomic operations or spinlocks.
 234  */
 235 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 236              __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
 237 {
 238         struct net *net = sock_net(sk);
 239         const struct ipv6_pinfo *np = inet6_sk(sk);
 240         struct in6_addr *first_hop = &fl6->daddr;
 241         struct dst_entry *dst = skb_dst(skb);
 242         unsigned int head_room;
 243         struct ipv6hdr *hdr;
 244         u8  proto = fl6->flowi6_proto;
 245         int seg_len = skb->len;
 246         int hlimit = -1;
 247         u32 mtu;
 248
 249         head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 250         if (opt)
 251                 head_room += opt->opt_nflen + opt->opt_flen;
 252
 253         if (unlikely(skb_headroom(skb) < head_room)) {
 254                 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 255                 if (!skb2) {
 256                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 257                                       IPSTATS_MIB_OUTDISCARDS);
 258                         kfree_skb(skb);
 259                         return -ENOBUFS;
 260                 }
 261                 if (skb->sk)
 262                         skb_set_owner_w(skb2, skb->sk);
 263                 consume_skb(skb);
 264                 skb = skb2;
 265         }
 266
 267         if (opt) {
 268                 seg_len += opt->opt_nflen + opt->opt_flen;
 269
 270                 if (opt->opt_flen)
 271                         ipv6_push_frag_opts(skb, opt, &proto);
 272
 273                 if (opt->opt_nflen)
 274                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 275                                              &fl6->saddr);
 276         }
 277
 278         skb_push(skb, sizeof(struct ipv6hdr));
 279         skb_reset_network_header(skb);
 280         hdr = ipv6_hdr(skb);
 281
 282         /*
 283          *      Fill in the IPv6 header
 284          */
 285         if (np)
 286                 hlimit = np->hop_limit;
 287         if (hlimit < 0)
 288                 hlimit = ip6_dst_hoplimit(dst);
 289
 290         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 291                                 ip6_autoflowlabel(net, np), fl6));
 292
 293         hdr->payload_len = htons(seg_len);
 294         hdr->nexthdr = proto;
 295         hdr->hop_limit = hlimit;
 296
 297         hdr->saddr = fl6->saddr;
 298         hdr->daddr = *first_hop;
 299
 300         skb->protocol = htons(ETH_P_IPV6);
 301         skb->priority = priority;
 302         skb->mark = mark;
 303
 304         mtu = dst_mtu(dst);
 305         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 306                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 307                               IPSTATS_MIB_OUT, skb->len);
 308
 309                 /* if egress device is enslaved to an L3 master device pass the
 310                  * skb to its handler for processing
 311                  */
 312                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 313                 if (unlikely(!skb))
 314                         return 0;
 315
 316                 /* hooks should never assume socket lock is held.
 317                  * we promote our socket to non const
 318                  */
 319                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 320                                net, (struct sock *)sk, skb, NULL, dst->dev,
 321                                dst_output);
 322         }
 323
 324         skb->dev = dst->dev;
 325         /* ipv6_local_error() does not require socket lock,
 326          * we promote our socket to non const
 327          */
 328         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 329
 330         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 331         kfree_skb(skb);
 332         return -EMSGSIZE;
 333 }
 334 EXPORT_SYMBOL(ip6_xmit);
 335
 336 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 337 {
 338         struct ip6_ra_chain *ra;
 339         struct sock *last = NULL;
 340
 341         read_lock(&ip6_ra_lock);
 342         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 343                 struct sock *sk = ra->sk;
 344                 if (sk && ra->sel == sel &&
 345                     (!sk->sk_bound_dev_if ||
 346                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 347                         struct ipv6_pinfo *np = inet6_sk(sk);
 348
 349                         if (np && np->rtalert_isolate &&
 350                             !net_eq(sock_net(sk), dev_net(skb->dev))) {
 351                                 continue;
 352                         }
 353                         if (last) {
 354                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 355                                 if (skb2)
 356                                         rawv6_rcv(last, skb2);
 357                         }
 358                         last = sk;
 359                 }
 360         }
 361
 362         if (last) {
 363                 rawv6_rcv(last, skb);
 364                 read_unlock(&ip6_ra_lock);
 365                 return 1;
 366         }
 367         read_unlock(&ip6_ra_lock);
 368         return 0;
 369 }
 370
 371 static int ip6_forward_proxy_check(struct sk_buff *skb)
 372 {
 373         struct ipv6hdr *hdr = ipv6_hdr(skb);
 374         u8 nexthdr = hdr->nexthdr;
 375         __be16 frag_off;
 376         int offset;
 377
 378         if (ipv6_ext_hdr(nexthdr)) {
 379                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 380                 if (offset < 0)
 381                         return 0;
 382         } else
 383                 offset = sizeof(struct ipv6hdr);
 384
 385         if (nexthdr == IPPROTO_ICMPV6) {
 386                 struct icmp6hdr *icmp6;
 387
 388                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 389                                          offset + 1 - skb->data)))
 390                         return 0;
 391
 392                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 393
 394                 switch (icmp6->icmp6_type) {
 395                 case NDISC_ROUTER_SOLICITATION:
 396                 case NDISC_ROUTER_ADVERTISEMENT:
 397                 case NDISC_NEIGHBOUR_SOLICITATION:
 398                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 399                 case NDISC_REDIRECT:
 400                         /* For reaction involving unicast neighbor discovery
 401                          * message destined to the proxied address, pass it to
 402                          * input function.
 403                          */
 404                         return 1;
 405                 default:
 406                         break;
 407                 }
 408         }
 409
 410         /*
 411          * The proxying router can't forward traffic sent to a link-local
 412          * address, so signal the sender and discard the packet. This
 413          * behavior is clarified by the MIPv6 specification.
 414          */
 415         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 416                 dst_link_failure(skb);
 417                 return -1;
 418         }
 419
 420         return 0;
 421 }
 422
 423 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 424                                      struct sk_buff *skb)
 425 {
 426         struct dst_entry *dst = skb_dst(skb);
 427
 428         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 429         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 430
 431 #ifdef CONFIG_NET_SWITCHDEV
 432         if (skb->offload_l3_fwd_mark) {
 433                 consume_skb(skb);
 434                 return 0;
 435         }
 436 #endif
 437
 438         skb->tstamp = 0;
 439         return dst_output(net, sk, skb);
 440 }
 441
 442 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 443 {
 444         if (skb->len <= mtu)
 445                 return false;
 446
 447         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 448         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 449                 return true;
 450
 451         if (skb->ignore_df)
 452                 return false;
 453
 454         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 455                 return false;
 456
 457         return true;
 458 }
 459
 460 int ip6_forward(struct sk_buff *skb)
 461 {
 462         struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
 463         struct dst_entry *dst = skb_dst(skb);
 464         struct ipv6hdr *hdr = ipv6_hdr(skb);
 465         struct inet6_skb_parm *opt = IP6CB(skb);
 466         struct net *net = dev_net(dst->dev);
 467         u32 mtu;
 468
 469         if (net->ipv6.devconf_all->forwarding == 0)
 470                 goto error;
 471
 472         if (skb->pkt_type != PACKET_HOST)
 473                 goto drop;
 474
 475         if (unlikely(skb->sk))
 476                 goto drop;
 477
 478         if (skb_warn_if_lro(skb))
 479                 goto drop;
 480
 481         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 482                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 483                 goto drop;
 484         }
 485
 486         skb_forward_csum(skb);
 487
 488         /*
 489          *      We DO NOT make any processing on
 490          *      RA packets, pushing them to user level AS IS
 491          *      without ane WARRANTY that application will be able
 492          *      to interpret them. The reason is that we
 493          *      cannot make anything clever here.
 494          *
 495          *      We are not end-node, so that if packet contains
 496          *      AH/ESP, we cannot make anything.
 497          *      Defragmentation also would be mistake, RA packets
 498          *      cannot be fragmented, because there is no warranty
 499          *      that different fragments will go along one path. --ANK
 500          */
 501         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 502                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 503                         return 0;
 504         }
 505
 506         /*
 507          *      check and decrement ttl
 508          */
 509         if (hdr->hop_limit <= 1) {
 510                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 511                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 512
 513                 kfree_skb(skb);
 514                 return -ETIMEDOUT;
 515         }
 516
 517         /* XXX: idev->cnf.proxy_ndp? */
 518         if (net->ipv6.devconf_all->proxy_ndp &&
 519             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 520                 int proxied = ip6_forward_proxy_check(skb);
 521                 if (proxied > 0)
 522                         return ip6_input(skb);
 523                 else if (proxied < 0) {
 524                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 525                         goto drop;
 526                 }
 527         }
 528
 529         if (!xfrm6_route_forward(skb)) {
 530                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 531                 goto drop;
 532         }
 533         dst = skb_dst(skb);
 534
 535         /* IPv6 specs say nothing about it, but it is clear that we cannot
 536            send redirects to source routed frames.
 537            We don't send redirects to frames decapsulated from IPsec.
 538          */
 539         if (IP6CB(skb)->iif == dst->dev->ifindex &&
 540             opt->srcrt == 0 && !skb_sec_path(skb)) {
 541                 struct in6_addr *target = NULL;
 542                 struct inet_peer *peer;
 543                 struct rt6_info *rt;
 544
 545                 /*
 546                  *      incoming and outgoing devices are the same
 547                  *      send a redirect.
 548                  */
 549
 550                 rt = (struct rt6_info *) dst;
 551                 if (rt->rt6i_flags & RTF_GATEWAY)
 552                         target = &rt->rt6i_gateway;
 553                 else
 554                         target = &hdr->daddr;
 555
 556                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 557
 558                 /* Limit redirects both by destination (here)
 559                    and by source (inside ndisc_send_redirect)
 560                  */
 561                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 562                         ndisc_send_redirect(skb, target);
 563                 if (peer)
 564                         inet_putpeer(peer);
 565         } else {
 566                 int addrtype = ipv6_addr_type(&hdr->saddr);
 567
 568                 /* This check is security critical. */
 569                 if (addrtype == IPV6_ADDR_ANY ||
 570                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 571                         goto error;
 572                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 573                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 574                                     ICMPV6_NOT_NEIGHBOUR, 0);
 575                         goto error;
 576                 }
 577         }
 578
 579         mtu = ip6_dst_mtu_forward(dst);
 580         if (mtu < IPV6_MIN_MTU)
 581                 mtu = IPV6_MIN_MTU;
 582
 583         if (ip6_pkt_too_big(skb, mtu)) {
 584                 /* Again, force OUTPUT device used as source address */
 585                 skb->dev = dst->dev;
 586                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 587                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
 588                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 589                                 IPSTATS_MIB_FRAGFAILS);
 590                 kfree_skb(skb);
 591                 return -EMSGSIZE;
 592         }
 593
 594         if (skb_cow(skb, dst->dev->hard_header_len)) {
 595                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 596                                 IPSTATS_MIB_OUTDISCARDS);
 597                 goto drop;
 598         }
 599
 600         hdr = ipv6_hdr(skb);
 601
 602         /* Mangling hops number delayed to point after skb COW */
 603
 604         hdr->hop_limit--;
 605
 606         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 607                        net, NULL, skb, skb->dev, dst->dev,
 608                        ip6_forward_finish);
 609
 610 error:
 611         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 612 drop:
 613         kfree_skb(skb);
 614         return -EINVAL;
 615 }
 616
 617 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 618 {
 619         to->pkt_type = from->pkt_type;
 620         to->priority = from->priority;
 621         to->protocol = from->protocol;
 622         skb_dst_drop(to);
 623         skb_dst_set(to, dst_clone(skb_dst(from)));
 624         to->dev = from->dev;
 625         to->mark = from->mark;
 626
 627         skb_copy_hash(to, from);
 628
 629 #ifdef CONFIG_NET_SCHED
 630         to->tc_index = from->tc_index;
 631 #endif
 632         nf_copy(to, from);
 633         skb_ext_copy(to, from);
 634         skb_copy_secmark(to, from);
 635 }
 636
 637 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
 638                       u8 nexthdr, __be32 frag_id,
 639                       struct ip6_fraglist_iter *iter)
 640 {
 641         unsigned int first_len;
 642         struct frag_hdr *fh;
 643
 644         /* BUILD HEADER */
 645         *prevhdr = NEXTHDR_FRAGMENT;
 646         iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 647         if (!iter->tmp_hdr)
 648                 return -ENOMEM;
 649
 650         iter->frag = skb_shinfo(skb)->frag_list;
 651         skb_frag_list_init(skb);
 652
 653         iter->offset = 0;
 654         iter->hlen = hlen;
 655         iter->frag_id = frag_id;
 656         iter->nexthdr = nexthdr;
 657
 658         __skb_pull(skb, hlen);
 659         fh = __skb_push(skb, sizeof(struct frag_hdr));
 660         __skb_push(skb, hlen);
 661         skb_reset_network_header(skb);
 662         memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
 663
 664         fh->nexthdr = nexthdr;
 665         fh->reserved = 0;
 666         fh->frag_off = htons(IP6_MF);
 667         fh->identification = frag_id;
 668
 669         first_len = skb_pagelen(skb);
 670         skb->data_len = first_len - skb_headlen(skb);
 671         skb->len = first_len;
 672         ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
 673
 674         return 0;
 675 }
 676 EXPORT_SYMBOL(ip6_fraglist_init);
 677
 678 void ip6_fraglist_prepare(struct sk_buff *skb,
 679                           struct ip6_fraglist_iter *iter)
 680 {
 681         struct sk_buff *frag = iter->frag;
 682         unsigned int hlen = iter->hlen;
 683         struct frag_hdr *fh;
 684
 685         frag->ip_summed = CHECKSUM_NONE;
 686         skb_reset_transport_header(frag);
 687         fh = __skb_push(frag, sizeof(struct frag_hdr));
 688         __skb_push(frag, hlen);
 689         skb_reset_network_header(frag);
 690         memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
 691         iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
 692         fh->nexthdr = iter->nexthdr;
 693         fh->reserved = 0;
 694         fh->frag_off = htons(iter->offset);
 695         if (frag->next)
 696                 fh->frag_off |= htons(IP6_MF);
 697         fh->identification = iter->frag_id;
 698         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 699         ip6_copy_metadata(frag, skb);
 700 }
 701 EXPORT_SYMBOL(ip6_fraglist_prepare);
 702
 703 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
 704                    unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
 705                    u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
 706 {
 707         state->prevhdr = prevhdr;
 708         state->nexthdr = nexthdr;
 709         state->frag_id = frag_id;
 710
 711         state->hlen = hlen;
 712         state->mtu = mtu;
 713
 714         state->left = skb->len - hlen;  /* Space per frame */
 715         state->ptr = hlen;              /* Where to start from */
 716
 717         state->hroom = hdr_room;
 718         state->troom = needed_tailroom;
 719
 720         state->offset = 0;
 721 }
 722 EXPORT_SYMBOL(ip6_frag_init);
 723
 724 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
 725 {
 726         u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
 727         struct sk_buff *frag;
 728         struct frag_hdr *fh;
 729         unsigned int len;
 730
 731         len = state->left;
 732         /* IF: it doesn't fit, use 'mtu' - the data space left */
 733         if (len > state->mtu)
 734                 len = state->mtu;
 735         /* IF: we are not sending up to and including the packet end
 736            then align the next start on an eight byte boundary */
 737         if (len < state->left)
 738                 len &= ~7;
 739
 740         /* Allocate buffer */
 741         frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
 742                          state->hroom + state->troom, GFP_ATOMIC);
 743         if (!frag)
 744                 return ERR_PTR(-ENOMEM);
 745
 746         /*
 747          *      Set up data on packet
 748          */
 749
 750         ip6_copy_metadata(frag, skb);
 751         skb_reserve(frag, state->hroom);
 752         skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
 753         skb_reset_network_header(frag);
 754         fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
 755         frag->transport_header = (frag->network_header + state->hlen +
 756                                   sizeof(struct frag_hdr));
 757
 758         /*
 759          *      Charge the memory for the fragment to any owner
 760          *      it might possess
 761          */
 762         if (skb->sk)
 763                 skb_set_owner_w(frag, skb->sk);
 764
 765         /*
 766          *      Copy the packet header into the new buffer.
 767          */
 768         skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
 769
 770         fragnexthdr_offset = skb_network_header(frag);
 771         fragnexthdr_offset += prevhdr - skb_network_header(skb);
 772         *fragnexthdr_offset = NEXTHDR_FRAGMENT;
 773
 774         /*
 775          *      Build fragment header.
 776          */
 777         fh->nexthdr = state->nexthdr;
 778         fh->reserved = 0;
 779         fh->identification = state->frag_id;
 780
 781         /*
 782          *      Copy a block of the IP datagram.
 783          */
 784         BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
 785                              len));
 786         state->left -= len;
 787
 788         fh->frag_off = htons(state->offset);
 789         if (state->left > 0)
 790                 fh->frag_off |= htons(IP6_MF);
 791         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 792
 793         state->ptr += len;
 794         state->offset += len;
 795
 796         return frag;
 797 }
 798 EXPORT_SYMBOL(ip6_frag_next);
 799
 800 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 801                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 802 {
 803         struct sk_buff *frag;
 804         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 805         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 806                                 inet6_sk(skb->sk) : NULL;
 807         struct ip6_frag_state state;
 808         unsigned int mtu, hlen, nexthdr_offset;
 809         ktime_t tstamp = skb->tstamp;
 810         int hroom, err = 0;
 811         __be32 frag_id;
 812         u8 *prevhdr, nexthdr = 0;
 813
 814         err = ip6_find_1stfragopt(skb, &prevhdr);
 815         if (err < 0)
 816                 goto fail;
 817         hlen = err;
 818         nexthdr = *prevhdr;
 819         nexthdr_offset = prevhdr - skb_network_header(skb);
 820
 821         mtu = ip6_skb_dst_mtu(skb);
 822
 823         /* We must not fragment if the socket is set to force MTU discovery
 824          * or if the skb it not generated by a local socket.
 825          */
 826         if (unlikely(!skb->ignore_df && skb->len > mtu))
 827                 goto fail_toobig;
 828
 829         if (IP6CB(skb)->frag_max_size) {
 830                 if (IP6CB(skb)->frag_max_size > mtu)
 831                         goto fail_toobig;
 832
 833                 /* don't send fragments larger than what we received */
 834                 mtu = IP6CB(skb)->frag_max_size;
 835                 if (mtu < IPV6_MIN_MTU)
 836                         mtu = IPV6_MIN_MTU;
 837         }
 838
 839         if (np && np->frag_size < mtu) {
 840                 if (np->frag_size)
 841                         mtu = np->frag_size;
 842         }
 843         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 844                 goto fail_toobig;
 845         mtu -= hlen + sizeof(struct frag_hdr);
 846
 847         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 848                                     &ipv6_hdr(skb)->saddr);
 849
 850         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 851             (err = skb_checksum_help(skb)))
 852                 goto fail;
 853
 854         prevhdr = skb_network_header(skb) + nexthdr_offset;
 855         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 856         if (skb_has_frag_list(skb)) {
 857                 unsigned int first_len = skb_pagelen(skb);
 858                 struct ip6_fraglist_iter iter;
 859                 struct sk_buff *frag2;
 860
 861                 if (first_len - hlen > mtu ||
 862                     ((first_len - hlen) & 7) ||
 863                     skb_cloned(skb) ||
 864                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 865                         goto slow_path;
 866
 867                 skb_walk_frags(skb, frag) {
 868                         /* Correct geometry. */
 869                         if (frag->len > mtu ||
 870                             ((frag->len & 7) && frag->next) ||
 871                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 872                                 goto slow_path_clean;
 873
 874                         /* Partially cloned skb? */
 875                         if (skb_shared(frag))
 876                                 goto slow_path_clean;
 877
 878                         BUG_ON(frag->sk);
 879                         if (skb->sk) {
 880                                 frag->sk = skb->sk;
 881                                 frag->destructor = sock_wfree;
 882                         }
 883                         skb->truesize -= frag->truesize;
 884                 }
 885
 886                 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
 887                                         &iter);
 888                 if (err < 0)
 889                         goto fail;
 890
 891                 for (;;) {
 892                         /* Prepare header of the next frame,
 893                          * before previous one went down. */
 894                         if (iter.frag)
 895                                 ip6_fraglist_prepare(skb, &iter);
 896
 897                         skb->tstamp = tstamp;
 898                         err = output(net, sk, skb);
 899                         if (!err)
 900                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 901                                               IPSTATS_MIB_FRAGCREATES);
 902
 903                         if (err || !iter.frag)
 904                                 break;
 905
 906                         skb = ip6_fraglist_next(&iter);
 907                 }
 908
 909                 kfree(iter.tmp_hdr);
 910
 911                 if (err == 0) {
 912                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 913                                       IPSTATS_MIB_FRAGOKS);
 914                         return 0;
 915                 }
 916
 917                 kfree_skb_list(iter.frag);
 918
 919                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 920                               IPSTATS_MIB_FRAGFAILS);
 921                 return err;
 922
 923 slow_path_clean:
 924                 skb_walk_frags(skb, frag2) {
 925                         if (frag2 == frag)
 926                                 break;
 927                         frag2->sk = NULL;
 928                         frag2->destructor = NULL;
 929                         skb->truesize += frag2->truesize;
 930                 }
 931         }
 932
 933 slow_path:
 934         /*
 935          *      Fragment the datagram.
 936          */
 937
 938         ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
 939                       LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
 940                       &state);
 941
 942         /*
 943          *      Keep copying data until we run out.
 944          */
 945
 946         while (state.left > 0) {
 947                 frag = ip6_frag_next(skb, &state);
 948                 if (IS_ERR(frag)) {
 949                         err = PTR_ERR(frag);
 950                         goto fail;
 951                 }
 952
 953                 /*
 954                  *      Put this fragment into the sending queue.
 955                  */
 956                 frag->tstamp = tstamp;
 957                 err = output(net, sk, frag);
 958                 if (err)
 959                         goto fail;
 960
 961                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 962                               IPSTATS_MIB_FRAGCREATES);
 963         }
 964         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 965                       IPSTATS_MIB_FRAGOKS);
 966         consume_skb(skb);
 967         return err;
 968
 969 fail_toobig:
 970         if (skb->sk && dst_allfrag(skb_dst(skb)))
 971                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 972
 973         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 974         err = -EMSGSIZE;
 975
 976 fail:
 977         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 978                       IPSTATS_MIB_FRAGFAILS);
 979         kfree_skb(skb);
 980         return err;
 981 }
 982
 983 static inline int ip6_rt_check(const struct rt6key *rt_key,
 984                                const struct in6_addr *fl_addr,
 985                                const struct in6_addr *addr_cache)
 986 {
 987         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 988                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
 989 }
 990
 991 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 992                                           struct dst_entry *dst,
 993                                           const struct flowi6 *fl6)
 994 {
 995         struct ipv6_pinfo *np = inet6_sk(sk);
 996         struct rt6_info *rt;
 997
 998         if (!dst)
 999                 goto out;
1000
1001         if (dst->ops->family != AF_INET6) {
1002                 dst_release(dst);
1003                 return NULL;
1004         }
1005
1006         rt = (struct rt6_info *)dst;
1007         /* Yes, checking route validity in not connected
1008          * case is not very simple. Take into account,
1009          * that we do not support routing by source, TOS,
1010          * and MSG_DONTROUTE            --ANK (980726)
1011          *
1012          * 1. ip6_rt_check(): If route was host route,
1013          *    check that cached destination is current.
1014          *    If it is network route, we still may
1015          *    check its validity using saved pointer
1016          *    to the last used address: daddr_cache.
1017          *    We do not want to save whole address now,
1018          *    (because main consumer of this service
1019          *    is tcp, which has not this problem),
1020          *    so that the last trick works only on connected
1021          *    sockets.
1022          * 2. oif also should be the same.
1023          */
1024         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1025 #ifdef CONFIG_IPV6_SUBTREES
1026             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1027 #endif
1028            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1029               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1030                 dst_release(dst);
1031                 dst = NULL;
1032         }
1033
1034 out:
1035         return dst;
1036 }
1037
1038 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1039                                struct dst_entry **dst, struct flowi6 *fl6)
1040 {
1041 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1042         struct neighbour *n;
1043         struct rt6_info *rt;
1044 #endif
1045         int err;
1046         int flags = 0;
1047
1048         /* The correct way to handle this would be to do
1049          * ip6_route_get_saddr, and then ip6_route_output; however,
1050          * the route-specific preferred source forces the
1051          * ip6_route_output call _before_ ip6_route_get_saddr.
1052          *
1053          * In source specific routing (no src=any default route),
1054          * ip6_route_output will fail given src=any saddr, though, so
1055          * that's why we try it again later.
1056          */
1057         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
1058                 struct fib6_info *from;
1059                 struct rt6_info *rt;
1060                 bool had_dst = *dst != NULL;
1061
1062                 if (!had_dst)
1063                         *dst = ip6_route_output(net, sk, fl6);
1064                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1065
1066                 rcu_read_lock();
1067                 from = rt ? rcu_dereference(rt->from) : NULL;
1068                 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1069                                           sk ? inet6_sk(sk)->srcprefs : 0,
1070                                           &fl6->saddr);
1071                 rcu_read_unlock();
1072
1073                 if (err)
1074                         goto out_err_release;
1075
1076                 /* If we had an erroneous initial result, pretend it
1077                  * never existed and let the SA-enabled version take
1078                  * over.
1079                  */
1080                 if (!had_dst && (*dst)->error) {
1081                         dst_release(*dst);
1082                         *dst = NULL;
1083                 }
1084
1085                 if (fl6->flowi6_oif)
1086                         flags |= RT6_LOOKUP_F_IFACE;
1087         }
1088
1089         if (!*dst)
1090                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1091
1092         err = (*dst)->error;
1093         if (err)
1094                 goto out_err_release;
1095
1096 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1097         /*
1098          * Here if the dst entry we've looked up
1099          * has a neighbour entry that is in the INCOMPLETE
1100          * state and the src address from the flow is
1101          * marked as OPTIMISTIC, we release the found
1102          * dst entry and replace it instead with the
1103          * dst entry of the nexthop router
1104          */
1105         rt = (struct rt6_info *) *dst;
1106         rcu_read_lock_bh();
1107         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1108                                       rt6_nexthop(rt, &fl6->daddr));
1109         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1110         rcu_read_unlock_bh();
1111
1112         if (err) {
1113                 struct inet6_ifaddr *ifp;
1114                 struct flowi6 fl_gw6;
1115                 int redirect;
1116
1117                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1118                                       (*dst)->dev, 1);
1119
1120                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1121                 if (ifp)
1122                         in6_ifa_put(ifp);
1123
1124                 if (redirect) {
1125                         /*
1126                          * We need to get the dst entry for the
1127                          * default router instead
1128                          */
1129                         dst_release(*dst);
1130                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1131                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1132                         *dst = ip6_route_output(net, sk, &fl_gw6);
1133                         err = (*dst)->error;
1134                         if (err)
1135                                 goto out_err_release;
1136                 }
1137         }
1138 #endif
1139         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1140             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1141                 err = -EAFNOSUPPORT;
1142                 goto out_err_release;
1143         }
1144
1145         return 0;
1146
1147 out_err_release:
1148         dst_release(*dst);
1149         *dst = NULL;
1150
1151         if (err == -ENETUNREACH)
1152                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1153         return err;
1154 }
1155
1156 /**
1157  *      ip6_dst_lookup - perform route lookup on flow
1158  *      @net: Network namespace to perform lookup in
1159  *      @sk: socket which provides route info
1160  *      @dst: pointer to dst_entry * for result
1161  *      @fl6: flow to lookup
1162  *
1163  *      This function performs a route lookup on the given flow.
1164  *
1165  *      It returns zero on success, or a standard errno code on error.
1166  */
1167 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1168                    struct flowi6 *fl6)
1169 {
1170         *dst = NULL;
1171         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1172 }
1173 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1174
1175 /**
1176  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1177  *      @net: Network namespace to perform lookup in
1178  *      @sk: socket which provides route info
1179  *      @fl6: flow to lookup
1180  *      @final_dst: final destination address for ipsec lookup
1181  *
1182  *      This function performs a route lookup on the given flow.
1183  *
1184  *      It returns a valid dst pointer on success, or a pointer encoded
1185  *      error code.
1186  */
1187 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1188                                       const struct in6_addr *final_dst)
1189 {
1190         struct dst_entry *dst = NULL;
1191         int err;
1192
1193         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1194         if (err)
1195                 return ERR_PTR(err);
1196         if (final_dst)
1197                 fl6->daddr = *final_dst;
1198
1199         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1200 }
1201 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1202
1203 /**
1204  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1205  *      @sk: socket which provides the dst cache and route info
1206  *      @fl6: flow to lookup
1207  *      @final_dst: final destination address for ipsec lookup
1208  *      @connected: whether @sk is connected or not
1209  *
1210  *      This function performs a route lookup on the given flow with the
1211  *      possibility of using the cached route in the socket if it is valid.
1212  *      It will take the socket dst lock when operating on the dst cache.
1213  *      As a result, this function can only be used in process context.
1214  *
1215  *      In addition, for a connected socket, cache the dst in the socket
1216  *      if the current cache is not valid.
1217  *
1218  *      It returns a valid dst pointer on success, or a pointer encoded
1219  *      error code.
1220  */
1221 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1222                                          const struct in6_addr *final_dst,
1223                                          bool connected)
1224 {
1225         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1226
1227         dst = ip6_sk_dst_check(sk, dst, fl6);
1228         if (dst)
1229                 return dst;
1230
1231         dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1232         if (connected && !IS_ERR(dst))
1233                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1234
1235         return dst;
1236 }
1237 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1238
1239 /**
1240  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1241  *      @skb: Packet for which lookup is done
1242  *      @dev: Tunnel device
1243  *      @net: Network namespace of tunnel device
1244  *      @sock: Socket which provides route info
1245  *      @saddr: Memory to store the src ip address
1246  *      @info: Tunnel information
1247  *      @protocol: IP protocol
1248  *      @use_cache: Flag to enable cache usage
1249  *      This function performs a route lookup on a tunnel
1250  *
1251  *      It returns a valid dst pointer and stores src address to be used in
1252  *      tunnel in param saddr on success, else a pointer encoded error code.
1253  */
1254
1255 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1256                                         struct net_device *dev,
1257                                         struct net *net,
1258                                         struct socket *sock,
1259                                         struct in6_addr *saddr,
1260                                         const struct ip_tunnel_info *info,
1261                                         u8 protocol,
1262                                         bool use_cache)
1263 {
1264         struct dst_entry *dst = NULL;
1265 #ifdef CONFIG_DST_CACHE
1266         struct dst_cache *dst_cache;
1267 #endif
1268         struct flowi6 fl6;
1269         __u8 prio;
1270
1271 #ifdef CONFIG_DST_CACHE
1272         dst_cache = (struct dst_cache *)&info->dst_cache;
1273         if (use_cache) {
1274                 dst = dst_cache_get_ip6(dst_cache, saddr);
1275                 if (dst)
1276                         return dst;
1277         }
1278 #endif
1279         memset(&fl6, 0, sizeof(fl6));
1280         fl6.flowi6_mark = skb->mark;
1281         fl6.flowi6_proto = protocol;
1282         fl6.daddr = info->key.u.ipv6.dst;
1283         fl6.saddr = info->key.u.ipv6.src;
1284         prio = info->key.tos;
1285         fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
1286                                           info->key.label);
1287
1288         dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1289                                               NULL);
1290         if (IS_ERR(dst)) {
1291                 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1292                 return ERR_PTR(-ENETUNREACH);
1293         }
1294         if (dst->dev == dev) { /* is this necessary? */
1295                 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1296                 dst_release(dst);
1297                 return ERR_PTR(-ELOOP);
1298         }
1299 #ifdef CONFIG_DST_CACHE
1300         if (use_cache)
1301                 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1302 #endif
1303         *saddr = fl6.saddr;
1304         return dst;
1305 }
1306 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1307
1308 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1309                                                gfp_t gfp)
1310 {
1311         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1312 }
1313
1314 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1315                                                 gfp_t gfp)
1316 {
1317         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1318 }
1319
1320 static void ip6_append_data_mtu(unsigned int *mtu,
1321                                 int *maxfraglen,
1322                                 unsigned int fragheaderlen,
1323                                 struct sk_buff *skb,
1324                                 struct rt6_info *rt,
1325                                 unsigned int orig_mtu)
1326 {
1327         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1328                 if (!skb) {
1329                         /* first fragment, reserve header_len */
1330                         *mtu = orig_mtu - rt->dst.header_len;
1331
1332                 } else {
1333                         /*
1334                          * this fragment is not first, the headers
1335                          * space is regarded as data space.
1336                          */
1337                         *mtu = orig_mtu;
1338                 }
1339                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1340                               + fragheaderlen - sizeof(struct frag_hdr);
1341         }
1342 }
1343
1344 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1345                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1346                           struct rt6_info *rt, struct flowi6 *fl6)
1347 {
1348         struct ipv6_pinfo *np = inet6_sk(sk);
1349         unsigned int mtu;
1350         struct ipv6_txoptions *opt = ipc6->opt;
1351
1352         /*
1353          * setup for corking
1354          */
1355         if (opt) {
1356                 if (WARN_ON(v6_cork->opt))
1357                         return -EINVAL;
1358
1359                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1360                 if (unlikely(!v6_cork->opt))
1361                         return -ENOBUFS;
1362
1363                 v6_cork->opt->tot_len = sizeof(*opt);
1364                 v6_cork->opt->opt_flen = opt->opt_flen;
1365                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1366
1367                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1368                                                     sk->sk_allocation);
1369                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1370                         return -ENOBUFS;
1371
1372                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1373                                                     sk->sk_allocation);
1374                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1375                         return -ENOBUFS;
1376
1377                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1378                                                    sk->sk_allocation);
1379                 if (opt->hopopt && !v6_cork->opt->hopopt)
1380                         return -ENOBUFS;
1381
1382                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1383                                                     sk->sk_allocation);
1384                 if (opt->srcrt && !v6_cork->opt->srcrt)
1385                         return -ENOBUFS;
1386
1387                 /* need source address above miyazawa*/
1388         }
1389         dst_hold(&rt->dst);
1390         cork->base.dst = &rt->dst;
1391         cork->fl.u.ip6 = *fl6;
1392         v6_cork->hop_limit = ipc6->hlimit;
1393         v6_cork->tclass = ipc6->tclass;
1394         if (rt->dst.flags & DST_XFRM_TUNNEL)
1395                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1396                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1397         else
1398                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1399                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1400         if (np->frag_size < mtu) {
1401                 if (np->frag_size)
1402                         mtu = np->frag_size;
1403         }
1404         if (mtu < IPV6_MIN_MTU)
1405                 return -EINVAL;
1406         cork->base.fragsize = mtu;
1407         cork->base.gso_size = ipc6->gso_size;
1408         cork->base.tx_flags = 0;
1409         cork->base.mark = ipc6->sockc.mark;
1410         sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1411
1412         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1413                 cork->base.flags |= IPCORK_ALLFRAG;
1414         cork->base.length = 0;
1415
1416         cork->base.transmit_time = ipc6->sockc.transmit_time;
1417
1418         return 0;
1419 }
1420
1421 static int __ip6_append_data(struct sock *sk,
1422                              struct flowi6 *fl6,
1423                              struct sk_buff_head *queue,
1424                              struct inet_cork *cork,
1425                              struct inet6_cork *v6_cork,
1426                              struct page_frag *pfrag,
1427                              int getfrag(void *from, char *to, int offset,
1428                                          int len, int odd, struct sk_buff *skb),
1429                              void *from, int length, int transhdrlen,
1430                              unsigned int flags, struct ipcm6_cookie *ipc6)
1431 {
1432         struct sk_buff *skb, *skb_prev = NULL;
1433         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1434         struct ubuf_info *uarg = NULL;
1435         int exthdrlen = 0;
1436         int dst_exthdrlen = 0;
1437         int hh_len;
1438         int copy;
1439         int err;
1440         int offset = 0;
1441         u32 tskey = 0;
1442         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1443         struct ipv6_txoptions *opt = v6_cork->opt;
1444         int csummode = CHECKSUM_NONE;
1445         unsigned int maxnonfragsize, headersize;
1446         unsigned int wmem_alloc_delta = 0;
1447         bool paged, extra_uref = false;
1448
1449         skb = skb_peek_tail(queue);
1450         if (!skb) {
1451                 exthdrlen = opt ? opt->opt_flen : 0;
1452                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1453         }
1454
1455         paged = !!cork->gso_size;
1456         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1457         orig_mtu = mtu;
1458
1459         if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1460             sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1461                 tskey = sk->sk_tskey++;
1462
1463         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1464
1465         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1466                         (opt ? opt->opt_nflen : 0);
1467         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1468                      sizeof(struct frag_hdr);
1469
1470         headersize = sizeof(struct ipv6hdr) +
1471                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1472                      (dst_allfrag(&rt->dst) ?
1473                       sizeof(struct frag_hdr) : 0) +
1474                      rt->rt6i_nfheader_len;
1475
1476         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1477          * the first fragment
1478          */
1479         if (headersize + transhdrlen > mtu)
1480                 goto emsgsize;
1481
1482         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1483             (sk->sk_protocol == IPPROTO_UDP ||
1484              sk->sk_protocol == IPPROTO_RAW)) {
1485                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1486                                 sizeof(struct ipv6hdr));
1487                 goto emsgsize;
1488         }
1489
1490         if (ip6_sk_ignore_df(sk))
1491                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1492         else
1493                 maxnonfragsize = mtu;
1494
1495         if (cork->length + length > maxnonfragsize - headersize) {
1496 emsgsize:
1497                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1498                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1499                 return -EMSGSIZE;
1500         }
1501
1502         /* CHECKSUM_PARTIAL only with no extension headers and when
1503          * we are not going to fragment
1504          */
1505         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1506             headersize == sizeof(struct ipv6hdr) &&
1507             length <= mtu - headersize &&
1508             (!(flags & MSG_MORE) || cork->gso_size) &&
1509             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1510                 csummode = CHECKSUM_PARTIAL;
1511
1512         if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1513                 uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1514                 if (!uarg)
1515                         return -ENOBUFS;
1516                 extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1517                 if (rt->dst.dev->features & NETIF_F_SG &&
1518                     csummode == CHECKSUM_PARTIAL) {
1519                         paged = true;
1520                 } else {
1521                         uarg->zerocopy = 0;
1522                         skb_zcopy_set(skb, uarg, &extra_uref);
1523                 }
1524         }
1525
1526         /*
1527          * Let's try using as much space as possible.
1528          * Use MTU if total length of the message fits into the MTU.
1529          * Otherwise, we need to reserve fragment header and
1530          * fragment alignment (= 8-15 octects, in total).
1531          *
1532          * Note that we may need to "move" the data from the tail
1533          * of the buffer to the new fragment when we split
1534          * the message.
1535          *
1536          * FIXME: It may be fragmented into multiple chunks
1537          *        at once if non-fragmentable extension headers
1538          *        are too large.
1539          * --yoshfuji
1540          */
1541
1542         cork->length += length;
1543         if (!skb)
1544                 goto alloc_new_skb;
1545
1546         while (length > 0) {
1547                 /* Check if the remaining data fits into current packet. */
1548                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1549                 if (copy < length)
1550                         copy = maxfraglen - skb->len;
1551
1552                 if (copy <= 0) {
1553                         char *data;
1554                         unsigned int datalen;
1555                         unsigned int fraglen;
1556                         unsigned int fraggap;
1557                         unsigned int alloclen;
1558                         unsigned int pagedlen;
1559 alloc_new_skb:
1560                         /* There's no room in the current skb */
1561                         if (skb)
1562                                 fraggap = skb->len - maxfraglen;
1563                         else
1564                                 fraggap = 0;
1565                         /* update mtu and maxfraglen if necessary */
1566                         if (!skb || !skb_prev)
1567                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1568                                                     fragheaderlen, skb, rt,
1569                                                     orig_mtu);
1570
1571                         skb_prev = skb;
1572
1573                         /*
1574                          * If remaining data exceeds the mtu,
1575                          * we know we need more fragment(s).
1576                          */
1577                         datalen = length + fraggap;
1578
1579                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1580                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1581                         fraglen = datalen + fragheaderlen;
1582                         pagedlen = 0;
1583
1584                         if ((flags & MSG_MORE) &&
1585                             !(rt->dst.dev->features&NETIF_F_SG))
1586                                 alloclen = mtu;
1587                         else if (!paged)
1588                                 alloclen = fraglen;
1589                         else {
1590                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1591                                 pagedlen = fraglen - alloclen;
1592                         }
1593
1594                         alloclen += dst_exthdrlen;
1595
1596                         if (datalen != length + fraggap) {
1597                                 /*
1598                                  * this is not the last fragment, the trailer
1599                                  * space is regarded as data space.
1600                                  */
1601                                 datalen += rt->dst.trailer_len;
1602                         }
1603
1604                         alloclen += rt->dst.trailer_len;
1605                         fraglen = datalen + fragheaderlen;
1606
1607                         /*
1608                          * We just reserve space for fragment header.
1609                          * Note: this may be overallocation if the message
1610                          * (without MSG_MORE) fits into the MTU.
1611                          */
1612                         alloclen += sizeof(struct frag_hdr);
1613
1614                         copy = datalen - transhdrlen - fraggap - pagedlen;
1615                         if (copy < 0) {
1616                                 err = -EINVAL;
1617                                 goto error;
1618                         }
1619                         if (transhdrlen) {
1620                                 skb = sock_alloc_send_skb(sk,
1621                                                 alloclen + hh_len,
1622                                                 (flags & MSG_DONTWAIT), &err);
1623                         } else {
1624                                 skb = NULL;
1625                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1626                                     2 * sk->sk_sndbuf)
1627                                         skb = alloc_skb(alloclen + hh_len,
1628                                                         sk->sk_allocation);
1629                                 if (unlikely(!skb))
1630                                         err = -ENOBUFS;
1631                         }
1632                         if (!skb)
1633                                 goto error;
1634                         /*
1635                          *      Fill in the control structures
1636                          */
1637                         skb->protocol = htons(ETH_P_IPV6);
1638                         skb->ip_summed = csummode;
1639                         skb->csum = 0;
1640                         /* reserve for fragmentation and ipsec header */
1641                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1642                                     dst_exthdrlen);
1643
1644                         /*
1645                          *      Find where to start putting bytes
1646                          */
1647                         data = skb_put(skb, fraglen - pagedlen);
1648                         skb_set_network_header(skb, exthdrlen);
1649                         data += fragheaderlen;
1650                         skb->transport_header = (skb->network_header +
1651                                                  fragheaderlen);
1652                         if (fraggap) {
1653                                 skb->csum = skb_copy_and_csum_bits(
1654                                         skb_prev, maxfraglen,
1655                                         data + transhdrlen, fraggap);
1656                                 skb_prev->csum = csum_sub(skb_prev->csum,
1657                                                           skb->csum);
1658                                 data += fraggap;
1659                                 pskb_trim_unique(skb_prev, maxfraglen);
1660                         }
1661                         if (copy > 0 &&
1662                             getfrag(from, data + transhdrlen, offset,
1663                                     copy, fraggap, skb) < 0) {
1664                                 err = -EFAULT;
1665                                 kfree_skb(skb);
1666                                 goto error;
1667                         }
1668
1669                         offset += copy;
1670                         length -= copy + transhdrlen;
1671                         transhdrlen = 0;
1672                         exthdrlen = 0;
1673                         dst_exthdrlen = 0;
1674
1675                         /* Only the initial fragment is time stamped */
1676                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1677                         cork->tx_flags = 0;
1678                         skb_shinfo(skb)->tskey = tskey;
1679                         tskey = 0;
1680                         skb_zcopy_set(skb, uarg, &extra_uref);
1681
1682                         if ((flags & MSG_CONFIRM) && !skb_prev)
1683                                 skb_set_dst_pending_confirm(skb, 1);
1684
1685                         /*
1686                          * Put the packet on the pending queue
1687                          */
1688                         if (!skb->destructor) {
1689                                 skb->destructor = sock_wfree;
1690                                 skb->sk = sk;
1691                                 wmem_alloc_delta += skb->truesize;
1692                         }
1693                         __skb_queue_tail(queue, skb);
1694                         continue;
1695                 }
1696
1697                 if (copy > length)
1698                         copy = length;
1699
1700                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1701                     skb_tailroom(skb) >= copy) {
1702                         unsigned int off;
1703
1704                         off = skb->len;
1705                         if (getfrag(from, skb_put(skb, copy),
1706                                                 offset, copy, off, skb) < 0) {
1707                                 __skb_trim(skb, off);
1708                                 err = -EFAULT;
1709                                 goto error;
1710                         }
1711                 } else if (!uarg || !uarg->zerocopy) {
1712                         int i = skb_shinfo(skb)->nr_frags;
1713
1714                         err = -ENOMEM;
1715                         if (!sk_page_frag_refill(sk, pfrag))
1716                                 goto error;
1717
1718                         if (!skb_can_coalesce(skb, i, pfrag->page,
1719                                               pfrag->offset)) {
1720                                 err = -EMSGSIZE;
1721                                 if (i == MAX_SKB_FRAGS)
1722                                         goto error;
1723
1724                                 __skb_fill_page_desc(skb, i, pfrag->page,
1725                                                      pfrag->offset, 0);
1726                                 skb_shinfo(skb)->nr_frags = ++i;
1727                                 get_page(pfrag->page);
1728                         }
1729                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1730                         if (getfrag(from,
1731                                     page_address(pfrag->page) + pfrag->offset,
1732                                     offset, copy, skb->len, skb) < 0)
1733                                 goto error_efault;
1734
1735                         pfrag->offset += copy;
1736                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1737                         skb->len += copy;
1738                         skb->data_len += copy;
1739                         skb->truesize += copy;
1740                         wmem_alloc_delta += copy;
1741                 } else {
1742                         err = skb_zerocopy_iter_dgram(skb, from, copy);
1743                         if (err < 0)
1744                                 goto error;
1745                 }
1746                 offset += copy;
1747                 length -= copy;
1748         }
1749
1750         if (wmem_alloc_delta)
1751                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1752         return 0;
1753
1754 error_efault:
1755         err = -EFAULT;
1756 error:
1757         if (uarg)
1758                 sock_zerocopy_put_abort(uarg, extra_uref);
1759         cork->length -= length;
1760         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1761         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1762         return err;
1763 }
1764
1765 int ip6_append_data(struct sock *sk,
1766                     int getfrag(void *from, char *to, int offset, int len,
1767                                 int odd, struct sk_buff *skb),
1768                     void *from, int length, int transhdrlen,
1769                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1770                     struct rt6_info *rt, unsigned int flags)
1771 {
1772         struct inet_sock *inet = inet_sk(sk);
1773         struct ipv6_pinfo *np = inet6_sk(sk);
1774         int exthdrlen;
1775         int err;
1776
1777         if (flags&MSG_PROBE)
1778                 return 0;
1779         if (skb_queue_empty(&sk->sk_write_queue)) {
1780                 /*
1781                  * setup for corking
1782                  */
1783                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1784                                      ipc6, rt, fl6);
1785                 if (err)
1786                         return err;
1787
1788                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1789                 length += exthdrlen;
1790                 transhdrlen += exthdrlen;
1791         } else {
1792                 fl6 = &inet->cork.fl.u.ip6;
1793                 transhdrlen = 0;
1794         }
1795
1796         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1797                                  &np->cork, sk_page_frag(sk), getfrag,
1798                                  from, length, transhdrlen, flags, ipc6);
1799 }
1800 EXPORT_SYMBOL_GPL(ip6_append_data);
1801
1802 static void ip6_cork_release(struct inet_cork_full *cork,
1803                              struct inet6_cork *v6_cork)
1804 {
1805         if (v6_cork->opt) {
1806                 kfree(v6_cork->opt->dst0opt);
1807                 kfree(v6_cork->opt->dst1opt);
1808                 kfree(v6_cork->opt->hopopt);
1809                 kfree(v6_cork->opt->srcrt);
1810                 kfree(v6_cork->opt);
1811                 v6_cork->opt = NULL;
1812         }
1813
1814         if (cork->base.dst) {
1815                 dst_release(cork->base.dst);
1816                 cork->base.dst = NULL;
1817                 cork->base.flags &= ~IPCORK_ALLFRAG;
1818         }
1819         memset(&cork->fl, 0, sizeof(cork->fl));
1820 }
1821
1822 struct sk_buff *__ip6_make_skb(struct sock *sk,
1823                                struct sk_buff_head *queue,
1824                                struct inet_cork_full *cork,
1825                                struct inet6_cork *v6_cork)
1826 {
1827         struct sk_buff *skb, *tmp_skb;
1828         struct sk_buff **tail_skb;
1829         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1830         struct ipv6_pinfo *np = inet6_sk(sk);
1831         struct net *net = sock_net(sk);
1832         struct ipv6hdr *hdr;
1833         struct ipv6_txoptions *opt = v6_cork->opt;
1834         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1835         struct flowi6 *fl6 = &cork->fl.u.ip6;
1836         unsigned char proto = fl6->flowi6_proto;
1837
1838         skb = __skb_dequeue(queue);
1839         if (!skb)
1840                 goto out;
1841         tail_skb = &(skb_shinfo(skb)->frag_list);
1842
1843         /* move skb->data to ip header from ext header */
1844         if (skb->data < skb_network_header(skb))
1845                 __skb_pull(skb, skb_network_offset(skb));
1846         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1847                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1848                 *tail_skb = tmp_skb;
1849                 tail_skb = &(tmp_skb->next);
1850                 skb->len += tmp_skb->len;
1851                 skb->data_len += tmp_skb->len;
1852                 skb->truesize += tmp_skb->truesize;
1853                 tmp_skb->destructor = NULL;
1854                 tmp_skb->sk = NULL;
1855         }
1856
1857         /* Allow local fragmentation. */
1858         skb->ignore_df = ip6_sk_ignore_df(sk);
1859
1860         *final_dst = fl6->daddr;
1861         __skb_pull(skb, skb_network_header_len(skb));
1862         if (opt && opt->opt_flen)
1863                 ipv6_push_frag_opts(skb, opt, &proto);
1864         if (opt && opt->opt_nflen)
1865                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1866
1867         skb_push(skb, sizeof(struct ipv6hdr));
1868         skb_reset_network_header(skb);
1869         hdr = ipv6_hdr(skb);
1870
1871         ip6_flow_hdr(hdr, v6_cork->tclass,
1872                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1873                                         ip6_autoflowlabel(net, np), fl6));
1874         hdr->hop_limit = v6_cork->hop_limit;
1875         hdr->nexthdr = proto;
1876         hdr->saddr = fl6->saddr;
1877         hdr->daddr = *final_dst;
1878
1879         skb->priority = sk->sk_priority;
1880         skb->mark = cork->base.mark;
1881
1882         skb->tstamp = cork->base.transmit_time;
1883
1884         skb_dst_set(skb, dst_clone(&rt->dst));
1885         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1886         if (proto == IPPROTO_ICMPV6) {
1887                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1888
1889                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1890                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1891         }
1892
1893         ip6_cork_release(cork, v6_cork);
1894 out:
1895         return skb;
1896 }
1897
1898 int ip6_send_skb(struct sk_buff *skb)
1899 {
1900         struct net *net = sock_net(skb->sk);
1901         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1902         int err;
1903
1904         err = ip6_local_out(net, skb->sk, skb);
1905         if (err) {
1906                 if (err > 0)
1907                         err = net_xmit_errno(err);
1908                 if (err)
1909                         IP6_INC_STATS(net, rt->rt6i_idev,
1910                                       IPSTATS_MIB_OUTDISCARDS);
1911         }
1912
1913         return err;
1914 }
1915
1916 int ip6_push_pending_frames(struct sock *sk)
1917 {
1918         struct sk_buff *skb;
1919
1920         skb = ip6_finish_skb(sk);
1921         if (!skb)
1922                 return 0;
1923
1924         return ip6_send_skb(skb);
1925 }
1926 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1927
1928 static void __ip6_flush_pending_frames(struct sock *sk,
1929                                        struct sk_buff_head *queue,
1930                                        struct inet_cork_full *cork,
1931                                        struct inet6_cork *v6_cork)
1932 {
1933         struct sk_buff *skb;
1934
1935         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1936                 if (skb_dst(skb))
1937                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1938                                       IPSTATS_MIB_OUTDISCARDS);
1939                 kfree_skb(skb);
1940         }
1941
1942         ip6_cork_release(cork, v6_cork);
1943 }
1944
1945 void ip6_flush_pending_frames(struct sock *sk)
1946 {
1947         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1948                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1949 }
1950 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1951
1952 struct sk_buff *ip6_make_skb(struct sock *sk,
1953                              int getfrag(void *from, char *to, int offset,
1954                                          int len, int odd, struct sk_buff *skb),
1955                              void *from, int length, int transhdrlen,
1956                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1957                              struct rt6_info *rt, unsigned int flags,
1958                              struct inet_cork_full *cork)
1959 {
1960         struct inet6_cork v6_cork;
1961         struct sk_buff_head queue;
1962         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1963         int err;
1964
1965         if (flags & MSG_PROBE)
1966                 return NULL;
1967
1968         __skb_queue_head_init(&queue);
1969
1970         cork->base.flags = 0;
1971         cork->base.addr = 0;
1972         cork->base.opt = NULL;
1973         cork->base.dst = NULL;
1974         v6_cork.opt = NULL;
1975         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1976         if (err) {
1977                 ip6_cork_release(cork, &v6_cork);
1978                 return ERR_PTR(err);
1979         }
1980         if (ipc6->dontfrag < 0)
1981                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1982
1983         err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1984                                 &current->task_frag, getfrag, from,
1985                                 length + exthdrlen, transhdrlen + exthdrlen,
1986                                 flags, ipc6);
1987         if (err) {
1988                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1989                 return ERR_PTR(err);
1990         }
1991
1992         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1993 }