net/ipv6/ip6_output.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  *      IPv6 output functions
   4  *      Linux INET6 implementation
   5  *
   6  *      Authors:
   7  *      Pedro Roque             <roque@di.fc.ul.pt>
   8  *
   9  *      Based on linux/net/ipv4/ip_output.c
  10  *
  11  *      Changes:
  12  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
  13  *                              extension headers are implemented.
  14  *                              route changes now work.
  15  *                              ip6_forward does not confuse sniffers.
  16  *                              etc.
  17  *
  18  *      H. von Brand    :       Added missing #include <linux/string.h>
  19  *      Imran Patel     :       frag id should be in NBO
  20  *      Kazunori MIYAZAWA @USAGI
  21  *                      :       add ip6_append_data and related functions
  22  *                              for datagram xmit
  23  */
  24
  25 #include <linux/errno.h>
  26 #include <linux/kernel.h>
  27 #include <linux/string.h>
  28 #include <linux/socket.h>
  29 #include <linux/net.h>
  30 #include <linux/netdevice.h>
  31 #include <linux/if_arp.h>
  32 #include <linux/in6.h>
  33 #include <linux/tcp.h>
  34 #include <linux/route.h>
  35 #include <linux/module.h>
  36 #include <linux/slab.h>
  37
  38 #include <linux/bpf-cgroup.h>
  39 #include <linux/netfilter.h>
  40 #include <linux/netfilter_ipv6.h>
  41
  42 #include <net/sock.h>
  43 #include <net/snmp.h>
  44
  45 #include <net/ipv6.h>
  46 #include <net/ndisc.h>
  47 #include <net/protocol.h>
  48 #include <net/ip6_route.h>
  49 #include <net/addrconf.h>
  50 #include <net/rawv6.h>
  51 #include <net/icmp.h>
  52 #include <net/xfrm.h>
  53 #include <net/checksum.h>
  54 #include <linux/mroute6.h>
  55 #include <net/l3mdev.h>
  56 #include <net/lwtunnel.h>
  57 #include <net/ip_tunnels.h>
  58
  59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  60 {
  61         struct dst_entry *dst = skb_dst(skb);
  62         struct net_device *dev = dst->dev;
  63         const struct in6_addr *nexthop;
  64         struct neighbour *neigh;
  65         int ret;
  66
  67         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  68                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  69
  70                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  71                     ((mroute6_is_socket(net, skb) &&
  72                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  73                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
  74                                          &ipv6_hdr(skb)->saddr))) {
  75                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  76
  77                         /* Do not check for IFF_ALLMULTI; multicast routing
  78                            is not supported in any case.
  79                          */
  80                         if (newskb)
  81                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  82                                         net, sk, newskb, NULL, newskb->dev,
  83                                         dev_loopback_xmit);
  84
  85                         if (ipv6_hdr(skb)->hop_limit == 0) {
  86                                 IP6_INC_STATS(net, idev,
  87                                               IPSTATS_MIB_OUTDISCARDS);
  88                                 kfree_skb(skb);
  89                                 return 0;
  90                         }
  91                 }
  92
  93                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
  94
  95                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
  96                     IPV6_ADDR_SCOPE_NODELOCAL &&
  97                     !(dev->flags & IFF_LOOPBACK)) {
  98                         kfree_skb(skb);
  99                         return 0;
 100                 }
 101         }
 102
 103         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 104                 int res = lwtunnel_xmit(skb);
 105
 106                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 107                         return res;
 108         }
 109
 110         rcu_read_lock_bh();
 111         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 112         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 113         if (unlikely(!neigh))
 114                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 115         if (!IS_ERR(neigh)) {
 116                 sock_confirm_neigh(skb, neigh);
 117                 ret = neigh_output(neigh, skb, false);
 118                 rcu_read_unlock_bh();
 119                 return ret;
 120         }
 121         rcu_read_unlock_bh();
 122
 123         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 124         kfree_skb(skb);
 125         return -EINVAL;
 126 }
 127
 128 static int
 129 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
 130                                     struct sk_buff *skb, unsigned int mtu)
 131 {
 132         struct sk_buff *segs, *nskb;
 133         netdev_features_t features;
 134         int ret = 0;
 135
 136         /* Please see corresponding comment in ip_finish_output_gso
 137          * describing the cases where GSO segment length exceeds the
 138          * egress MTU.
 139          */
 140         features = netif_skb_features(skb);
 141         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 142         if (IS_ERR_OR_NULL(segs)) {
 143                 kfree_skb(skb);
 144                 return -ENOMEM;
 145         }
 146
 147         consume_skb(skb);
 148
 149         skb_list_walk_safe(segs, segs, nskb) {
 150                 int err;
 151
 152                 skb_mark_not_on_list(segs);
 153                 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
 154                 if (err && ret == 0)
 155                         ret = err;
 156         }
 157
 158         return ret;
 159 }
 160
 161 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 162 {
 163         unsigned int mtu;
 164
 165 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 166         /* Policy lookup after SNAT yielded a new policy */
 167         if (skb_dst(skb)->xfrm) {
 168                 IPCB(skb)->flags |= IPSKB_REROUTED;
 169                 return dst_output(net, sk, skb);
 170         }
 171 #endif
 172
 173         mtu = ip6_skb_dst_mtu(skb);
 174         if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
 175                 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
 176
 177         if ((skb->len > mtu && !skb_is_gso(skb)) ||
 178             dst_allfrag(skb_dst(skb)) ||
 179             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 180                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 181         else
 182                 return ip6_finish_output2(net, sk, skb);
 183 }
 184
 185 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 186 {
 187         int ret;
 188
 189         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 190         switch (ret) {
 191         case NET_XMIT_SUCCESS:
 192                 return __ip6_finish_output(net, sk, skb);
 193         case NET_XMIT_CN:
 194                 return __ip6_finish_output(net, sk, skb) ? : ret;
 195         default:
 196                 kfree_skb(skb);
 197                 return ret;
 198         }
 199 }
 200
 201 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 202 {
 203         struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
 204         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 205
 206         skb->protocol = htons(ETH_P_IPV6);
 207         skb->dev = dev;
 208
 209         if (unlikely(idev->cnf.disable_ipv6)) {
 210                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 211                 kfree_skb(skb);
 212                 return 0;
 213         }
 214
 215         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 216                             net, sk, skb, indev, dev,
 217                             ip6_finish_output,
 218                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 219 }
 220 EXPORT_SYMBOL(ip6_output);
 221
 222 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
 223 {
 224         if (!np->autoflowlabel_set)
 225                 return ip6_default_np_autolabel(net);
 226         else
 227                 return np->autoflowlabel;
 228 }
 229
 230 /*
 231  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 232  * Note : socket lock is not held for SYNACK packets, but might be modified
 233  * by calls to skb_set_owner_w() and ipv6_local_error(),
 234  * which are using proper atomic operations or spinlocks.
 235  */
 236 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 237              __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
 238 {
 239         struct net *net = sock_net(sk);
 240         const struct ipv6_pinfo *np = inet6_sk(sk);
 241         struct in6_addr *first_hop = &fl6->daddr;
 242         struct dst_entry *dst = skb_dst(skb);
 243         unsigned int head_room;
 244         struct ipv6hdr *hdr;
 245         u8  proto = fl6->flowi6_proto;
 246         int seg_len = skb->len;
 247         int hlimit = -1;
 248         u32 mtu;
 249
 250         head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 251         if (opt)
 252                 head_room += opt->opt_nflen + opt->opt_flen;
 253
 254         if (unlikely(skb_headroom(skb) < head_room)) {
 255                 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 256                 if (!skb2) {
 257                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 258                                       IPSTATS_MIB_OUTDISCARDS);
 259                         kfree_skb(skb);
 260                         return -ENOBUFS;
 261                 }
 262                 if (skb->sk)
 263                         skb_set_owner_w(skb2, skb->sk);
 264                 consume_skb(skb);
 265                 skb = skb2;
 266         }
 267
 268         if (opt) {
 269                 seg_len += opt->opt_nflen + opt->opt_flen;
 270
 271                 if (opt->opt_flen)
 272                         ipv6_push_frag_opts(skb, opt, &proto);
 273
 274                 if (opt->opt_nflen)
 275                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 276                                              &fl6->saddr);
 277         }
 278
 279         skb_push(skb, sizeof(struct ipv6hdr));
 280         skb_reset_network_header(skb);
 281         hdr = ipv6_hdr(skb);
 282
 283         /*
 284          *      Fill in the IPv6 header
 285          */
 286         if (np)
 287                 hlimit = np->hop_limit;
 288         if (hlimit < 0)
 289                 hlimit = ip6_dst_hoplimit(dst);
 290
 291         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 292                                 ip6_autoflowlabel(net, np), fl6));
 293
 294         hdr->payload_len = htons(seg_len);
 295         hdr->nexthdr = proto;
 296         hdr->hop_limit = hlimit;
 297
 298         hdr->saddr = fl6->saddr;
 299         hdr->daddr = *first_hop;
 300
 301         skb->protocol = htons(ETH_P_IPV6);
 302         skb->priority = priority;
 303         skb->mark = mark;
 304
 305         mtu = dst_mtu(dst);
 306         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 307                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 308                               IPSTATS_MIB_OUT, skb->len);
 309
 310                 /* if egress device is enslaved to an L3 master device pass the
 311                  * skb to its handler for processing
 312                  */
 313                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 314                 if (unlikely(!skb))
 315                         return 0;
 316
 317                 /* hooks should never assume socket lock is held.
 318                  * we promote our socket to non const
 319                  */
 320                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 321                                net, (struct sock *)sk, skb, NULL, dst->dev,
 322                                dst_output);
 323         }
 324
 325         skb->dev = dst->dev;
 326         /* ipv6_local_error() does not require socket lock,
 327          * we promote our socket to non const
 328          */
 329         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 330
 331         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 332         kfree_skb(skb);
 333         return -EMSGSIZE;
 334 }
 335 EXPORT_SYMBOL(ip6_xmit);
 336
 337 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 338 {
 339         struct ip6_ra_chain *ra;
 340         struct sock *last = NULL;
 341
 342         read_lock(&ip6_ra_lock);
 343         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 344                 struct sock *sk = ra->sk;
 345                 if (sk && ra->sel == sel &&
 346                     (!sk->sk_bound_dev_if ||
 347                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 348                         struct ipv6_pinfo *np = inet6_sk(sk);
 349
 350                         if (np && np->rtalert_isolate &&
 351                             !net_eq(sock_net(sk), dev_net(skb->dev))) {
 352                                 continue;
 353                         }
 354                         if (last) {
 355                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 356                                 if (skb2)
 357                                         rawv6_rcv(last, skb2);
 358                         }
 359                         last = sk;
 360                 }
 361         }
 362
 363         if (last) {
 364                 rawv6_rcv(last, skb);
 365                 read_unlock(&ip6_ra_lock);
 366                 return 1;
 367         }
 368         read_unlock(&ip6_ra_lock);
 369         return 0;
 370 }
 371
 372 static int ip6_forward_proxy_check(struct sk_buff *skb)
 373 {
 374         struct ipv6hdr *hdr = ipv6_hdr(skb);
 375         u8 nexthdr = hdr->nexthdr;
 376         __be16 frag_off;
 377         int offset;
 378
 379         if (ipv6_ext_hdr(nexthdr)) {
 380                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 381                 if (offset < 0)
 382                         return 0;
 383         } else
 384                 offset = sizeof(struct ipv6hdr);
 385
 386         if (nexthdr == IPPROTO_ICMPV6) {
 387                 struct icmp6hdr *icmp6;
 388
 389                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 390                                          offset + 1 - skb->data)))
 391                         return 0;
 392
 393                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 394
 395                 switch (icmp6->icmp6_type) {
 396                 case NDISC_ROUTER_SOLICITATION:
 397                 case NDISC_ROUTER_ADVERTISEMENT:
 398                 case NDISC_NEIGHBOUR_SOLICITATION:
 399                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 400                 case NDISC_REDIRECT:
 401                         /* For reaction involving unicast neighbor discovery
 402                          * message destined to the proxied address, pass it to
 403                          * input function.
 404                          */
 405                         return 1;
 406                 default:
 407                         break;
 408                 }
 409         }
 410
 411         /*
 412          * The proxying router can't forward traffic sent to a link-local
 413          * address, so signal the sender and discard the packet. This
 414          * behavior is clarified by the MIPv6 specification.
 415          */
 416         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 417                 dst_link_failure(skb);
 418                 return -1;
 419         }
 420
 421         return 0;
 422 }
 423
 424 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 425                                      struct sk_buff *skb)
 426 {
 427         struct dst_entry *dst = skb_dst(skb);
 428
 429         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 430         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 431
 432 #ifdef CONFIG_NET_SWITCHDEV
 433         if (skb->offload_l3_fwd_mark) {
 434                 consume_skb(skb);
 435                 return 0;
 436         }
 437 #endif
 438
 439         skb->tstamp = 0;
 440         return dst_output(net, sk, skb);
 441 }
 442
 443 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 444 {
 445         if (skb->len <= mtu)
 446                 return false;
 447
 448         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 449         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 450                 return true;
 451
 452         if (skb->ignore_df)
 453                 return false;
 454
 455         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 456                 return false;
 457
 458         return true;
 459 }
 460
 461 int ip6_forward(struct sk_buff *skb)
 462 {
 463         struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
 464         struct dst_entry *dst = skb_dst(skb);
 465         struct ipv6hdr *hdr = ipv6_hdr(skb);
 466         struct inet6_skb_parm *opt = IP6CB(skb);
 467         struct net *net = dev_net(dst->dev);
 468         u32 mtu;
 469
 470         if (net->ipv6.devconf_all->forwarding == 0)
 471                 goto error;
 472
 473         if (skb->pkt_type != PACKET_HOST)
 474                 goto drop;
 475
 476         if (unlikely(skb->sk))
 477                 goto drop;
 478
 479         if (skb_warn_if_lro(skb))
 480                 goto drop;
 481
 482         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 483                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 484                 goto drop;
 485         }
 486
 487         skb_forward_csum(skb);
 488
 489         /*
 490          *      We DO NOT make any processing on
 491          *      RA packets, pushing them to user level AS IS
 492          *      without ane WARRANTY that application will be able
 493          *      to interpret them. The reason is that we
 494          *      cannot make anything clever here.
 495          *
 496          *      We are not end-node, so that if packet contains
 497          *      AH/ESP, we cannot make anything.
 498          *      Defragmentation also would be mistake, RA packets
 499          *      cannot be fragmented, because there is no warranty
 500          *      that different fragments will go along one path. --ANK
 501          */
 502         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 503                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 504                         return 0;
 505         }
 506
 507         /*
 508          *      check and decrement ttl
 509          */
 510         if (hdr->hop_limit <= 1) {
 511                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 512                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 513
 514                 kfree_skb(skb);
 515                 return -ETIMEDOUT;
 516         }
 517
 518         /* XXX: idev->cnf.proxy_ndp? */
 519         if (net->ipv6.devconf_all->proxy_ndp &&
 520             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 521                 int proxied = ip6_forward_proxy_check(skb);
 522                 if (proxied > 0)
 523                         return ip6_input(skb);
 524                 else if (proxied < 0) {
 525                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 526                         goto drop;
 527                 }
 528         }
 529
 530         if (!xfrm6_route_forward(skb)) {
 531                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 532                 goto drop;
 533         }
 534         dst = skb_dst(skb);
 535
 536         /* IPv6 specs say nothing about it, but it is clear that we cannot
 537            send redirects to source routed frames.
 538            We don't send redirects to frames decapsulated from IPsec.
 539          */
 540         if (IP6CB(skb)->iif == dst->dev->ifindex &&
 541             opt->srcrt == 0 && !skb_sec_path(skb)) {
 542                 struct in6_addr *target = NULL;
 543                 struct inet_peer *peer;
 544                 struct rt6_info *rt;
 545
 546                 /*
 547                  *      incoming and outgoing devices are the same
 548                  *      send a redirect.
 549                  */
 550
 551                 rt = (struct rt6_info *) dst;
 552                 if (rt->rt6i_flags & RTF_GATEWAY)
 553                         target = &rt->rt6i_gateway;
 554                 else
 555                         target = &hdr->daddr;
 556
 557                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 558
 559                 /* Limit redirects both by destination (here)
 560                    and by source (inside ndisc_send_redirect)
 561                  */
 562                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 563                         ndisc_send_redirect(skb, target);
 564                 if (peer)
 565                         inet_putpeer(peer);
 566         } else {
 567                 int addrtype = ipv6_addr_type(&hdr->saddr);
 568
 569                 /* This check is security critical. */
 570                 if (addrtype == IPV6_ADDR_ANY ||
 571                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 572                         goto error;
 573                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 574                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 575                                     ICMPV6_NOT_NEIGHBOUR, 0);
 576                         goto error;
 577                 }
 578         }
 579
 580         mtu = ip6_dst_mtu_forward(dst);
 581         if (mtu < IPV6_MIN_MTU)
 582                 mtu = IPV6_MIN_MTU;
 583
 584         if (ip6_pkt_too_big(skb, mtu)) {
 585                 /* Again, force OUTPUT device used as source address */
 586                 skb->dev = dst->dev;
 587                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 588                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
 589                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 590                                 IPSTATS_MIB_FRAGFAILS);
 591                 kfree_skb(skb);
 592                 return -EMSGSIZE;
 593         }
 594
 595         if (skb_cow(skb, dst->dev->hard_header_len)) {
 596                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 597                                 IPSTATS_MIB_OUTDISCARDS);
 598                 goto drop;
 599         }
 600
 601         hdr = ipv6_hdr(skb);
 602
 603         /* Mangling hops number delayed to point after skb COW */
 604
 605         hdr->hop_limit--;
 606
 607         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 608                        net, NULL, skb, skb->dev, dst->dev,
 609                        ip6_forward_finish);
 610
 611 error:
 612         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 613 drop:
 614         kfree_skb(skb);
 615         return -EINVAL;
 616 }
 617
 618 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 619 {
 620         to->pkt_type = from->pkt_type;
 621         to->priority = from->priority;
 622         to->protocol = from->protocol;
 623         skb_dst_drop(to);
 624         skb_dst_set(to, dst_clone(skb_dst(from)));
 625         to->dev = from->dev;
 626         to->mark = from->mark;
 627
 628         skb_copy_hash(to, from);
 629
 630 #ifdef CONFIG_NET_SCHED
 631         to->tc_index = from->tc_index;
 632 #endif
 633         nf_copy(to, from);
 634         skb_ext_copy(to, from);
 635         skb_copy_secmark(to, from);
 636 }
 637
 638 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
 639                       u8 nexthdr, __be32 frag_id,
 640                       struct ip6_fraglist_iter *iter)
 641 {
 642         unsigned int first_len;
 643         struct frag_hdr *fh;
 644
 645         /* BUILD HEADER */
 646         *prevhdr = NEXTHDR_FRAGMENT;
 647         iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 648         if (!iter->tmp_hdr)
 649                 return -ENOMEM;
 650
 651         iter->frag = skb_shinfo(skb)->frag_list;
 652         skb_frag_list_init(skb);
 653
 654         iter->offset = 0;
 655         iter->hlen = hlen;
 656         iter->frag_id = frag_id;
 657         iter->nexthdr = nexthdr;
 658
 659         __skb_pull(skb, hlen);
 660         fh = __skb_push(skb, sizeof(struct frag_hdr));
 661         __skb_push(skb, hlen);
 662         skb_reset_network_header(skb);
 663         memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
 664
 665         fh->nexthdr = nexthdr;
 666         fh->reserved = 0;
 667         fh->frag_off = htons(IP6_MF);
 668         fh->identification = frag_id;
 669
 670         first_len = skb_pagelen(skb);
 671         skb->data_len = first_len - skb_headlen(skb);
 672         skb->len = first_len;
 673         ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
 674
 675         return 0;
 676 }
 677 EXPORT_SYMBOL(ip6_fraglist_init);
 678
 679 void ip6_fraglist_prepare(struct sk_buff *skb,
 680                           struct ip6_fraglist_iter *iter)
 681 {
 682         struct sk_buff *frag = iter->frag;
 683         unsigned int hlen = iter->hlen;
 684         struct frag_hdr *fh;
 685
 686         frag->ip_summed = CHECKSUM_NONE;
 687         skb_reset_transport_header(frag);
 688         fh = __skb_push(frag, sizeof(struct frag_hdr));
 689         __skb_push(frag, hlen);
 690         skb_reset_network_header(frag);
 691         memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
 692         iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
 693         fh->nexthdr = iter->nexthdr;
 694         fh->reserved = 0;
 695         fh->frag_off = htons(iter->offset);
 696         if (frag->next)
 697                 fh->frag_off |= htons(IP6_MF);
 698         fh->identification = iter->frag_id;
 699         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 700         ip6_copy_metadata(frag, skb);
 701 }
 702 EXPORT_SYMBOL(ip6_fraglist_prepare);
 703
 704 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
 705                    unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
 706                    u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
 707 {
 708         state->prevhdr = prevhdr;
 709         state->nexthdr = nexthdr;
 710         state->frag_id = frag_id;
 711
 712         state->hlen = hlen;
 713         state->mtu = mtu;
 714
 715         state->left = skb->len - hlen;  /* Space per frame */
 716         state->ptr = hlen;              /* Where to start from */
 717
 718         state->hroom = hdr_room;
 719         state->troom = needed_tailroom;
 720
 721         state->offset = 0;
 722 }
 723 EXPORT_SYMBOL(ip6_frag_init);
 724
 725 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
 726 {
 727         u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
 728         struct sk_buff *frag;
 729         struct frag_hdr *fh;
 730         unsigned int len;
 731
 732         len = state->left;
 733         /* IF: it doesn't fit, use 'mtu' - the data space left */
 734         if (len > state->mtu)
 735                 len = state->mtu;
 736         /* IF: we are not sending up to and including the packet end
 737            then align the next start on an eight byte boundary */
 738         if (len < state->left)
 739                 len &= ~7;
 740
 741         /* Allocate buffer */
 742         frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
 743                          state->hroom + state->troom, GFP_ATOMIC);
 744         if (!frag)
 745                 return ERR_PTR(-ENOMEM);
 746
 747         /*
 748          *      Set up data on packet
 749          */
 750
 751         ip6_copy_metadata(frag, skb);
 752         skb_reserve(frag, state->hroom);
 753         skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
 754         skb_reset_network_header(frag);
 755         fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
 756         frag->transport_header = (frag->network_header + state->hlen +
 757                                   sizeof(struct frag_hdr));
 758
 759         /*
 760          *      Charge the memory for the fragment to any owner
 761          *      it might possess
 762          */
 763         if (skb->sk)
 764                 skb_set_owner_w(frag, skb->sk);
 765
 766         /*
 767          *      Copy the packet header into the new buffer.
 768          */
 769         skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
 770
 771         fragnexthdr_offset = skb_network_header(frag);
 772         fragnexthdr_offset += prevhdr - skb_network_header(skb);
 773         *fragnexthdr_offset = NEXTHDR_FRAGMENT;
 774
 775         /*
 776          *      Build fragment header.
 777          */
 778         fh->nexthdr = state->nexthdr;
 779         fh->reserved = 0;
 780         fh->identification = state->frag_id;
 781
 782         /*
 783          *      Copy a block of the IP datagram.
 784          */
 785         BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
 786                              len));
 787         state->left -= len;
 788
 789         fh->frag_off = htons(state->offset);
 790         if (state->left > 0)
 791                 fh->frag_off |= htons(IP6_MF);
 792         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 793
 794         state->ptr += len;
 795         state->offset += len;
 796
 797         return frag;
 798 }
 799 EXPORT_SYMBOL(ip6_frag_next);
 800
 801 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 802                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 803 {
 804         struct sk_buff *frag;
 805         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 806         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 807                                 inet6_sk(skb->sk) : NULL;
 808         struct ip6_frag_state state;
 809         unsigned int mtu, hlen, nexthdr_offset;
 810         ktime_t tstamp = skb->tstamp;
 811         int hroom, err = 0;
 812         __be32 frag_id;
 813         u8 *prevhdr, nexthdr = 0;
 814
 815         err = ip6_find_1stfragopt(skb, &prevhdr);
 816         if (err < 0)
 817                 goto fail;
 818         hlen = err;
 819         nexthdr = *prevhdr;
 820         nexthdr_offset = prevhdr - skb_network_header(skb);
 821
 822         mtu = ip6_skb_dst_mtu(skb);
 823
 824         /* We must not fragment if the socket is set to force MTU discovery
 825          * or if the skb it not generated by a local socket.
 826          */
 827         if (unlikely(!skb->ignore_df && skb->len > mtu))
 828                 goto fail_toobig;
 829
 830         if (IP6CB(skb)->frag_max_size) {
 831                 if (IP6CB(skb)->frag_max_size > mtu)
 832                         goto fail_toobig;
 833
 834                 /* don't send fragments larger than what we received */
 835                 mtu = IP6CB(skb)->frag_max_size;
 836                 if (mtu < IPV6_MIN_MTU)
 837                         mtu = IPV6_MIN_MTU;
 838         }
 839
 840         if (np && np->frag_size < mtu) {
 841                 if (np->frag_size)
 842                         mtu = np->frag_size;
 843         }
 844         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 845                 goto fail_toobig;
 846         mtu -= hlen + sizeof(struct frag_hdr);
 847
 848         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 849                                     &ipv6_hdr(skb)->saddr);
 850
 851         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 852             (err = skb_checksum_help(skb)))
 853                 goto fail;
 854
 855         prevhdr = skb_network_header(skb) + nexthdr_offset;
 856         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 857         if (skb_has_frag_list(skb)) {
 858                 unsigned int first_len = skb_pagelen(skb);
 859                 struct ip6_fraglist_iter iter;
 860                 struct sk_buff *frag2;
 861
 862                 if (first_len - hlen > mtu ||
 863                     ((first_len - hlen) & 7) ||
 864                     skb_cloned(skb) ||
 865                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 866                         goto slow_path;
 867
 868                 skb_walk_frags(skb, frag) {
 869                         /* Correct geometry. */
 870                         if (frag->len > mtu ||
 871                             ((frag->len & 7) && frag->next) ||
 872                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 873                                 goto slow_path_clean;
 874
 875                         /* Partially cloned skb? */
 876                         if (skb_shared(frag))
 877                                 goto slow_path_clean;
 878
 879                         BUG_ON(frag->sk);
 880                         if (skb->sk) {
 881                                 frag->sk = skb->sk;
 882                                 frag->destructor = sock_wfree;
 883                         }
 884                         skb->truesize -= frag->truesize;
 885                 }
 886
 887                 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
 888                                         &iter);
 889                 if (err < 0)
 890                         goto fail;
 891
 892                 for (;;) {
 893                         /* Prepare header of the next frame,
 894                          * before previous one went down. */
 895                         if (iter.frag)
 896                                 ip6_fraglist_prepare(skb, &iter);
 897
 898                         skb->tstamp = tstamp;
 899                         err = output(net, sk, skb);
 900                         if (!err)
 901                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 902                                               IPSTATS_MIB_FRAGCREATES);
 903
 904                         if (err || !iter.frag)
 905                                 break;
 906
 907                         skb = ip6_fraglist_next(&iter);
 908                 }
 909
 910                 kfree(iter.tmp_hdr);
 911
 912                 if (err == 0) {
 913                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 914                                       IPSTATS_MIB_FRAGOKS);
 915                         return 0;
 916                 }
 917
 918                 kfree_skb_list(iter.frag);
 919
 920                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 921                               IPSTATS_MIB_FRAGFAILS);
 922                 return err;
 923
 924 slow_path_clean:
 925                 skb_walk_frags(skb, frag2) {
 926                         if (frag2 == frag)
 927                                 break;
 928                         frag2->sk = NULL;
 929                         frag2->destructor = NULL;
 930                         skb->truesize += frag2->truesize;
 931                 }
 932         }
 933
 934 slow_path:
 935         /*
 936          *      Fragment the datagram.
 937          */
 938
 939         ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
 940                       LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
 941                       &state);
 942
 943         /*
 944          *      Keep copying data until we run out.
 945          */
 946
 947         while (state.left > 0) {
 948                 frag = ip6_frag_next(skb, &state);
 949                 if (IS_ERR(frag)) {
 950                         err = PTR_ERR(frag);
 951                         goto fail;
 952                 }
 953
 954                 /*
 955                  *      Put this fragment into the sending queue.
 956                  */
 957                 frag->tstamp = tstamp;
 958                 err = output(net, sk, frag);
 959                 if (err)
 960                         goto fail;
 961
 962                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 963                               IPSTATS_MIB_FRAGCREATES);
 964         }
 965         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 966                       IPSTATS_MIB_FRAGOKS);
 967         consume_skb(skb);
 968         return err;
 969
 970 fail_toobig:
 971         if (skb->sk && dst_allfrag(skb_dst(skb)))
 972                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 973
 974         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 975         err = -EMSGSIZE;
 976
 977 fail:
 978         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 979                       IPSTATS_MIB_FRAGFAILS);
 980         kfree_skb(skb);
 981         return err;
 982 }
 983
 984 static inline int ip6_rt_check(const struct rt6key *rt_key,
 985                                const struct in6_addr *fl_addr,
 986                                const struct in6_addr *addr_cache)
 987 {
 988         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 989                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
 990 }
 991
 992 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 993                                           struct dst_entry *dst,
 994                                           const struct flowi6 *fl6)
 995 {
 996         struct ipv6_pinfo *np = inet6_sk(sk);
 997         struct rt6_info *rt;
 998
 999         if (!dst)
1000                 goto out;
1001
1002         if (dst->ops->family != AF_INET6) {
1003                 dst_release(dst);
1004                 return NULL;
1005         }
1006
1007         rt = (struct rt6_info *)dst;
1008         /* Yes, checking route validity in not connected
1009          * case is not very simple. Take into account,
1010          * that we do not support routing by source, TOS,
1011          * and MSG_DONTROUTE            --ANK (980726)
1012          *
1013          * 1. ip6_rt_check(): If route was host route,
1014          *    check that cached destination is current.
1015          *    If it is network route, we still may
1016          *    check its validity using saved pointer
1017          *    to the last used address: daddr_cache.
1018          *    We do not want to save whole address now,
1019          *    (because main consumer of this service
1020          *    is tcp, which has not this problem),
1021          *    so that the last trick works only on connected
1022          *    sockets.
1023          * 2. oif also should be the same.
1024          */
1025         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1026 #ifdef CONFIG_IPV6_SUBTREES
1027             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1028 #endif
1029            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1030               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1031                 dst_release(dst);
1032                 dst = NULL;
1033         }
1034
1035 out:
1036         return dst;
1037 }
1038
1039 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1040                                struct dst_entry **dst, struct flowi6 *fl6)
1041 {
1042 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1043         struct neighbour *n;
1044         struct rt6_info *rt;
1045 #endif
1046         int err;
1047         int flags = 0;
1048
1049         /* The correct way to handle this would be to do
1050          * ip6_route_get_saddr, and then ip6_route_output; however,
1051          * the route-specific preferred source forces the
1052          * ip6_route_output call _before_ ip6_route_get_saddr.
1053          *
1054          * In source specific routing (no src=any default route),
1055          * ip6_route_output will fail given src=any saddr, though, so
1056          * that's why we try it again later.
1057          */
1058         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
1059                 struct fib6_info *from;
1060                 struct rt6_info *rt;
1061                 bool had_dst = *dst != NULL;
1062
1063                 if (!had_dst)
1064                         *dst = ip6_route_output(net, sk, fl6);
1065                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1066
1067                 rcu_read_lock();
1068                 from = rt ? rcu_dereference(rt->from) : NULL;
1069                 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1070                                           sk ? inet6_sk(sk)->srcprefs : 0,
1071                                           &fl6->saddr);
1072                 rcu_read_unlock();
1073
1074                 if (err)
1075                         goto out_err_release;
1076
1077                 /* If we had an erroneous initial result, pretend it
1078                  * never existed and let the SA-enabled version take
1079                  * over.
1080                  */
1081                 if (!had_dst && (*dst)->error) {
1082                         dst_release(*dst);
1083                         *dst = NULL;
1084                 }
1085
1086                 if (fl6->flowi6_oif)
1087                         flags |= RT6_LOOKUP_F_IFACE;
1088         }
1089
1090         if (!*dst)
1091                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1092
1093         err = (*dst)->error;
1094         if (err)
1095                 goto out_err_release;
1096
1097 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1098         /*
1099          * Here if the dst entry we've looked up
1100          * has a neighbour entry that is in the INCOMPLETE
1101          * state and the src address from the flow is
1102          * marked as OPTIMISTIC, we release the found
1103          * dst entry and replace it instead with the
1104          * dst entry of the nexthop router
1105          */
1106         rt = (struct rt6_info *) *dst;
1107         rcu_read_lock_bh();
1108         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1109                                       rt6_nexthop(rt, &fl6->daddr));
1110         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1111         rcu_read_unlock_bh();
1112
1113         if (err) {
1114                 struct inet6_ifaddr *ifp;
1115                 struct flowi6 fl_gw6;
1116                 int redirect;
1117
1118                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1119                                       (*dst)->dev, 1);
1120
1121                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1122                 if (ifp)
1123                         in6_ifa_put(ifp);
1124
1125                 if (redirect) {
1126                         /*
1127                          * We need to get the dst entry for the
1128                          * default router instead
1129                          */
1130                         dst_release(*dst);
1131                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1132                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1133                         *dst = ip6_route_output(net, sk, &fl_gw6);
1134                         err = (*dst)->error;
1135                         if (err)
1136                                 goto out_err_release;
1137                 }
1138         }
1139 #endif
1140         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1141             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1142                 err = -EAFNOSUPPORT;
1143                 goto out_err_release;
1144         }
1145
1146         return 0;
1147
1148 out_err_release:
1149         dst_release(*dst);
1150         *dst = NULL;
1151
1152         if (err == -ENETUNREACH)
1153                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1154         return err;
1155 }
1156
1157 /**
1158  *      ip6_dst_lookup - perform route lookup on flow
1159  *      @net: Network namespace to perform lookup in
1160  *      @sk: socket which provides route info
1161  *      @dst: pointer to dst_entry * for result
1162  *      @fl6: flow to lookup
1163  *
1164  *      This function performs a route lookup on the given flow.
1165  *
1166  *      It returns zero on success, or a standard errno code on error.
1167  */
1168 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1169                    struct flowi6 *fl6)
1170 {
1171         *dst = NULL;
1172         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1173 }
1174 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1175
1176 /**
1177  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1178  *      @net: Network namespace to perform lookup in
1179  *      @sk: socket which provides route info
1180  *      @fl6: flow to lookup
1181  *      @final_dst: final destination address for ipsec lookup
1182  *
1183  *      This function performs a route lookup on the given flow.
1184  *
1185  *      It returns a valid dst pointer on success, or a pointer encoded
1186  *      error code.
1187  */
1188 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1189                                       const struct in6_addr *final_dst)
1190 {
1191         struct dst_entry *dst = NULL;
1192         int err;
1193
1194         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1195         if (err)
1196                 return ERR_PTR(err);
1197         if (final_dst)
1198                 fl6->daddr = *final_dst;
1199
1200         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1201 }
1202 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1203
1204 /**
1205  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1206  *      @sk: socket which provides the dst cache and route info
1207  *      @fl6: flow to lookup
1208  *      @final_dst: final destination address for ipsec lookup
1209  *      @connected: whether @sk is connected or not
1210  *
1211  *      This function performs a route lookup on the given flow with the
1212  *      possibility of using the cached route in the socket if it is valid.
1213  *      It will take the socket dst lock when operating on the dst cache.
1214  *      As a result, this function can only be used in process context.
1215  *
1216  *      In addition, for a connected socket, cache the dst in the socket
1217  *      if the current cache is not valid.
1218  *
1219  *      It returns a valid dst pointer on success, or a pointer encoded
1220  *      error code.
1221  */
1222 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1223                                          const struct in6_addr *final_dst,
1224                                          bool connected)
1225 {
1226         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1227
1228         dst = ip6_sk_dst_check(sk, dst, fl6);
1229         if (dst)
1230                 return dst;
1231
1232         dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1233         if (connected && !IS_ERR(dst))
1234                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1235
1236         return dst;
1237 }
1238 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1239
1240 /**
1241  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1242  *      @skb: Packet for which lookup is done
1243  *      @dev: Tunnel device
1244  *      @net: Network namespace of tunnel device
1245  *      @sock: Socket which provides route info
1246  *      @saddr: Memory to store the src ip address
1247  *      @info: Tunnel information
1248  *      @protocol: IP protocol
1249  *      @use_cache: Flag to enable cache usage
1250  *      This function performs a route lookup on a tunnel
1251  *
1252  *      It returns a valid dst pointer and stores src address to be used in
1253  *      tunnel in param saddr on success, else a pointer encoded error code.
1254  */
1255
1256 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1257                                         struct net_device *dev,
1258                                         struct net *net,
1259                                         struct socket *sock,
1260                                         struct in6_addr *saddr,
1261                                         const struct ip_tunnel_info *info,
1262                                         u8 protocol,
1263                                         bool use_cache)
1264 {
1265         struct dst_entry *dst = NULL;
1266 #ifdef CONFIG_DST_CACHE
1267         struct dst_cache *dst_cache;
1268 #endif
1269         struct flowi6 fl6;
1270         __u8 prio;
1271
1272 #ifdef CONFIG_DST_CACHE
1273         dst_cache = (struct dst_cache *)&info->dst_cache;
1274         if (use_cache) {
1275                 dst = dst_cache_get_ip6(dst_cache, saddr);
1276                 if (dst)
1277                         return dst;
1278         }
1279 #endif
1280         memset(&fl6, 0, sizeof(fl6));
1281         fl6.flowi6_mark = skb->mark;
1282         fl6.flowi6_proto = protocol;
1283         fl6.daddr = info->key.u.ipv6.dst;
1284         fl6.saddr = info->key.u.ipv6.src;
1285         prio = info->key.tos;
1286         fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
1287                                           info->key.label);
1288
1289         dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1290                                               NULL);
1291         if (IS_ERR(dst)) {
1292                 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1293                 return ERR_PTR(-ENETUNREACH);
1294         }
1295         if (dst->dev == dev) { /* is this necessary? */
1296                 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1297                 dst_release(dst);
1298                 return ERR_PTR(-ELOOP);
1299         }
1300 #ifdef CONFIG_DST_CACHE
1301         if (use_cache)
1302                 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1303 #endif
1304         *saddr = fl6.saddr;
1305         return dst;
1306 }
1307 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1308
1309 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1310                                                gfp_t gfp)
1311 {
1312         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1313 }
1314
1315 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1316                                                 gfp_t gfp)
1317 {
1318         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1319 }
1320
1321 static void ip6_append_data_mtu(unsigned int *mtu,
1322                                 int *maxfraglen,
1323                                 unsigned int fragheaderlen,
1324                                 struct sk_buff *skb,
1325                                 struct rt6_info *rt,
1326                                 unsigned int orig_mtu)
1327 {
1328         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1329                 if (!skb) {
1330                         /* first fragment, reserve header_len */
1331                         *mtu = orig_mtu - rt->dst.header_len;
1332
1333                 } else {
1334                         /*
1335                          * this fragment is not first, the headers
1336                          * space is regarded as data space.
1337                          */
1338                         *mtu = orig_mtu;
1339                 }
1340                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1341                               + fragheaderlen - sizeof(struct frag_hdr);
1342         }
1343 }
1344
1345 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1346                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1347                           struct rt6_info *rt, struct flowi6 *fl6)
1348 {
1349         struct ipv6_pinfo *np = inet6_sk(sk);
1350         unsigned int mtu;
1351         struct ipv6_txoptions *opt = ipc6->opt;
1352
1353         /*
1354          * setup for corking
1355          */
1356         if (opt) {
1357                 if (WARN_ON(v6_cork->opt))
1358                         return -EINVAL;
1359
1360                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1361                 if (unlikely(!v6_cork->opt))
1362                         return -ENOBUFS;
1363
1364                 v6_cork->opt->tot_len = sizeof(*opt);
1365                 v6_cork->opt->opt_flen = opt->opt_flen;
1366                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1367
1368                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1369                                                     sk->sk_allocation);
1370                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1371                         return -ENOBUFS;
1372
1373                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1374                                                     sk->sk_allocation);
1375                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1376                         return -ENOBUFS;
1377
1378                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1379                                                    sk->sk_allocation);
1380                 if (opt->hopopt && !v6_cork->opt->hopopt)
1381                         return -ENOBUFS;
1382
1383                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1384                                                     sk->sk_allocation);
1385                 if (opt->srcrt && !v6_cork->opt->srcrt)
1386                         return -ENOBUFS;
1387
1388                 /* need source address above miyazawa*/
1389         }
1390         dst_hold(&rt->dst);
1391         cork->base.dst = &rt->dst;
1392         cork->fl.u.ip6 = *fl6;
1393         v6_cork->hop_limit = ipc6->hlimit;
1394         v6_cork->tclass = ipc6->tclass;
1395         if (rt->dst.flags & DST_XFRM_TUNNEL)
1396                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1397                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1398         else
1399                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1400                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1401         if (np->frag_size < mtu) {
1402                 if (np->frag_size)
1403                         mtu = np->frag_size;
1404         }
1405         if (mtu < IPV6_MIN_MTU)
1406                 return -EINVAL;
1407         cork->base.fragsize = mtu;
1408         cork->base.gso_size = ipc6->gso_size;
1409         cork->base.tx_flags = 0;
1410         cork->base.mark = ipc6->sockc.mark;
1411         sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1412
1413         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1414                 cork->base.flags |= IPCORK_ALLFRAG;
1415         cork->base.length = 0;
1416
1417         cork->base.transmit_time = ipc6->sockc.transmit_time;
1418
1419         return 0;
1420 }
1421
1422 static int __ip6_append_data(struct sock *sk,
1423                              struct flowi6 *fl6,
1424                              struct sk_buff_head *queue,
1425                              struct inet_cork *cork,
1426                              struct inet6_cork *v6_cork,
1427                              struct page_frag *pfrag,
1428                              int getfrag(void *from, char *to, int offset,
1429                                          int len, int odd, struct sk_buff *skb),
1430                              void *from, int length, int transhdrlen,
1431                              unsigned int flags, struct ipcm6_cookie *ipc6)
1432 {
1433         struct sk_buff *skb, *skb_prev = NULL;
1434         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1435         struct ubuf_info *uarg = NULL;
1436         int exthdrlen = 0;
1437         int dst_exthdrlen = 0;
1438         int hh_len;
1439         int copy;
1440         int err;
1441         int offset = 0;
1442         u32 tskey = 0;
1443         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1444         struct ipv6_txoptions *opt = v6_cork->opt;
1445         int csummode = CHECKSUM_NONE;
1446         unsigned int maxnonfragsize, headersize;
1447         unsigned int wmem_alloc_delta = 0;
1448         bool paged, extra_uref = false;
1449
1450         skb = skb_peek_tail(queue);
1451         if (!skb) {
1452                 exthdrlen = opt ? opt->opt_flen : 0;
1453                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1454         }
1455
1456         paged = !!cork->gso_size;
1457         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1458         orig_mtu = mtu;
1459
1460         if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1461             sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1462                 tskey = sk->sk_tskey++;
1463
1464         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1465
1466         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1467                         (opt ? opt->opt_nflen : 0);
1468         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1469                      sizeof(struct frag_hdr);
1470
1471         headersize = sizeof(struct ipv6hdr) +
1472                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1473                      (dst_allfrag(&rt->dst) ?
1474                       sizeof(struct frag_hdr) : 0) +
1475                      rt->rt6i_nfheader_len;
1476
1477         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1478          * the first fragment
1479          */
1480         if (headersize + transhdrlen > mtu)
1481                 goto emsgsize;
1482
1483         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1484             (sk->sk_protocol == IPPROTO_UDP ||
1485              sk->sk_protocol == IPPROTO_RAW)) {
1486                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1487                                 sizeof(struct ipv6hdr));
1488                 goto emsgsize;
1489         }
1490
1491         if (ip6_sk_ignore_df(sk))
1492                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1493         else
1494                 maxnonfragsize = mtu;
1495
1496         if (cork->length + length > maxnonfragsize - headersize) {
1497 emsgsize:
1498                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1499                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1500                 return -EMSGSIZE;
1501         }
1502
1503         /* CHECKSUM_PARTIAL only with no extension headers and when
1504          * we are not going to fragment
1505          */
1506         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1507             headersize == sizeof(struct ipv6hdr) &&
1508             length <= mtu - headersize &&
1509             (!(flags & MSG_MORE) || cork->gso_size) &&
1510             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1511                 csummode = CHECKSUM_PARTIAL;
1512
1513         if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1514                 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1515                 if (!uarg)
1516                         return -ENOBUFS;
1517                 extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1518                 if (rt->dst.dev->features & NETIF_F_SG &&
1519                     csummode == CHECKSUM_PARTIAL) {
1520                         paged = true;
1521                 } else {
1522                         uarg->zerocopy = 0;
1523                         skb_zcopy_set(skb, uarg, &extra_uref);
1524                 }
1525         }
1526
1527         /*
1528          * Let's try using as much space as possible.
1529          * Use MTU if total length of the message fits into the MTU.
1530          * Otherwise, we need to reserve fragment header and
1531          * fragment alignment (= 8-15 octects, in total).
1532          *
1533          * Note that we may need to "move" the data from the tail
1534          * of the buffer to the new fragment when we split
1535          * the message.
1536          *
1537          * FIXME: It may be fragmented into multiple chunks
1538          *        at once if non-fragmentable extension headers
1539          *        are too large.
1540          * --yoshfuji
1541          */
1542
1543         cork->length += length;
1544         if (!skb)
1545                 goto alloc_new_skb;
1546
1547         while (length > 0) {
1548                 /* Check if the remaining data fits into current packet. */
1549                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1550                 if (copy < length)
1551                         copy = maxfraglen - skb->len;
1552
1553                 if (copy <= 0) {
1554                         char *data;
1555                         unsigned int datalen;
1556                         unsigned int fraglen;
1557                         unsigned int fraggap;
1558                         unsigned int alloclen;
1559                         unsigned int pagedlen;
1560 alloc_new_skb:
1561                         /* There's no room in the current skb */
1562                         if (skb)
1563                                 fraggap = skb->len - maxfraglen;
1564                         else
1565                                 fraggap = 0;
1566                         /* update mtu and maxfraglen if necessary */
1567                         if (!skb || !skb_prev)
1568                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1569                                                     fragheaderlen, skb, rt,
1570                                                     orig_mtu);
1571
1572                         skb_prev = skb;
1573
1574                         /*
1575                          * If remaining data exceeds the mtu,
1576                          * we know we need more fragment(s).
1577                          */
1578                         datalen = length + fraggap;
1579
1580                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1581                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1582                         fraglen = datalen + fragheaderlen;
1583                         pagedlen = 0;
1584
1585                         if ((flags & MSG_MORE) &&
1586                             !(rt->dst.dev->features&NETIF_F_SG))
1587                                 alloclen = mtu;
1588                         else if (!paged)
1589                                 alloclen = fraglen;
1590                         else {
1591                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1592                                 pagedlen = fraglen - alloclen;
1593                         }
1594
1595                         alloclen += dst_exthdrlen;
1596
1597                         if (datalen != length + fraggap) {
1598                                 /*
1599                                  * this is not the last fragment, the trailer
1600                                  * space is regarded as data space.
1601                                  */
1602                                 datalen += rt->dst.trailer_len;
1603                         }
1604
1605                         alloclen += rt->dst.trailer_len;
1606                         fraglen = datalen + fragheaderlen;
1607
1608                         /*
1609                          * We just reserve space for fragment header.
1610                          * Note: this may be overallocation if the message
1611                          * (without MSG_MORE) fits into the MTU.
1612                          */
1613                         alloclen += sizeof(struct frag_hdr);
1614
1615                         copy = datalen - transhdrlen - fraggap - pagedlen;
1616                         if (copy < 0) {
1617                                 err = -EINVAL;
1618                                 goto error;
1619                         }
1620                         if (transhdrlen) {
1621                                 skb = sock_alloc_send_skb(sk,
1622                                                 alloclen + hh_len,
1623                                                 (flags & MSG_DONTWAIT), &err);
1624                         } else {
1625                                 skb = NULL;
1626                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1627                                     2 * sk->sk_sndbuf)
1628                                         skb = alloc_skb(alloclen + hh_len,
1629                                                         sk->sk_allocation);
1630                                 if (unlikely(!skb))
1631                                         err = -ENOBUFS;
1632                         }
1633                         if (!skb)
1634                                 goto error;
1635                         /*
1636                          *      Fill in the control structures
1637                          */
1638                         skb->protocol = htons(ETH_P_IPV6);
1639                         skb->ip_summed = csummode;
1640                         skb->csum = 0;
1641                         /* reserve for fragmentation and ipsec header */
1642                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1643                                     dst_exthdrlen);
1644
1645                         /*
1646                          *      Find where to start putting bytes
1647                          */
1648                         data = skb_put(skb, fraglen - pagedlen);
1649                         skb_set_network_header(skb, exthdrlen);
1650                         data += fragheaderlen;
1651                         skb->transport_header = (skb->network_header +
1652                                                  fragheaderlen);
1653                         if (fraggap) {
1654                                 skb->csum = skb_copy_and_csum_bits(
1655                                         skb_prev, maxfraglen,
1656                                         data + transhdrlen, fraggap);
1657                                 skb_prev->csum = csum_sub(skb_prev->csum,
1658                                                           skb->csum);
1659                                 data += fraggap;
1660                                 pskb_trim_unique(skb_prev, maxfraglen);
1661                         }
1662                         if (copy > 0 &&
1663                             getfrag(from, data + transhdrlen, offset,
1664                                     copy, fraggap, skb) < 0) {
1665                                 err = -EFAULT;
1666                                 kfree_skb(skb);
1667                                 goto error;
1668                         }
1669
1670                         offset += copy;
1671                         length -= copy + transhdrlen;
1672                         transhdrlen = 0;
1673                         exthdrlen = 0;
1674                         dst_exthdrlen = 0;
1675
1676                         /* Only the initial fragment is time stamped */
1677                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1678                         cork->tx_flags = 0;
1679                         skb_shinfo(skb)->tskey = tskey;
1680                         tskey = 0;
1681                         skb_zcopy_set(skb, uarg, &extra_uref);
1682
1683                         if ((flags & MSG_CONFIRM) && !skb_prev)
1684                                 skb_set_dst_pending_confirm(skb, 1);
1685
1686                         /*
1687                          * Put the packet on the pending queue
1688                          */
1689                         if (!skb->destructor) {
1690                                 skb->destructor = sock_wfree;
1691                                 skb->sk = sk;
1692                                 wmem_alloc_delta += skb->truesize;
1693                         }
1694                         __skb_queue_tail(queue, skb);
1695                         continue;
1696                 }
1697
1698                 if (copy > length)
1699                         copy = length;
1700
1701                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1702                     skb_tailroom(skb) >= copy) {
1703                         unsigned int off;
1704
1705                         off = skb->len;
1706                         if (getfrag(from, skb_put(skb, copy),
1707                                                 offset, copy, off, skb) < 0) {
1708                                 __skb_trim(skb, off);
1709                                 err = -EFAULT;
1710                                 goto error;
1711                         }
1712                 } else if (!uarg || !uarg->zerocopy) {
1713                         int i = skb_shinfo(skb)->nr_frags;
1714
1715                         err = -ENOMEM;
1716                         if (!sk_page_frag_refill(sk, pfrag))
1717                                 goto error;
1718
1719                         if (!skb_can_coalesce(skb, i, pfrag->page,
1720                                               pfrag->offset)) {
1721                                 err = -EMSGSIZE;
1722                                 if (i == MAX_SKB_FRAGS)
1723                                         goto error;
1724
1725                                 __skb_fill_page_desc(skb, i, pfrag->page,
1726                                                      pfrag->offset, 0);
1727                                 skb_shinfo(skb)->nr_frags = ++i;
1728                                 get_page(pfrag->page);
1729                         }
1730                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1731                         if (getfrag(from,
1732                                     page_address(pfrag->page) + pfrag->offset,
1733                                     offset, copy, skb->len, skb) < 0)
1734                                 goto error_efault;
1735
1736                         pfrag->offset += copy;
1737                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1738                         skb->len += copy;
1739                         skb->data_len += copy;
1740                         skb->truesize += copy;
1741                         wmem_alloc_delta += copy;
1742                 } else {
1743                         err = skb_zerocopy_iter_dgram(skb, from, copy);
1744                         if (err < 0)
1745                                 goto error;
1746                 }
1747                 offset += copy;
1748                 length -= copy;
1749         }
1750
1751         if (wmem_alloc_delta)
1752                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1753         return 0;
1754
1755 error_efault:
1756         err = -EFAULT;
1757 error:
1758         net_zcopy_put_abort(uarg, extra_uref);
1759         cork->length -= length;
1760         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1761         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1762         return err;
1763 }
1764
1765 int ip6_append_data(struct sock *sk,
1766                     int getfrag(void *from, char *to, int offset, int len,
1767                                 int odd, struct sk_buff *skb),
1768                     void *from, int length, int transhdrlen,
1769                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1770                     struct rt6_info *rt, unsigned int flags)
1771 {
1772         struct inet_sock *inet = inet_sk(sk);
1773         struct ipv6_pinfo *np = inet6_sk(sk);
1774         int exthdrlen;
1775         int err;
1776
1777         if (flags&MSG_PROBE)
1778                 return 0;
1779         if (skb_queue_empty(&sk->sk_write_queue)) {
1780                 /*
1781                  * setup for corking
1782                  */
1783                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1784                                      ipc6, rt, fl6);
1785                 if (err)
1786                         return err;
1787
1788                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1789                 length += exthdrlen;
1790                 transhdrlen += exthdrlen;
1791         } else {
1792                 fl6 = &inet->cork.fl.u.ip6;
1793                 transhdrlen = 0;
1794         }
1795
1796         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1797                                  &np->cork, sk_page_frag(sk), getfrag,
1798                                  from, length, transhdrlen, flags, ipc6);
1799 }
1800 EXPORT_SYMBOL_GPL(ip6_append_data);
1801
1802 static void ip6_cork_release(struct inet_cork_full *cork,
1803                              struct inet6_cork *v6_cork)
1804 {
1805         if (v6_cork->opt) {
1806                 kfree(v6_cork->opt->dst0opt);
1807                 kfree(v6_cork->opt->dst1opt);
1808                 kfree(v6_cork->opt->hopopt);
1809                 kfree(v6_cork->opt->srcrt);
1810                 kfree(v6_cork->opt);
1811                 v6_cork->opt = NULL;
1812         }
1813
1814         if (cork->base.dst) {
1815                 dst_release(cork->base.dst);
1816                 cork->base.dst = NULL;
1817                 cork->base.flags &= ~IPCORK_ALLFRAG;
1818         }
1819         memset(&cork->fl, 0, sizeof(cork->fl));
1820 }
1821
1822 struct sk_buff *__ip6_make_skb(struct sock *sk,
1823                                struct sk_buff_head *queue,
1824                                struct inet_cork_full *cork,
1825                                struct inet6_cork *v6_cork)
1826 {
1827         struct sk_buff *skb, *tmp_skb;
1828         struct sk_buff **tail_skb;
1829         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1830         struct ipv6_pinfo *np = inet6_sk(sk);
1831         struct net *net = sock_net(sk);
1832         struct ipv6hdr *hdr;
1833         struct ipv6_txoptions *opt = v6_cork->opt;
1834         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1835         struct flowi6 *fl6 = &cork->fl.u.ip6;
1836         unsigned char proto = fl6->flowi6_proto;
1837
1838         skb = __skb_dequeue(queue);
1839         if (!skb)
1840                 goto out;
1841         tail_skb = &(skb_shinfo(skb)->frag_list);
1842
1843         /* move skb->data to ip header from ext header */
1844         if (skb->data < skb_network_header(skb))
1845                 __skb_pull(skb, skb_network_offset(skb));
1846         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1847                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1848                 *tail_skb = tmp_skb;
1849                 tail_skb = &(tmp_skb->next);
1850                 skb->len += tmp_skb->len;
1851                 skb->data_len += tmp_skb->len;
1852                 skb->truesize += tmp_skb->truesize;
1853                 tmp_skb->destructor = NULL;
1854                 tmp_skb->sk = NULL;
1855         }
1856
1857         /* Allow local fragmentation. */
1858         skb->ignore_df = ip6_sk_ignore_df(sk);
1859
1860         *final_dst = fl6->daddr;
1861         __skb_pull(skb, skb_network_header_len(skb));
1862         if (opt && opt->opt_flen)
1863                 ipv6_push_frag_opts(skb, opt, &proto);
1864         if (opt && opt->opt_nflen)
1865                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1866
1867         skb_push(skb, sizeof(struct ipv6hdr));
1868         skb_reset_network_header(skb);
1869         hdr = ipv6_hdr(skb);
1870
1871         ip6_flow_hdr(hdr, v6_cork->tclass,
1872                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1873                                         ip6_autoflowlabel(net, np), fl6));
1874         hdr->hop_limit = v6_cork->hop_limit;
1875         hdr->nexthdr = proto;
1876         hdr->saddr = fl6->saddr;
1877         hdr->daddr = *final_dst;
1878
1879         skb->priority = sk->sk_priority;
1880         skb->mark = cork->base.mark;
1881
1882         skb->tstamp = cork->base.transmit_time;
1883
1884         skb_dst_set(skb, dst_clone(&rt->dst));
1885         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1886         if (proto == IPPROTO_ICMPV6) {
1887                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1888
1889                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1890                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1891         }
1892
1893         ip6_cork_release(cork, v6_cork);
1894 out:
1895         return skb;
1896 }
1897
1898 int ip6_send_skb(struct sk_buff *skb)
1899 {
1900         struct net *net = sock_net(skb->sk);
1901         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1902         int err;
1903
1904         err = ip6_local_out(net, skb->sk, skb);
1905         if (err) {
1906                 if (err > 0)
1907                         err = net_xmit_errno(err);
1908                 if (err)
1909                         IP6_INC_STATS(net, rt->rt6i_idev,
1910                                       IPSTATS_MIB_OUTDISCARDS);
1911         }
1912
1913         return err;
1914 }
1915
1916 int ip6_push_pending_frames(struct sock *sk)
1917 {
1918         struct sk_buff *skb;
1919
1920         skb = ip6_finish_skb(sk);
1921         if (!skb)
1922                 return 0;
1923
1924         return ip6_send_skb(skb);
1925 }
1926 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1927
1928 static void __ip6_flush_pending_frames(struct sock *sk,
1929                                        struct sk_buff_head *queue,
1930                                        struct inet_cork_full *cork,
1931                                        struct inet6_cork *v6_cork)
1932 {
1933         struct sk_buff *skb;
1934
1935         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1936                 if (skb_dst(skb))
1937                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1938                                       IPSTATS_MIB_OUTDISCARDS);
1939                 kfree_skb(skb);
1940         }
1941
1942         ip6_cork_release(cork, v6_cork);
1943 }
1944
1945 void ip6_flush_pending_frames(struct sock *sk)
1946 {
1947         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1948                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1949 }
1950 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1951
1952 struct sk_buff *ip6_make_skb(struct sock *sk,
1953                              int getfrag(void *from, char *to, int offset,
1954                                          int len, int odd, struct sk_buff *skb),
1955                              void *from, int length, int transhdrlen,
1956                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1957                              struct rt6_info *rt, unsigned int flags,
1958                              struct inet_cork_full *cork)
1959 {
1960         struct inet6_cork v6_cork;
1961         struct sk_buff_head queue;
1962         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1963         int err;
1964
1965         if (flags & MSG_PROBE)
1966                 return NULL;
1967
1968         __skb_queue_head_init(&queue);
1969
1970         cork->base.flags = 0;
1971         cork->base.addr = 0;
1972         cork->base.opt = NULL;
1973         cork->base.dst = NULL;
1974         v6_cork.opt = NULL;
1975         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1976         if (err) {
1977                 ip6_cork_release(cork, &v6_cork);
1978                 return ERR_PTR(err);
1979         }
1980         if (ipc6->dontfrag < 0)
1981                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1982
1983         err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1984                                 &current->task_frag, getfrag, from,
1985                                 length + exthdrlen, transhdrlen + exthdrlen,
1986                                 flags, ipc6);
1987         if (err) {
1988                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1989                 return ERR_PTR(err);
1990         }
1991
1992         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1993 }