net/ipv6/ip6_output.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  *      IPv6 output functions
   4  *      Linux INET6 implementation
   5  *
   6  *      Authors:
   7  *      Pedro Roque             <roque@di.fc.ul.pt>
   8  *
   9  *      Based on linux/net/ipv4/ip_output.c
  10  *
  11  *      Changes:
  12  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
  13  *                              extension headers are implemented.
  14  *                              route changes now work.
  15  *                              ip6_forward does not confuse sniffers.
  16  *                              etc.
  17  *
  18  *      H. von Brand    :       Added missing #include <linux/string.h>
  19  *      Imran Patel     :       frag id should be in NBO
  20  *      Kazunori MIYAZAWA @USAGI
  21  *                      :       add ip6_append_data and related functions
  22  *                              for datagram xmit
  23  */
  24
  25 #include <linux/errno.h>
  26 #include <linux/kernel.h>
  27 #include <linux/string.h>
  28 #include <linux/socket.h>
  29 #include <linux/net.h>
  30 #include <linux/netdevice.h>
  31 #include <linux/if_arp.h>
  32 #include <linux/in6.h>
  33 #include <linux/tcp.h>
  34 #include <linux/route.h>
  35 #include <linux/module.h>
  36 #include <linux/slab.h>
  37
  38 #include <linux/bpf-cgroup.h>
  39 #include <linux/netfilter.h>
  40 #include <linux/netfilter_ipv6.h>
  41
  42 #include <net/sock.h>
  43 #include <net/snmp.h>
  44
  45 #include <net/ipv6.h>
  46 #include <net/ndisc.h>
  47 #include <net/protocol.h>
  48 #include <net/ip6_route.h>
  49 #include <net/addrconf.h>
  50 #include <net/rawv6.h>
  51 #include <net/icmp.h>
  52 #include <net/xfrm.h>
  53 #include <net/checksum.h>
  54 #include <linux/mroute6.h>
  55 #include <net/l3mdev.h>
  56 #include <net/lwtunnel.h>
  57 #include <net/ip_tunnels.h>
  58
  59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  60 {
  61         struct dst_entry *dst = skb_dst(skb);
  62         struct net_device *dev = dst->dev;
  63         unsigned int hh_len = LL_RESERVED_SPACE(dev);
  64         int delta = hh_len - skb_headroom(skb);
  65         const struct in6_addr *nexthop;
  66         struct neighbour *neigh;
  67         int ret;
  68
  69         /* Be paranoid, rather than too clever. */
  70         if (unlikely(delta > 0) && dev->header_ops) {
  71                 /* pskb_expand_head() might crash, if skb is shared */
  72                 if (skb_shared(skb)) {
  73                         struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
  74
  75                         if (likely(nskb)) {
  76                                 if (skb->sk)
  77                                         skb_set_owner_w(nskb, skb->sk);
  78                                 consume_skb(skb);
  79                         } else {
  80                                 kfree_skb(skb);
  81                         }
  82                         skb = nskb;
  83                 }
  84                 if (skb &&
  85                     pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
  86                         kfree_skb(skb);
  87                         skb = NULL;
  88                 }
  89                 if (!skb) {
  90                         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
  91                         return -ENOMEM;
  92                 }
  93         }
  94
  95         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  96                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  97
  98                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  99                     ((mroute6_is_socket(net, skb) &&
 100                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 101                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 102                                          &ipv6_hdr(skb)->saddr))) {
 103                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 104
 105                         /* Do not check for IFF_ALLMULTI; multicast routing
 106                            is not supported in any case.
 107                          */
 108                         if (newskb)
 109                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 110                                         net, sk, newskb, NULL, newskb->dev,
 111                                         dev_loopback_xmit);
 112
 113                         if (ipv6_hdr(skb)->hop_limit == 0) {
 114                                 IP6_INC_STATS(net, idev,
 115                                               IPSTATS_MIB_OUTDISCARDS);
 116                                 kfree_skb(skb);
 117                                 return 0;
 118                         }
 119                 }
 120
 121                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
 122
 123                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
 124                     IPV6_ADDR_SCOPE_NODELOCAL &&
 125                     !(dev->flags & IFF_LOOPBACK)) {
 126                         kfree_skb(skb);
 127                         return 0;
 128                 }
 129         }
 130
 131         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 132                 int res = lwtunnel_xmit(skb);
 133
 134                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 135                         return res;
 136         }
 137
 138         rcu_read_lock_bh();
 139         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 140         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 141         if (unlikely(!neigh))
 142                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 143         if (!IS_ERR(neigh)) {
 144                 sock_confirm_neigh(skb, neigh);
 145                 ret = neigh_output(neigh, skb, false);
 146                 rcu_read_unlock_bh();
 147                 return ret;
 148         }
 149         rcu_read_unlock_bh();
 150
 151         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 152         kfree_skb(skb);
 153         return -EINVAL;
 154 }
 155
 156 static int
 157 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
 158                                     struct sk_buff *skb, unsigned int mtu)
 159 {
 160         struct sk_buff *segs, *nskb;
 161         netdev_features_t features;
 162         int ret = 0;
 163
 164         /* Please see corresponding comment in ip_finish_output_gso
 165          * describing the cases where GSO segment length exceeds the
 166          * egress MTU.
 167          */
 168         features = netif_skb_features(skb);
 169         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 170         if (IS_ERR_OR_NULL(segs)) {
 171                 kfree_skb(skb);
 172                 return -ENOMEM;
 173         }
 174
 175         consume_skb(skb);
 176
 177         skb_list_walk_safe(segs, segs, nskb) {
 178                 int err;
 179
 180                 skb_mark_not_on_list(segs);
 181                 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
 182                 if (err && ret == 0)
 183                         ret = err;
 184         }
 185
 186         return ret;
 187 }
 188
 189 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 190 {
 191         unsigned int mtu;
 192
 193 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 194         /* Policy lookup after SNAT yielded a new policy */
 195         if (skb_dst(skb)->xfrm) {
 196                 IPCB(skb)->flags |= IPSKB_REROUTED;
 197                 return dst_output(net, sk, skb);
 198         }
 199 #endif
 200
 201         mtu = ip6_skb_dst_mtu(skb);
 202         if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
 203                 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
 204
 205         if ((skb->len > mtu && !skb_is_gso(skb)) ||
 206             dst_allfrag(skb_dst(skb)) ||
 207             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 208                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 209         else
 210                 return ip6_finish_output2(net, sk, skb);
 211 }
 212
 213 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 214 {
 215         int ret;
 216
 217         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 218         switch (ret) {
 219         case NET_XMIT_SUCCESS:
 220                 return __ip6_finish_output(net, sk, skb);
 221         case NET_XMIT_CN:
 222                 return __ip6_finish_output(net, sk, skb) ? : ret;
 223         default:
 224                 kfree_skb(skb);
 225                 return ret;
 226         }
 227 }
 228
 229 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 230 {
 231         struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
 232         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 233
 234         skb->protocol = htons(ETH_P_IPV6);
 235         skb->dev = dev;
 236
 237         if (unlikely(idev->cnf.disable_ipv6)) {
 238                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 239                 kfree_skb(skb);
 240                 return 0;
 241         }
 242
 243         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 244                             net, sk, skb, indev, dev,
 245                             ip6_finish_output,
 246                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 247 }
 248 EXPORT_SYMBOL(ip6_output);
 249
 250 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
 251 {
 252         if (!np->autoflowlabel_set)
 253                 return ip6_default_np_autolabel(net);
 254         else
 255                 return np->autoflowlabel;
 256 }
 257
 258 /*
 259  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 260  * Note : socket lock is not held for SYNACK packets, but might be modified
 261  * by calls to skb_set_owner_w() and ipv6_local_error(),
 262  * which are using proper atomic operations or spinlocks.
 263  */
 264 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 265              __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
 266 {
 267         struct net *net = sock_net(sk);
 268         const struct ipv6_pinfo *np = inet6_sk(sk);
 269         struct in6_addr *first_hop = &fl6->daddr;
 270         struct dst_entry *dst = skb_dst(skb);
 271         unsigned int head_room;
 272         struct ipv6hdr *hdr;
 273         u8  proto = fl6->flowi6_proto;
 274         int seg_len = skb->len;
 275         int hlimit = -1;
 276         u32 mtu;
 277
 278         head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 279         if (opt)
 280                 head_room += opt->opt_nflen + opt->opt_flen;
 281
 282         if (unlikely(skb_headroom(skb) < head_room)) {
 283                 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 284                 if (!skb2) {
 285                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 286                                       IPSTATS_MIB_OUTDISCARDS);
 287                         kfree_skb(skb);
 288                         return -ENOBUFS;
 289                 }
 290                 if (skb->sk)
 291                         skb_set_owner_w(skb2, skb->sk);
 292                 consume_skb(skb);
 293                 skb = skb2;
 294         }
 295
 296         if (opt) {
 297                 seg_len += opt->opt_nflen + opt->opt_flen;
 298
 299                 if (opt->opt_flen)
 300                         ipv6_push_frag_opts(skb, opt, &proto);
 301
 302                 if (opt->opt_nflen)
 303                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 304                                              &fl6->saddr);
 305         }
 306
 307         skb_push(skb, sizeof(struct ipv6hdr));
 308         skb_reset_network_header(skb);
 309         hdr = ipv6_hdr(skb);
 310
 311         /*
 312          *      Fill in the IPv6 header
 313          */
 314         if (np)
 315                 hlimit = np->hop_limit;
 316         if (hlimit < 0)
 317                 hlimit = ip6_dst_hoplimit(dst);
 318
 319         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 320                                 ip6_autoflowlabel(net, np), fl6));
 321
 322         hdr->payload_len = htons(seg_len);
 323         hdr->nexthdr = proto;
 324         hdr->hop_limit = hlimit;
 325
 326         hdr->saddr = fl6->saddr;
 327         hdr->daddr = *first_hop;
 328
 329         skb->protocol = htons(ETH_P_IPV6);
 330         skb->priority = priority;
 331         skb->mark = mark;
 332
 333         mtu = dst_mtu(dst);
 334         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 335                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 336                               IPSTATS_MIB_OUT, skb->len);
 337
 338                 /* if egress device is enslaved to an L3 master device pass the
 339                  * skb to its handler for processing
 340                  */
 341                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 342                 if (unlikely(!skb))
 343                         return 0;
 344
 345                 /* hooks should never assume socket lock is held.
 346                  * we promote our socket to non const
 347                  */
 348                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 349                                net, (struct sock *)sk, skb, NULL, dst->dev,
 350                                dst_output);
 351         }
 352
 353         skb->dev = dst->dev;
 354         /* ipv6_local_error() does not require socket lock,
 355          * we promote our socket to non const
 356          */
 357         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 358
 359         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 360         kfree_skb(skb);
 361         return -EMSGSIZE;
 362 }
 363 EXPORT_SYMBOL(ip6_xmit);
 364
 365 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 366 {
 367         struct ip6_ra_chain *ra;
 368         struct sock *last = NULL;
 369
 370         read_lock(&ip6_ra_lock);
 371         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 372                 struct sock *sk = ra->sk;
 373                 if (sk && ra->sel == sel &&
 374                     (!sk->sk_bound_dev_if ||
 375                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 376                         struct ipv6_pinfo *np = inet6_sk(sk);
 377
 378                         if (np && np->rtalert_isolate &&
 379                             !net_eq(sock_net(sk), dev_net(skb->dev))) {
 380                                 continue;
 381                         }
 382                         if (last) {
 383                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 384                                 if (skb2)
 385                                         rawv6_rcv(last, skb2);
 386                         }
 387                         last = sk;
 388                 }
 389         }
 390
 391         if (last) {
 392                 rawv6_rcv(last, skb);
 393                 read_unlock(&ip6_ra_lock);
 394                 return 1;
 395         }
 396         read_unlock(&ip6_ra_lock);
 397         return 0;
 398 }
 399
 400 static int ip6_forward_proxy_check(struct sk_buff *skb)
 401 {
 402         struct ipv6hdr *hdr = ipv6_hdr(skb);
 403         u8 nexthdr = hdr->nexthdr;
 404         __be16 frag_off;
 405         int offset;
 406
 407         if (ipv6_ext_hdr(nexthdr)) {
 408                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 409                 if (offset < 0)
 410                         return 0;
 411         } else
 412                 offset = sizeof(struct ipv6hdr);
 413
 414         if (nexthdr == IPPROTO_ICMPV6) {
 415                 struct icmp6hdr *icmp6;
 416
 417                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 418                                          offset + 1 - skb->data)))
 419                         return 0;
 420
 421                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 422
 423                 switch (icmp6->icmp6_type) {
 424                 case NDISC_ROUTER_SOLICITATION:
 425                 case NDISC_ROUTER_ADVERTISEMENT:
 426                 case NDISC_NEIGHBOUR_SOLICITATION:
 427                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 428                 case NDISC_REDIRECT:
 429                         /* For reaction involving unicast neighbor discovery
 430                          * message destined to the proxied address, pass it to
 431                          * input function.
 432                          */
 433                         return 1;
 434                 default:
 435                         break;
 436                 }
 437         }
 438
 439         /*
 440          * The proxying router can't forward traffic sent to a link-local
 441          * address, so signal the sender and discard the packet. This
 442          * behavior is clarified by the MIPv6 specification.
 443          */
 444         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 445                 dst_link_failure(skb);
 446                 return -1;
 447         }
 448
 449         return 0;
 450 }
 451
 452 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 453                                      struct sk_buff *skb)
 454 {
 455         struct dst_entry *dst = skb_dst(skb);
 456
 457         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 458         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 459
 460 #ifdef CONFIG_NET_SWITCHDEV
 461         if (skb->offload_l3_fwd_mark) {
 462                 consume_skb(skb);
 463                 return 0;
 464         }
 465 #endif
 466
 467         skb->tstamp = 0;
 468         return dst_output(net, sk, skb);
 469 }
 470
 471 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 472 {
 473         if (skb->len <= mtu)
 474                 return false;
 475
 476         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 477         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 478                 return true;
 479
 480         if (skb->ignore_df)
 481                 return false;
 482
 483         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 484                 return false;
 485
 486         return true;
 487 }
 488
 489 int ip6_forward(struct sk_buff *skb)
 490 {
 491         struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
 492         struct dst_entry *dst = skb_dst(skb);
 493         struct ipv6hdr *hdr = ipv6_hdr(skb);
 494         struct inet6_skb_parm *opt = IP6CB(skb);
 495         struct net *net = dev_net(dst->dev);
 496         u32 mtu;
 497
 498         if (net->ipv6.devconf_all->forwarding == 0)
 499                 goto error;
 500
 501         if (skb->pkt_type != PACKET_HOST)
 502                 goto drop;
 503
 504         if (unlikely(skb->sk))
 505                 goto drop;
 506
 507         if (skb_warn_if_lro(skb))
 508                 goto drop;
 509
 510         if (!net->ipv6.devconf_all->disable_policy &&
 511             !idev->cnf.disable_policy &&
 512             !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 513                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 514                 goto drop;
 515         }
 516
 517         skb_forward_csum(skb);
 518
 519         /*
 520          *      We DO NOT make any processing on
 521          *      RA packets, pushing them to user level AS IS
 522          *      without ane WARRANTY that application will be able
 523          *      to interpret them. The reason is that we
 524          *      cannot make anything clever here.
 525          *
 526          *      We are not end-node, so that if packet contains
 527          *      AH/ESP, we cannot make anything.
 528          *      Defragmentation also would be mistake, RA packets
 529          *      cannot be fragmented, because there is no warranty
 530          *      that different fragments will go along one path. --ANK
 531          */
 532         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 533                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 534                         return 0;
 535         }
 536
 537         /*
 538          *      check and decrement ttl
 539          */
 540         if (hdr->hop_limit <= 1) {
 541                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 542                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 543
 544                 kfree_skb(skb);
 545                 return -ETIMEDOUT;
 546         }
 547
 548         /* XXX: idev->cnf.proxy_ndp? */
 549         if (net->ipv6.devconf_all->proxy_ndp &&
 550             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 551                 int proxied = ip6_forward_proxy_check(skb);
 552                 if (proxied > 0) {
 553                         hdr->hop_limit--;
 554                         return ip6_input(skb);
 555                 } else if (proxied < 0) {
 556                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 557                         goto drop;
 558                 }
 559         }
 560
 561         if (!xfrm6_route_forward(skb)) {
 562                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 563                 goto drop;
 564         }
 565         dst = skb_dst(skb);
 566
 567         /* IPv6 specs say nothing about it, but it is clear that we cannot
 568            send redirects to source routed frames.
 569            We don't send redirects to frames decapsulated from IPsec.
 570          */
 571         if (IP6CB(skb)->iif == dst->dev->ifindex &&
 572             opt->srcrt == 0 && !skb_sec_path(skb)) {
 573                 struct in6_addr *target = NULL;
 574                 struct inet_peer *peer;
 575                 struct rt6_info *rt;
 576
 577                 /*
 578                  *      incoming and outgoing devices are the same
 579                  *      send a redirect.
 580                  */
 581
 582                 rt = (struct rt6_info *) dst;
 583                 if (rt->rt6i_flags & RTF_GATEWAY)
 584                         target = &rt->rt6i_gateway;
 585                 else
 586                         target = &hdr->daddr;
 587
 588                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 589
 590                 /* Limit redirects both by destination (here)
 591                    and by source (inside ndisc_send_redirect)
 592                  */
 593                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 594                         ndisc_send_redirect(skb, target);
 595                 if (peer)
 596                         inet_putpeer(peer);
 597         } else {
 598                 int addrtype = ipv6_addr_type(&hdr->saddr);
 599
 600                 /* This check is security critical. */
 601                 if (addrtype == IPV6_ADDR_ANY ||
 602                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 603                         goto error;
 604                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 605                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 606                                     ICMPV6_NOT_NEIGHBOUR, 0);
 607                         goto error;
 608                 }
 609         }
 610
 611         mtu = ip6_dst_mtu_forward(dst);
 612         if (mtu < IPV6_MIN_MTU)
 613                 mtu = IPV6_MIN_MTU;
 614
 615         if (ip6_pkt_too_big(skb, mtu)) {
 616                 /* Again, force OUTPUT device used as source address */
 617                 skb->dev = dst->dev;
 618                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 619                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
 620                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 621                                 IPSTATS_MIB_FRAGFAILS);
 622                 kfree_skb(skb);
 623                 return -EMSGSIZE;
 624         }
 625
 626         if (skb_cow(skb, dst->dev->hard_header_len)) {
 627                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 628                                 IPSTATS_MIB_OUTDISCARDS);
 629                 goto drop;
 630         }
 631
 632         hdr = ipv6_hdr(skb);
 633
 634         /* Mangling hops number delayed to point after skb COW */
 635
 636         hdr->hop_limit--;
 637
 638         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 639                        net, NULL, skb, skb->dev, dst->dev,
 640                        ip6_forward_finish);
 641
 642 error:
 643         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 644 drop:
 645         kfree_skb(skb);
 646         return -EINVAL;
 647 }
 648
 649 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 650 {
 651         to->pkt_type = from->pkt_type;
 652         to->priority = from->priority;
 653         to->protocol = from->protocol;
 654         skb_dst_drop(to);
 655         skb_dst_set(to, dst_clone(skb_dst(from)));
 656         to->dev = from->dev;
 657         to->mark = from->mark;
 658
 659         skb_copy_hash(to, from);
 660
 661 #ifdef CONFIG_NET_SCHED
 662         to->tc_index = from->tc_index;
 663 #endif
 664         nf_copy(to, from);
 665         skb_ext_copy(to, from);
 666         skb_copy_secmark(to, from);
 667 }
 668
 669 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
 670                       u8 nexthdr, __be32 frag_id,
 671                       struct ip6_fraglist_iter *iter)
 672 {
 673         unsigned int first_len;
 674         struct frag_hdr *fh;
 675
 676         /* BUILD HEADER */
 677         *prevhdr = NEXTHDR_FRAGMENT;
 678         iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 679         if (!iter->tmp_hdr)
 680                 return -ENOMEM;
 681
 682         iter->frag = skb_shinfo(skb)->frag_list;
 683         skb_frag_list_init(skb);
 684
 685         iter->offset = 0;
 686         iter->hlen = hlen;
 687         iter->frag_id = frag_id;
 688         iter->nexthdr = nexthdr;
 689
 690         __skb_pull(skb, hlen);
 691         fh = __skb_push(skb, sizeof(struct frag_hdr));
 692         __skb_push(skb, hlen);
 693         skb_reset_network_header(skb);
 694         memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
 695
 696         fh->nexthdr = nexthdr;
 697         fh->reserved = 0;
 698         fh->frag_off = htons(IP6_MF);
 699         fh->identification = frag_id;
 700
 701         first_len = skb_pagelen(skb);
 702         skb->data_len = first_len - skb_headlen(skb);
 703         skb->len = first_len;
 704         ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
 705
 706         return 0;
 707 }
 708 EXPORT_SYMBOL(ip6_fraglist_init);
 709
 710 void ip6_fraglist_prepare(struct sk_buff *skb,
 711                           struct ip6_fraglist_iter *iter)
 712 {
 713         struct sk_buff *frag = iter->frag;
 714         unsigned int hlen = iter->hlen;
 715         struct frag_hdr *fh;
 716
 717         frag->ip_summed = CHECKSUM_NONE;
 718         skb_reset_transport_header(frag);
 719         fh = __skb_push(frag, sizeof(struct frag_hdr));
 720         __skb_push(frag, hlen);
 721         skb_reset_network_header(frag);
 722         memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
 723         iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
 724         fh->nexthdr = iter->nexthdr;
 725         fh->reserved = 0;
 726         fh->frag_off = htons(iter->offset);
 727         if (frag->next)
 728                 fh->frag_off |= htons(IP6_MF);
 729         fh->identification = iter->frag_id;
 730         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 731         ip6_copy_metadata(frag, skb);
 732 }
 733 EXPORT_SYMBOL(ip6_fraglist_prepare);
 734
 735 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
 736                    unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
 737                    u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
 738 {
 739         state->prevhdr = prevhdr;
 740         state->nexthdr = nexthdr;
 741         state->frag_id = frag_id;
 742
 743         state->hlen = hlen;
 744         state->mtu = mtu;
 745
 746         state->left = skb->len - hlen;  /* Space per frame */
 747         state->ptr = hlen;              /* Where to start from */
 748
 749         state->hroom = hdr_room;
 750         state->troom = needed_tailroom;
 751
 752         state->offset = 0;
 753 }
 754 EXPORT_SYMBOL(ip6_frag_init);
 755
 756 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
 757 {
 758         u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
 759         struct sk_buff *frag;
 760         struct frag_hdr *fh;
 761         unsigned int len;
 762
 763         len = state->left;
 764         /* IF: it doesn't fit, use 'mtu' - the data space left */
 765         if (len > state->mtu)
 766                 len = state->mtu;
 767         /* IF: we are not sending up to and including the packet end
 768            then align the next start on an eight byte boundary */
 769         if (len < state->left)
 770                 len &= ~7;
 771
 772         /* Allocate buffer */
 773         frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
 774                          state->hroom + state->troom, GFP_ATOMIC);
 775         if (!frag)
 776                 return ERR_PTR(-ENOMEM);
 777
 778         /*
 779          *      Set up data on packet
 780          */
 781
 782         ip6_copy_metadata(frag, skb);
 783         skb_reserve(frag, state->hroom);
 784         skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
 785         skb_reset_network_header(frag);
 786         fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
 787         frag->transport_header = (frag->network_header + state->hlen +
 788                                   sizeof(struct frag_hdr));
 789
 790         /*
 791          *      Charge the memory for the fragment to any owner
 792          *      it might possess
 793          */
 794         if (skb->sk)
 795                 skb_set_owner_w(frag, skb->sk);
 796
 797         /*
 798          *      Copy the packet header into the new buffer.
 799          */
 800         skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
 801
 802         fragnexthdr_offset = skb_network_header(frag);
 803         fragnexthdr_offset += prevhdr - skb_network_header(skb);
 804         *fragnexthdr_offset = NEXTHDR_FRAGMENT;
 805
 806         /*
 807          *      Build fragment header.
 808          */
 809         fh->nexthdr = state->nexthdr;
 810         fh->reserved = 0;
 811         fh->identification = state->frag_id;
 812
 813         /*
 814          *      Copy a block of the IP datagram.
 815          */
 816         BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
 817                              len));
 818         state->left -= len;
 819
 820         fh->frag_off = htons(state->offset);
 821         if (state->left > 0)
 822                 fh->frag_off |= htons(IP6_MF);
 823         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 824
 825         state->ptr += len;
 826         state->offset += len;
 827
 828         return frag;
 829 }
 830 EXPORT_SYMBOL(ip6_frag_next);
 831
/*
 * ip6_fragment - split an IPv6 packet into fragments and hand each to @output
 * @net: network namespace, used for statistics and fragment id selection
 * @sk: socket passed through unchanged to @output
 * @skb: packet to fragment
 * @output: callback invoked once for every fragment produced
 *
 * Two strategies are used.  If @skb carries a frag_list whose geometry
 * already matches the fragment layout, the list members are sent as the
 * fragments (fast path).  Otherwise new skbs are built with ip6_frag_next()
 * and the payload is copied into them (slow path).
 *
 * Returns 0 once every fragment has been accepted by @output.  Otherwise
 * returns -EMSGSIZE when the packet may not or cannot be fragmented, or the
 * error reported by ip6_find_1stfragopt(), skb_checksum_help(),
 * ip6_fraglist_init(), ip6_frag_next() or @output.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        /* Per-socket IPv6 state is consulted only when the skb has a socket
         * and dev_recursion_level() reports zero.
         */
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        struct ip6_frag_state state;
        unsigned int mtu, hlen, nexthdr_offset;
        /* Saved so that every fragment is stamped with the original time. */
        ktime_t tstamp = skb->tstamp;
        int hroom, err = 0;
        __be32 frag_id;
        u8 *prevhdr, nexthdr = 0;

        /* A non-negative result is stored as hlen, the length of the headers
         * copied into every fragment; prevhdr is set to the "next header"
         * byte that the fragmentation helpers rewrite.
         */
        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;
        /* Remember prevhdr as an offset so it can be recomputed below. */
        nexthdr_offset = prevhdr - skb_network_header(skb);

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        /* A non-zero per-socket frag_size smaller than mtu overrides it. */
        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        /* Each fragment needs the copied headers, a fragment header and at
         * least 8 bytes of payload.
         */
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        /* From here on mtu is the payload capacity of one fragment. */
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        /* Resolve a pending partial checksum before the payload is split. */
        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        /* NOTE(review): prevhdr is rebuilt from the saved offset, presumably
         * because skb_checksum_help() may relocate the header - confirm.
         */
        prevhdr = skb_network_header(skb) + nexthdr_offset;
        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct ip6_fraglist_iter iter;
                struct sk_buff *frag2;

                /* The head skb becomes the first fragment: its payload must
                 * fit, be a multiple of 8 bytes, the skb must not be cloned
                 * and it needs headroom for the link-layer reserve plus a
                 * fragment header.
                 */
                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        /* Only the last list member (frag->next == NULL) may
                         * have a length that is not a multiple of 8.
                         */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        /* Give the list member the head's socket and move its
                         * truesize out of the head's accounting.
                         */
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
                                        &iter);
                if (err < 0)
                        goto fail;

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (iter.frag)
                                ip6_fraglist_prepare(skb, &iter);

                        skb->tstamp = tstamp;
                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        /* Stop on the first error or after the last member. */
                        if (err || !iter.frag)
                                break;

                        skb = ip6_fraglist_next(&iter);
                }

                /* Release the iterator's tmp_hdr buffer. */
                kfree(iter.tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                /* Drop the list members that were never handed to output(). */
                kfree_skb_list(iter.frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                return err;

slow_path_clean:
                /* Undo the ownership and truesize changes made to the list
                 * members visited before the offending one (frag).
                 */
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        /*
         *      Fragment the datagram.
         */

        ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
                      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
                      &state);

        /*
         *      Keep copying data until we run out.
         */

        while (state.left > 0) {
                /* ip6_frag_next() returns a new fragment or an ERR_PTR(). */
                frag = ip6_frag_next(skb, &state);
                if (IS_ERR(frag)) {
                        err = PTR_ERR(frag);
                        goto fail;
                }

                /*
                 *      Put this fragment into the sending queue.
                 */
                frag->tstamp = tstamp;
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        /* All payload has been copied out; the original is no longer needed. */
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

        /* Report the current value of mtu in an ICMPv6 "packet too big". */
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}
1014
1015 static inline int ip6_rt_check(const struct rt6key *rt_key,
1016                                const struct in6_addr *fl_addr,
1017                                const struct in6_addr *addr_cache)
1018 {
1019         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1020                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1021 }
1022
1023 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1024                                           struct dst_entry *dst,
1025                                           const struct flowi6 *fl6)
1026 {
1027         struct ipv6_pinfo *np = inet6_sk(sk);
1028         struct rt6_info *rt;
1029
1030         if (!dst)
1031                 goto out;
1032
1033         if (dst->ops->family != AF_INET6) {
1034                 dst_release(dst);
1035                 return NULL;
1036         }
1037
1038         rt = (struct rt6_info *)dst;
1039         /* Yes, checking route validity in not connected
1040          * case is not very simple. Take into account,
1041          * that we do not support routing by source, TOS,
1042          * and MSG_DONTROUTE            --ANK (980726)
1043          *
1044          * 1. ip6_rt_check(): If route was host route,
1045          *    check that cached destination is current.
1046          *    If it is network route, we still may
1047          *    check its validity using saved pointer
1048          *    to the last used address: daddr_cache.
1049          *    We do not want to save whole address now,
1050          *    (because main consumer of this service
1051          *    is tcp, which has not this problem),
1052          *    so that the last trick works only on connected
1053          *    sockets.
1054          * 2. oif also should be the same.
1055          */
1056         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1057 #ifdef CONFIG_IPV6_SUBTREES
1058             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1059 #endif
1060            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1061               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1062                 dst_release(dst);
1063                 dst = NULL;
1064         }
1065
1066 out:
1067         return dst;
1068 }
1069
/*
 * ip6_dst_lookup_tail - shared route lookup behind the ip6_dst_lookup*() helpers
 * @net: network namespace to look the route up in
 * @sk: socket supplying source address preferences; may be NULL
 * @dst: in/out route pointer; the callers visible in this file pass *dst == NULL
 * @fl6: flow to route; fl6->saddr is filled in when it is the any-address
 *
 * Returns 0 with *dst holding the selected route.  On failure returns a
 * negative errno after releasing *dst and setting it to NULL; -ENETUNREACH
 * additionally bumps IPSTATS_MIB_OUTNOROUTES.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr)) {
                struct fib6_info *from;
                struct rt6_info *rt;

                *dst = ip6_route_output(net, sk, fl6);
                /* A failed first lookup contributes no route to the
                 * source address selection below.
                 */
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

                /* rt->from is RCU protected; hold the read lock while the
                 * source address is chosen from it.
                 */
                rcu_read_lock();
                from = rt ? rcu_dereference(rt->from) : NULL;
                err = ip6_route_get_saddr(net, from, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                rcu_read_unlock();

                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if ((*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                /* Restrict the retry to the requested output interface. */
                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        /* Either no lookup has happened yet or the first one was discarded. */
        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        /* err is used as a flag here: non-zero when a neighbour entry
         * exists and is not in a NUD_VALID state.
         */
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                /* Drop the reference taken by ipv6_get_ifaddr(). */
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        /* Same flow, but with the destination cleared. */
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
                /* Without a redirect err is still -EINVAL here; that is
                 * harmless because the success path returns a literal 0.
                 */
        }
#endif
        /* A v4-mapped source is only allowed with a v4-mapped or
         * unspecified destination.
         */
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}
1185
1186 /**
1187  *      ip6_dst_lookup - perform route lookup on flow
1188  *      @net: Network namespace to perform lookup in
1189  *      @sk: socket which provides route info
1190  *      @dst: pointer to dst_entry * for result
1191  *      @fl6: flow to lookup
1192  *
1193  *      This function performs a route lookup on the given flow.
1194  *
1195  *      It returns zero on success, or a standard errno code on error.
1196  */
1197 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1198                    struct flowi6 *fl6)
1199 {
1200         *dst = NULL;
1201         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1202 }
1203 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1204
1205 /**
1206  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1207  *      @net: Network namespace to perform lookup in
1208  *      @sk: socket which provides route info
1209  *      @fl6: flow to lookup
1210  *      @final_dst: final destination address for ipsec lookup
1211  *
1212  *      This function performs a route lookup on the given flow.
1213  *
1214  *      It returns a valid dst pointer on success, or a pointer encoded
1215  *      error code.
1216  */
1217 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1218                                       const struct in6_addr *final_dst)
1219 {
1220         struct dst_entry *dst = NULL;
1221         int err;
1222
1223         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1224         if (err)
1225                 return ERR_PTR(err);
1226         if (final_dst)
1227                 fl6->daddr = *final_dst;
1228
1229         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1230 }
1231 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1232
1233 /**
1234  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1235  *      @sk: socket which provides the dst cache and route info
1236  *      @fl6: flow to lookup
1237  *      @final_dst: final destination address for ipsec lookup
1238  *      @connected: whether @sk is connected or not
1239  *
1240  *      This function performs a route lookup on the given flow with the
1241  *      possibility of using the cached route in the socket if it is valid.
1242  *      It will take the socket dst lock when operating on the dst cache.
1243  *      As a result, this function can only be used in process context.
1244  *
1245  *      In addition, for a connected socket, cache the dst in the socket
1246  *      if the current cache is not valid.
1247  *
1248  *      It returns a valid dst pointer on success, or a pointer encoded
1249  *      error code.
1250  */
1251 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1252                                          const struct in6_addr *final_dst,
1253                                          bool connected)
1254 {
1255         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1256
1257         dst = ip6_sk_dst_check(sk, dst, fl6);
1258         if (dst)
1259                 return dst;
1260
1261         dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1262         if (connected && !IS_ERR(dst))
1263                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1264
1265         return dst;
1266 }
1267 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1268
1269 /**
1270  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1271  *      @skb: Packet for which lookup is done
1272  *      @dev: Tunnel device
1273  *      @net: Network namespace of tunnel device
1274  *      @sock: Socket which provides route info
1275  *      @saddr: Memory to store the src ip address
1276  *      @info: Tunnel information
1277  *      @protocol: IP protocol
1278  *      @use_cache: Flag to enable cache usage
1279  *      This function performs a route lookup on a tunnel
1280  *
1281  *      It returns a valid dst pointer and stores src address to be used in
1282  *      tunnel in param saddr on success, else a pointer encoded error code.
1283  */
1284
1285 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1286                                         struct net_device *dev,
1287                                         struct net *net,
1288                                         struct socket *sock,
1289                                         struct in6_addr *saddr,
1290                                         const struct ip_tunnel_info *info,
1291                                         u8 protocol,
1292                                         bool use_cache)
1293 {
1294         struct dst_entry *dst = NULL;
1295 #ifdef CONFIG_DST_CACHE
1296         struct dst_cache *dst_cache;
1297 #endif
1298         struct flowi6 fl6;
1299         __u8 prio;
1300
1301 #ifdef CONFIG_DST_CACHE
1302         dst_cache = (struct dst_cache *)&info->dst_cache;
1303         if (use_cache) {
1304                 dst = dst_cache_get_ip6(dst_cache, saddr);
1305                 if (dst)
1306                         return dst;
1307         }
1308 #endif
1309         memset(&fl6, 0, sizeof(fl6));
1310         fl6.flowi6_mark = skb->mark;
1311         fl6.flowi6_proto = protocol;
1312         fl6.daddr = info->key.u.ipv6.dst;
1313         fl6.saddr = info->key.u.ipv6.src;
1314         prio = info->key.tos;
1315         fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
1316                                           info->key.label);
1317
1318         dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1319                                               NULL);
1320         if (IS_ERR(dst)) {
1321                 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1322                 return ERR_PTR(-ENETUNREACH);
1323         }
1324         if (dst->dev == dev) { /* is this necessary? */
1325                 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1326                 dst_release(dst);
1327                 return ERR_PTR(-ELOOP);
1328         }
1329 #ifdef CONFIG_DST_CACHE
1330         if (use_cache)
1331                 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1332 #endif
1333         *saddr = fl6.saddr;
1334         return dst;
1335 }
1336 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1337
1338 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1339                                                gfp_t gfp)
1340 {
1341         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1342 }
1343
1344 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1345                                                 gfp_t gfp)
1346 {
1347         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1348 }
1349
1350 static void ip6_append_data_mtu(unsigned int *mtu,
1351                                 int *maxfraglen,
1352                                 unsigned int fragheaderlen,
1353                                 struct sk_buff *skb,
1354                                 struct rt6_info *rt,
1355                                 unsigned int orig_mtu)
1356 {
1357         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1358                 if (!skb) {
1359                         /* first fragment, reserve header_len */
1360                         *mtu = orig_mtu - rt->dst.header_len;
1361
1362                 } else {
1363                         /*
1364                          * this fragment is not first, the headers
1365                          * space is regarded as data space.
1366                          */
1367                         *mtu = orig_mtu;
1368                 }
1369                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1370                               + fragheaderlen - sizeof(struct frag_hdr);
1371         }
1372 }
1373
1374 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1375                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1376                           struct rt6_info *rt, struct flowi6 *fl6)
1377 {
1378         struct ipv6_pinfo *np = inet6_sk(sk);
1379         unsigned int mtu;
1380         struct ipv6_txoptions *opt = ipc6->opt;
1381
1382         /*
1383          * setup for corking
1384          */
1385         if (opt) {
1386                 if (WARN_ON(v6_cork->opt))
1387                         return -EINVAL;
1388
1389                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1390                 if (unlikely(!v6_cork->opt))
1391                         return -ENOBUFS;
1392
1393                 v6_cork->opt->tot_len = sizeof(*opt);
1394                 v6_cork->opt->opt_flen = opt->opt_flen;
1395                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1396
1397                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1398                                                     sk->sk_allocation);
1399                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1400                         return -ENOBUFS;
1401
1402                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1403                                                     sk->sk_allocation);
1404                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1405                         return -ENOBUFS;
1406
1407                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1408                                                    sk->sk_allocation);
1409                 if (opt->hopopt && !v6_cork->opt->hopopt)
1410                         return -ENOBUFS;
1411
1412                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1413                                                     sk->sk_allocation);
1414                 if (opt->srcrt && !v6_cork->opt->srcrt)
1415                         return -ENOBUFS;
1416
1417                 /* need source address above miyazawa*/
1418         }
1419         dst_hold(&rt->dst);
1420         cork->base.dst = &rt->dst;
1421         cork->fl.u.ip6 = *fl6;
1422         v6_cork->hop_limit = ipc6->hlimit;
1423         v6_cork->tclass = ipc6->tclass;
1424         if (rt->dst.flags & DST_XFRM_TUNNEL)
1425                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1426                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1427         else
1428                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1429                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1430         if (np->frag_size < mtu) {
1431                 if (np->frag_size)
1432                         mtu = np->frag_size;
1433         }
1434         if (mtu < IPV6_MIN_MTU)
1435                 return -EINVAL;
1436         cork->base.fragsize = mtu;
1437         cork->base.gso_size = ipc6->gso_size;
1438         cork->base.tx_flags = 0;
1439         cork->base.mark = ipc6->sockc.mark;
1440         sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1441
1442         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1443                 cork->base.flags |= IPCORK_ALLFRAG;
1444         cork->base.length = 0;
1445
1446         cork->base.transmit_time = ipc6->sockc.transmit_time;
1447
1448         return 0;
1449 }
1450
1451 static int __ip6_append_data(struct sock *sk,
1452                              struct flowi6 *fl6,
1453                              struct sk_buff_head *queue,
1454                              struct inet_cork *cork,
1455                              struct inet6_cork *v6_cork,
1456                              struct page_frag *pfrag,
1457                              int getfrag(void *from, char *to, int offset,
1458                                          int len, int odd, struct sk_buff *skb),
1459                              void *from, int length, int transhdrlen,
1460                              unsigned int flags, struct ipcm6_cookie *ipc6)
1461 {
1462         struct sk_buff *skb, *skb_prev = NULL;
1463         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1464         struct ubuf_info *uarg = NULL;
1465         int exthdrlen = 0;
1466         int dst_exthdrlen = 0;
1467         int hh_len;
1468         int copy;
1469         int err;
1470         int offset = 0;
1471         u32 tskey = 0;
1472         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1473         struct ipv6_txoptions *opt = v6_cork->opt;
1474         int csummode = CHECKSUM_NONE;
1475         unsigned int maxnonfragsize, headersize;
1476         unsigned int wmem_alloc_delta = 0;
1477         bool paged, extra_uref = false;
1478
1479         skb = skb_peek_tail(queue);
1480         if (!skb) {
1481                 exthdrlen = opt ? opt->opt_flen : 0;
1482                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1483         }
1484
1485         paged = !!cork->gso_size;
1486         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1487         orig_mtu = mtu;
1488
1489         if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1490             sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1491                 tskey = sk->sk_tskey++;
1492
1493         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1494
1495         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1496                         (opt ? opt->opt_nflen : 0);
1497         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1498                      sizeof(struct frag_hdr);
1499
1500         headersize = sizeof(struct ipv6hdr) +
1501                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1502                      (dst_allfrag(&rt->dst) ?
1503                       sizeof(struct frag_hdr) : 0) +
1504                      rt->rt6i_nfheader_len;
1505
1506         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1507          * the first fragment
1508          */
1509         if (headersize + transhdrlen > mtu)
1510                 goto emsgsize;
1511
1512         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1513             (sk->sk_protocol == IPPROTO_UDP ||
1514              sk->sk_protocol == IPPROTO_RAW)) {
1515                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1516                                 sizeof(struct ipv6hdr));
1517                 goto emsgsize;
1518         }
1519
1520         if (ip6_sk_ignore_df(sk))
1521                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1522         else
1523                 maxnonfragsize = mtu;
1524
1525         if (cork->length + length > maxnonfragsize - headersize) {
1526 emsgsize:
1527                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1528                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1529                 return -EMSGSIZE;
1530         }
1531
1532         /* CHECKSUM_PARTIAL only with no extension headers and when
1533          * we are not going to fragment
1534          */
1535         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1536             headersize == sizeof(struct ipv6hdr) &&
1537             length <= mtu - headersize &&
1538             (!(flags & MSG_MORE) || cork->gso_size) &&
1539             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1540                 csummode = CHECKSUM_PARTIAL;
1541
1542         if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1543                 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1544                 if (!uarg)
1545                         return -ENOBUFS;
1546                 extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1547                 if (rt->dst.dev->features & NETIF_F_SG &&
1548                     csummode == CHECKSUM_PARTIAL) {
1549                         paged = true;
1550                 } else {
1551                         uarg->zerocopy = 0;
1552                         skb_zcopy_set(skb, uarg, &extra_uref);
1553                 }
1554         }
1555
1556         /*
1557          * Let's try using as much space as possible.
1558          * Use MTU if total length of the message fits into the MTU.
1559          * Otherwise, we need to reserve fragment header and
1560          * fragment alignment (= 8-15 octects, in total).
1561          *
1562          * Note that we may need to "move" the data from the tail
1563          * of the buffer to the new fragment when we split
1564          * the message.
1565          *
1566          * FIXME: It may be fragmented into multiple chunks
1567          *        at once if non-fragmentable extension headers
1568          *        are too large.
1569          * --yoshfuji
1570          */
1571
1572         cork->length += length;
1573         if (!skb)
1574                 goto alloc_new_skb;
1575
1576         while (length > 0) {
1577                 /* Check if the remaining data fits into current packet. */
1578                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1579                 if (copy < length)
1580                         copy = maxfraglen - skb->len;
1581
1582                 if (copy <= 0) {
1583                         char *data;
1584                         unsigned int datalen;
1585                         unsigned int fraglen;
1586                         unsigned int fraggap;
1587                         unsigned int alloclen, alloc_extra;
1588                         unsigned int pagedlen;
1589 alloc_new_skb:
1590                         /* There's no room in the current skb */
1591                         if (skb)
1592                                 fraggap = skb->len - maxfraglen;
1593                         else
1594                                 fraggap = 0;
1595                         /* update mtu and maxfraglen if necessary */
1596                         if (!skb || !skb_prev)
1597                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1598                                                     fragheaderlen, skb, rt,
1599                                                     orig_mtu);
1600
1601                         skb_prev = skb;
1602
1603                         /*
1604                          * If remaining data exceeds the mtu,
1605                          * we know we need more fragment(s).
1606                          */
1607                         datalen = length + fraggap;
1608
1609                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1610                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1611                         fraglen = datalen + fragheaderlen;
1612                         pagedlen = 0;
1613
1614                         alloc_extra = hh_len;
1615                         alloc_extra += dst_exthdrlen;
1616                         alloc_extra += rt->dst.trailer_len;
1617
1618                         /* We just reserve space for fragment header.
1619                          * Note: this may be overallocation if the message
1620                          * (without MSG_MORE) fits into the MTU.
1621                          */
1622                         alloc_extra += sizeof(struct frag_hdr);
1623
1624                         if ((flags & MSG_MORE) &&
1625                             !(rt->dst.dev->features&NETIF_F_SG))
1626                                 alloclen = mtu;
1627                         else if (!paged &&
1628                                  (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1629                                   !(rt->dst.dev->features & NETIF_F_SG)))
1630                                 alloclen = fraglen;
1631                         else {
1632                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1633                                 pagedlen = fraglen - alloclen;
1634                         }
1635                         alloclen += alloc_extra;
1636
1637                         if (datalen != length + fraggap) {
1638                                 /*
1639                                  * this is not the last fragment, the trailer
1640                                  * space is regarded as data space.
1641                                  */
1642                                 datalen += rt->dst.trailer_len;
1643                         }
1644
1645                         fraglen = datalen + fragheaderlen;
1646
1647                         copy = datalen - transhdrlen - fraggap - pagedlen;
1648                         if (copy < 0) {
1649                                 err = -EINVAL;
1650                                 goto error;
1651                         }
1652                         if (transhdrlen) {
1653                                 skb = sock_alloc_send_skb(sk, alloclen,
1654                                                 (flags & MSG_DONTWAIT), &err);
1655                         } else {
1656                                 skb = NULL;
1657                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1658                                     2 * sk->sk_sndbuf)
1659                                         skb = alloc_skb(alloclen,
1660                                                         sk->sk_allocation);
1661                                 if (unlikely(!skb))
1662                                         err = -ENOBUFS;
1663                         }
1664                         if (!skb)
1665                                 goto error;
1666                         /*
1667                          *      Fill in the control structures
1668                          */
1669                         skb->protocol = htons(ETH_P_IPV6);
1670                         skb->ip_summed = csummode;
1671                         skb->csum = 0;
1672                         /* reserve for fragmentation and ipsec header */
1673                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1674                                     dst_exthdrlen);
1675
1676                         /*
1677                          *      Find where to start putting bytes
1678                          */
1679                         data = skb_put(skb, fraglen - pagedlen);
1680                         skb_set_network_header(skb, exthdrlen);
1681                         data += fragheaderlen;
1682                         skb->transport_header = (skb->network_header +
1683                                                  fragheaderlen);
1684                         if (fraggap) {
1685                                 skb->csum = skb_copy_and_csum_bits(
1686                                         skb_prev, maxfraglen,
1687                                         data + transhdrlen, fraggap);
1688                                 skb_prev->csum = csum_sub(skb_prev->csum,
1689                                                           skb->csum);
1690                                 data += fraggap;
1691                                 pskb_trim_unique(skb_prev, maxfraglen);
1692                         }
1693                         if (copy > 0 &&
1694                             getfrag(from, data + transhdrlen, offset,
1695                                     copy, fraggap, skb) < 0) {
1696                                 err = -EFAULT;
1697                                 kfree_skb(skb);
1698                                 goto error;
1699                         }
1700
1701                         offset += copy;
1702                         length -= copy + transhdrlen;
1703                         transhdrlen = 0;
1704                         exthdrlen = 0;
1705                         dst_exthdrlen = 0;
1706
1707                         /* Only the initial fragment is time stamped */
1708                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1709                         cork->tx_flags = 0;
1710                         skb_shinfo(skb)->tskey = tskey;
1711                         tskey = 0;
1712                         skb_zcopy_set(skb, uarg, &extra_uref);
1713
1714                         if ((flags & MSG_CONFIRM) && !skb_prev)
1715                                 skb_set_dst_pending_confirm(skb, 1);
1716
1717                         /*
1718                          * Put the packet on the pending queue
1719                          */
1720                         if (!skb->destructor) {
1721                                 skb->destructor = sock_wfree;
1722                                 skb->sk = sk;
1723                                 wmem_alloc_delta += skb->truesize;
1724                         }
1725                         __skb_queue_tail(queue, skb);
1726                         continue;
1727                 }
1728
1729                 if (copy > length)
1730                         copy = length;
1731
1732                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1733                     skb_tailroom(skb) >= copy) {
1734                         unsigned int off;
1735
1736                         off = skb->len;
1737                         if (getfrag(from, skb_put(skb, copy),
1738                                                 offset, copy, off, skb) < 0) {
1739                                 __skb_trim(skb, off);
1740                                 err = -EFAULT;
1741                                 goto error;
1742                         }
1743                 } else if (!uarg || !uarg->zerocopy) {
1744                         int i = skb_shinfo(skb)->nr_frags;
1745
1746                         err = -ENOMEM;
1747                         if (!sk_page_frag_refill(sk, pfrag))
1748                                 goto error;
1749
1750                         if (!skb_can_coalesce(skb, i, pfrag->page,
1751                                               pfrag->offset)) {
1752                                 err = -EMSGSIZE;
1753                                 if (i == MAX_SKB_FRAGS)
1754                                         goto error;
1755
1756                                 __skb_fill_page_desc(skb, i, pfrag->page,
1757                                                      pfrag->offset, 0);
1758                                 skb_shinfo(skb)->nr_frags = ++i;
1759                                 get_page(pfrag->page);
1760                         }
1761                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1762                         if (getfrag(from,
1763                                     page_address(pfrag->page) + pfrag->offset,
1764                                     offset, copy, skb->len, skb) < 0)
1765                                 goto error_efault;
1766
1767                         pfrag->offset += copy;
1768                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1769                         skb->len += copy;
1770                         skb->data_len += copy;
1771                         skb->truesize += copy;
1772                         wmem_alloc_delta += copy;
1773                 } else {
1774                         err = skb_zerocopy_iter_dgram(skb, from, copy);
1775                         if (err < 0)
1776                                 goto error;
1777                 }
1778                 offset += copy;
1779                 length -= copy;
1780         }
1781
1782         if (wmem_alloc_delta)
1783                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1784         return 0;
1785
1786 error_efault:
1787         err = -EFAULT;
1788 error:
1789         net_zcopy_put_abort(uarg, extra_uref);
1790         cork->length -= length;
1791         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1792         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1793         return err;
1794 }
1795
1796 int ip6_append_data(struct sock *sk,
1797                     int getfrag(void *from, char *to, int offset, int len,
1798                                 int odd, struct sk_buff *skb),
1799                     void *from, int length, int transhdrlen,
1800                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1801                     struct rt6_info *rt, unsigned int flags)
1802 {
1803         struct inet_sock *inet = inet_sk(sk);
1804         struct ipv6_pinfo *np = inet6_sk(sk);
1805         int exthdrlen;
1806         int err;
1807
1808         if (flags&MSG_PROBE)
1809                 return 0;
1810         if (skb_queue_empty(&sk->sk_write_queue)) {
1811                 /*
1812                  * setup for corking
1813                  */
1814                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1815                                      ipc6, rt, fl6);
1816                 if (err)
1817                         return err;
1818
1819                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1820                 length += exthdrlen;
1821                 transhdrlen += exthdrlen;
1822         } else {
1823                 fl6 = &inet->cork.fl.u.ip6;
1824                 transhdrlen = 0;
1825         }
1826
1827         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1828                                  &np->cork, sk_page_frag(sk), getfrag,
1829                                  from, length, transhdrlen, flags, ipc6);
1830 }
1831 EXPORT_SYMBOL_GPL(ip6_append_data);
1832
1833 static void ip6_cork_release(struct inet_cork_full *cork,
1834                              struct inet6_cork *v6_cork)
1835 {
1836         if (v6_cork->opt) {
1837                 kfree(v6_cork->opt->dst0opt);
1838                 kfree(v6_cork->opt->dst1opt);
1839                 kfree(v6_cork->opt->hopopt);
1840                 kfree(v6_cork->opt->srcrt);
1841                 kfree(v6_cork->opt);
1842                 v6_cork->opt = NULL;
1843         }
1844
1845         if (cork->base.dst) {
1846                 dst_release(cork->base.dst);
1847                 cork->base.dst = NULL;
1848                 cork->base.flags &= ~IPCORK_ALLFRAG;
1849         }
1850         memset(&cork->fl, 0, sizeof(cork->fl));
1851 }
1852
1853 struct sk_buff *__ip6_make_skb(struct sock *sk,
1854                                struct sk_buff_head *queue,
1855                                struct inet_cork_full *cork,
1856                                struct inet6_cork *v6_cork)
1857 {
1858         struct sk_buff *skb, *tmp_skb;
1859         struct sk_buff **tail_skb;
1860         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1861         struct ipv6_pinfo *np = inet6_sk(sk);
1862         struct net *net = sock_net(sk);
1863         struct ipv6hdr *hdr;
1864         struct ipv6_txoptions *opt = v6_cork->opt;
1865         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1866         struct flowi6 *fl6 = &cork->fl.u.ip6;
1867         unsigned char proto = fl6->flowi6_proto;
1868
1869         skb = __skb_dequeue(queue);
1870         if (!skb)
1871                 goto out;
1872         tail_skb = &(skb_shinfo(skb)->frag_list);
1873
1874         /* move skb->data to ip header from ext header */
1875         if (skb->data < skb_network_header(skb))
1876                 __skb_pull(skb, skb_network_offset(skb));
1877         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1878                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1879                 *tail_skb = tmp_skb;
1880                 tail_skb = &(tmp_skb->next);
1881                 skb->len += tmp_skb->len;
1882                 skb->data_len += tmp_skb->len;
1883                 skb->truesize += tmp_skb->truesize;
1884                 tmp_skb->destructor = NULL;
1885                 tmp_skb->sk = NULL;
1886         }
1887
1888         /* Allow local fragmentation. */
1889         skb->ignore_df = ip6_sk_ignore_df(sk);
1890
1891         *final_dst = fl6->daddr;
1892         __skb_pull(skb, skb_network_header_len(skb));
1893         if (opt && opt->opt_flen)
1894                 ipv6_push_frag_opts(skb, opt, &proto);
1895         if (opt && opt->opt_nflen)
1896                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1897
1898         skb_push(skb, sizeof(struct ipv6hdr));
1899         skb_reset_network_header(skb);
1900         hdr = ipv6_hdr(skb);
1901
1902         ip6_flow_hdr(hdr, v6_cork->tclass,
1903                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1904                                         ip6_autoflowlabel(net, np), fl6));
1905         hdr->hop_limit = v6_cork->hop_limit;
1906         hdr->nexthdr = proto;
1907         hdr->saddr = fl6->saddr;
1908         hdr->daddr = *final_dst;
1909
1910         skb->priority = sk->sk_priority;
1911         skb->mark = cork->base.mark;
1912
1913         skb->tstamp = cork->base.transmit_time;
1914
1915         skb_dst_set(skb, dst_clone(&rt->dst));
1916         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1917         if (proto == IPPROTO_ICMPV6) {
1918                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1919
1920                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1921                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1922         }
1923
1924         ip6_cork_release(cork, v6_cork);
1925 out:
1926         return skb;
1927 }
1928
1929 int ip6_send_skb(struct sk_buff *skb)
1930 {
1931         struct net *net = sock_net(skb->sk);
1932         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1933         int err;
1934
1935         err = ip6_local_out(net, skb->sk, skb);
1936         if (err) {
1937                 if (err > 0)
1938                         err = net_xmit_errno(err);
1939                 if (err)
1940                         IP6_INC_STATS(net, rt->rt6i_idev,
1941                                       IPSTATS_MIB_OUTDISCARDS);
1942         }
1943
1944         return err;
1945 }
1946
/**
 * ip6_push_pending_frames - build and send whatever is pending on @sk
 * @sk: socket with pending data
 *
 * Return: 0 if ip6_finish_skb() yields no skb, otherwise the result of
 * ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1958
1959 static void __ip6_flush_pending_frames(struct sock *sk,
1960                                        struct sk_buff_head *queue,
1961                                        struct inet_cork_full *cork,
1962                                        struct inet6_cork *v6_cork)
1963 {
1964         struct sk_buff *skb;
1965
1966         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1967                 if (skb_dst(skb))
1968                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1969                                       IPSTATS_MIB_OUTDISCARDS);
1970                 kfree_skb(skb);
1971         }
1972
1973         ip6_cork_release(cork, v6_cork);
1974 }
1975
1976 void ip6_flush_pending_frames(struct sock *sk)
1977 {
1978         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1979                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1980 }
1981 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1982
1983 struct sk_buff *ip6_make_skb(struct sock *sk,
1984                              int getfrag(void *from, char *to, int offset,
1985                                          int len, int odd, struct sk_buff *skb),
1986                              void *from, int length, int transhdrlen,
1987                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1988                              struct rt6_info *rt, unsigned int flags,
1989                              struct inet_cork_full *cork)
1990 {
1991         struct inet6_cork v6_cork;
1992         struct sk_buff_head queue;
1993         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1994         int err;
1995
1996         if (flags & MSG_PROBE)
1997                 return NULL;
1998
1999         __skb_queue_head_init(&queue);
2000
2001         cork->base.flags = 0;
2002         cork->base.addr = 0;
2003         cork->base.opt = NULL;
2004         cork->base.dst = NULL;
2005         v6_cork.opt = NULL;
2006         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
2007         if (err) {
2008                 ip6_cork_release(cork, &v6_cork);
2009                 return ERR_PTR(err);
2010         }
2011         if (ipc6->dontfrag < 0)
2012                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2013
2014         err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
2015                                 &current->task_frag, getfrag, from,
2016                                 length + exthdrlen, transhdrlen + exthdrlen,
2017                                 flags, ipc6);
2018         if (err) {
2019                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2020                 return ERR_PTR(err);
2021         }
2022
2023         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2024 }