net/ipv6/ip6_output.c
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
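
/*
 * Note: rt6_nexthop() returns the route's gateway address when the route
 * has one and otherwise the packet's own destination, so the nd_tbl lookup
 * above resolves the link-layer next hop for gatewayed and on-link routes
 * alike; __neigh_create() allocates the neighbour entry on a cache miss.
 */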

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}
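
/*
 * Fragmentation is taken above when any of three conditions holds: the
 * packet exceeds the destination MTU and is not GSO, the route carries the
 * allfrag feature (set when a Packet Too Big below the 1280-byte IPv6
 * minimum MTU was received, after which every packet must carry a fragment
 * header), or netfilter defragmentation recorded a smaller frag_max_size.
 */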

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
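
/*
 * ip6_output() is the dst_output() entry point for locally generated
 * packets.  NF_HOOK_COND skips the POST_ROUTING hook when IP6SKB_REROUTED
 * is set, so a packet that has been rerouted (e.g. by an xfrm policy
 * lookup after NAT) does not traverse POST_ROUTING twice.
 */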

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the socket may
 * still be modified by calls to skb_set_owner_w() and ipv6_local_error(),
 * which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		 * MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
			 * so it is safe to call in our context (socket lock not held)
			 */
			skb_set_owner_w(skb, (struct sock *)sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);

		/* if the egress device is enslaved to an L3 master device,
		 * pass the skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume the socket lock is held;
		 * we promote our socket to non-const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require the socket lock,
	 * so we promote our socket to non-const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
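
/*
 * Usage sketch (illustrative only): a connection-oriented caller builds its
 * transport header into skb, describes the flow in a flowi6 and hands the
 * packet down, e.g.
 *
 *	res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt),
 *		       np->tclass);
 *
 * which is roughly how TCP's inet6_csk_xmit() drives this function.
 */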

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
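
/*
 * ip6_call_ra_chain() delivers a Router Alert packet to every matching raw
 * socket: each listener but the last receives a clone, the original skb
 * goes to the final listener, and a return value of 1 tells the caller the
 * packet has been consumed.
 */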

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* A unicast neighbour discovery message destined to
			 * the proxied address requires a reaction from us,
			 * so pass it to the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}
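
/*
 * Example (illustrative): a 1400-byte forwarded packet checked against a
 * 1280-byte route MTU is too big unless the sender set ignore_df, or the
 * skb is GSO and every segment it will produce fits within 1280 bytes.
 */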

int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do NOT do any processing on RA packets: they are pushed
	 *	to user level AS IS, without any warranty that the
	 *	application will be able to interpret them. The reason is
	 *	that we cannot make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains AH/ESP we
	 *	cannot do anything with it. Defragmentation would also be a
	 *	mistake: RA packets must not be fragmented, because there is
	 *	no guarantee that different fragments will travel along the
	 *	same path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement the hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force the OUTPUT device to be used for the source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	 * cannot send redirects for source-routed frames, nor for frames
	 * decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same,
		 *	so send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force the OUTPUT device to be used for the
		 * source address
		 */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop count is delayed to the point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb was not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);
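
	/*
	 * Worked example (assuming a 1500-byte MTU and no extension
	 * headers, i.e. hlen == 40): after reserving the 8-byte fragment
	 * header the payload budget is 1500 - 40 - 8 = 1452 bytes, and
	 * non-final fragments are later rounded down to a multiple of
	 * eight, giving 1448 bytes of payload per fragment.
	 */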

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes down.
			 */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* If it doesn't fit, use 'mtu', the data space left */
		if (len > mtu)
			len = mtu;
		/* If we are not sending up to and including the packet end,
		 * then align the next start on an eight byte boundary.
		 */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
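
/*
 * Summary: ip6_fragment() has two strategies.  The fast path reuses an
 * existing frag_list whose members already have fragment-sized, 8-byte
 * aligned geometry, prepending a fragment header to each in place; the
 * slow path allocates a fresh skb per fragment and copies the payload.
 */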

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the unconnected case is not
	 * very simple. Take into account that we do not support routing
	 * by source, TOS, and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif should also be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here, if the dst entry we've looked up has a neighbour entry
	 * that is in the INCOMPLETE state and the src address from the
	 * flow is marked as OPTIMISTIC, we release the found dst entry
	 * and replace it instead with the dst entry of the nexthop router.
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
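
/*
 * Sketch of a typical caller (illustrative only): datagram code fills in a
 * flowi6, performs the lookup and unwraps the pointer-encoded error, e.g.
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */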

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}
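
/*
 * Both helpers rely on the IPv6 extension header length encoding (RFC
 * 8200): hdrlen counts 8-octet units beyond the first 8 octets, so the
 * total size is (hdrlen + 1) * 8 bytes; e.g. hdrlen == 2 means 24 bytes.
 */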

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not the first, so the header
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above --miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = sk->sk_type == SOCK_DGRAM &&
			      sk->sk_protocol == IPPROTO_UDP ? ipc6->gso_size : 0;

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}

static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6,
			     const struct sockcm_cookie *sockc)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);
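
	/*
	 * Worked example (assuming mtu == 1500 and no extension headers,
	 * so fragheaderlen == 40): maxfraglen = ((1500 - 40) & ~7) + 40 - 8
	 * = 1488, i.e. each fragment may carry 1448 bytes of payload once
	 * the 8-byte fragment header is added back.
	 */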
1277
1278         headersize = sizeof(struct ipv6hdr) +
1279                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1280                      (dst_allfrag(&rt->dst) ?
1281                       sizeof(struct frag_hdr) : 0) +
1282                      rt->rt6i_nfheader_len;
1283
1284         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1285          * the first fragment
1286          */
1287         if (headersize + transhdrlen > mtu)
1288                 goto emsgsize;
1289
1290         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1291             (sk->sk_protocol == IPPROTO_UDP ||
1292              sk->sk_protocol == IPPROTO_RAW)) {
1293                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1294                                 sizeof(struct ipv6hdr));
1295                 goto emsgsize;
1296         }
1297
1298         if (ip6_sk_ignore_df(sk))
1299                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1300         else
1301                 maxnonfragsize = mtu;
1302
1303         if (cork->length + length > maxnonfragsize - headersize) {
1304 emsgsize:
1305                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1306                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1307                 return -EMSGSIZE;
1308         }
1309
1310         /* CHECKSUM_PARTIAL only with no extension headers and when
1311          * we are not going to fragment
1312          */
1313         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1314             headersize == sizeof(struct ipv6hdr) &&
1315             length <= mtu - headersize &&
1316             (!(flags & MSG_MORE) || cork->gso_size) &&
1317             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1318                 csummode = CHECKSUM_PARTIAL;
1319
1320         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1321                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1322                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1323                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1324                         tskey = sk->sk_tskey++;
1325         }
1326
1327         /*
1328          * Let's try using as much space as possible.
1329          * Use MTU if total length of the message fits into the MTU.
1330          * Otherwise, we need to reserve fragment header and
1331          * fragment alignment (= 8-15 octects, in total).
1332          *
1333          * Note that we may need to "move" the data from the tail of
1334          * of the buffer to the new fragment when we split
1335          * the message.
1336          *
1337          * FIXME: It may be fragmented into multiple chunks
1338          *        at once if non-fragmentable extension headers
1339          *        are too large.
1340          * --yoshfuji
1341          */
1342
1343         cork->length += length;
1344         if (!skb)
1345                 goto alloc_new_skb;
1346
1347         while (length > 0) {
1348                 /* Check if the remaining data fits into current packet. */
1349                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1350                 if (copy < length)
1351                         copy = maxfraglen - skb->len;
1352
1353                 if (copy <= 0) {
1354                         char *data;
1355                         unsigned int datalen;
1356                         unsigned int fraglen;
1357                         unsigned int fraggap;
1358                         unsigned int alloclen;
1359                         unsigned int pagedlen = 0;
1360 alloc_new_skb:
1361                         /* There's no room in the current skb */
1362                         if (skb)
1363                                 fraggap = skb->len - maxfraglen;
1364                         else
1365                                 fraggap = 0;
1366                         /* update mtu and maxfraglen if necessary */
1367                         if (!skb || !skb_prev)
1368                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1369                                                     fragheaderlen, skb, rt,
1370                                                     orig_mtu);
1371
1372                         skb_prev = skb;
1373
1374                         /*
1375                          * If remaining data exceeds the mtu,
1376                          * we know we need more fragment(s).
1377                          */
1378                         datalen = length + fraggap;
1379
1380                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1381                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1382                         fraglen = datalen + fragheaderlen;
1383
1384                         if ((flags & MSG_MORE) &&
1385                             !(rt->dst.dev->features&NETIF_F_SG))
1386                                 alloclen = mtu;
1387                         else if (!paged)
1388                                 alloclen = fraglen;
1389                         else {
1390                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1391                                 pagedlen = fraglen - alloclen;
1392                         }

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
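			/*
			 * The first skb of a message (transhdrlen != 0) is
			 * charged to the socket and may sleep for sndbuf
			 * space; later fragments use a bare alloc_skb()
			 * behind a cheap 2 * sk_sndbuf check, with the
			 * truesize accumulated in wmem_alloc_delta and
			 * committed in one refcount_add() at the end.
			 */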
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
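			/*
			 * Move the overshoot (fraggap) from the previous
			 * skb into this one while keeping both checksums
			 * consistent: the moved bytes' partial csum is
			 * folded in here and subtracted from skb_prev
			 * before it is trimmed back to maxfraglen.
			 */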
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

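		/*
		 * Room remains in the current skb: copy into linear
		 * tailroom when the device cannot do scatter-gather,
		 * otherwise append to (or coalesce with) page fragments
		 * from the per-socket page_frag allocator.
		 */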
		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		}
		offset += copy;
		length -= copy;
	}

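	/*
	 * Commit the write-memory charge accumulated above in a single
	 * atomic update rather than once per skb or page.
	 */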
	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

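/*
 * Append data to a corked socket. The first call on an empty write queue
 * sets up the cork from ipc6/rt/fl6; later calls (e.g. under MSG_MORE)
 * reuse the flow stored in the cork. The queued data is later emitted by
 * ip6_push_pending_frames() or dropped by ip6_flush_pending_frames().
 */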
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags,
		    const struct sockcm_cookie *sockc)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6, sockc);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
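	/*
	 * Chain every remaining skb on the queue into the first skb's
	 * frag_list, turning the corked queue into one large packet
	 * that ip6_fragment() can split on output if necessary.
	 */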
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
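	/*
	 * opt_flen covers the fragmentable options, opt_nflen the
	 * non-fragmentable ones (hop-by-hop, routing header and the
	 * destination options preceding it). A routing header rewrites
	 * the wire destination, which is why final_dst is passed down
	 * by reference and then used for hdr->daddr below.
	 */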
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

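/*
 * Hand a finished datagram to the output path. Positive NET_XMIT_* codes
 * from ip6_local_out() are mapped to 0 or an errno by net_xmit_errno();
 * anything still counting as an error is recorded as an output discard.
 */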
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

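/*
 * Build a complete datagram on a private queue in one shot, without
 * touching sk_write_queue or the socket cork; used for uncorked sends
 * where no data needs to persist between calls. Returns the finished
 * skb, ready for ip6_send_skb(), or an ERR_PTR on failure.
 */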
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork,
			     const struct sockcm_cookie *sockc)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6, sockc);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}