Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
[linux-2.6-microblaze.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64         struct dst_entry *dst = skb_dst(skb);
65         struct net_device *dev = dst->dev;
66         struct neighbour *neigh;
67         struct in6_addr *nexthop;
68         int ret;
69
70         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72
73                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74                     ((mroute6_is_socket(net, skb) &&
75                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77                                          &ipv6_hdr(skb)->saddr))) {
78                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79
80                         /* Do not check for IFF_ALLMULTI; multicast routing
81                            is not supported in any case.
82                          */
83                         if (newskb)
84                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85                                         net, sk, newskb, NULL, newskb->dev,
86                                         dev_loopback_xmit);
87
88                         if (ipv6_hdr(skb)->hop_limit == 0) {
89                                 IP6_INC_STATS(net, idev,
90                                               IPSTATS_MIB_OUTDISCARDS);
91                                 kfree_skb(skb);
92                                 return 0;
93                         }
94                 }
95
96                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97
98                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99                     IPV6_ADDR_SCOPE_NODELOCAL &&
100                     !(dev->flags & IFF_LOOPBACK)) {
101                         kfree_skb(skb);
102                         return 0;
103                 }
104         }
105
106         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107                 int res = lwtunnel_xmit(skb);
108
109                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110                         return res;
111         }
112
113         rcu_read_lock_bh();
114         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116         if (unlikely(!neigh))
117                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118         if (!IS_ERR(neigh)) {
119                 sock_confirm_neigh(skb, neigh);
120                 ret = neigh_output(neigh, skb);
121                 rcu_read_unlock_bh();
122                 return ret;
123         }
124         rcu_read_unlock_bh();
125
126         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127         kfree_skb(skb);
128         return -EINVAL;
129 }
130
131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132 {
133         int ret;
134
135         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136         if (ret) {
137                 kfree_skb(skb);
138                 return ret;
139         }
140
141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142         /* Policy lookup after SNAT yielded a new policy */
143         if (skb_dst(skb)->xfrm) {
144                 IPCB(skb)->flags |= IPSKB_REROUTED;
145                 return dst_output(net, sk, skb);
146         }
147 #endif
148
149         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150             dst_allfrag(skb_dst(skb)) ||
151             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
153         else
154                 return ip6_finish_output2(net, sk, skb);
155 }
156
157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158 {
159         struct net_device *dev = skb_dst(skb)->dev;
160         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161
162         skb->protocol = htons(ETH_P_IPV6);
163         skb->dev = dev;
164
165         if (unlikely(idev->cnf.disable_ipv6)) {
166                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167                 kfree_skb(skb);
168                 return 0;
169         }
170
171         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172                             net, sk, skb, NULL, dev,
173                             ip6_finish_output,
174                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175 }
176
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179         if (!np->autoflowlabel_set)
180                 return ip6_default_np_autolabel(net);
181         else
182                 return np->autoflowlabel;
183 }
184
185 /*
186  * xmit an sk_buff (used by TCP, SCTP and DCCP)
187  * Note : socket lock is not held for SYNACK packets, but might be modified
188  * by calls to skb_set_owner_w() and ipv6_local_error(),
189  * which are using proper atomic operations or spinlocks.
190  */
191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192              __u32 mark, struct ipv6_txoptions *opt, int tclass)
193 {
194         struct net *net = sock_net(sk);
195         const struct ipv6_pinfo *np = inet6_sk(sk);
196         struct in6_addr *first_hop = &fl6->daddr;
197         struct dst_entry *dst = skb_dst(skb);
198         unsigned int head_room;
199         struct ipv6hdr *hdr;
200         u8  proto = fl6->flowi6_proto;
201         int seg_len = skb->len;
202         int hlimit = -1;
203         u32 mtu;
204
205         head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
206         if (opt)
207                 head_room += opt->opt_nflen + opt->opt_flen;
208
209         if (unlikely(skb_headroom(skb) < head_room)) {
210                 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
211                 if (!skb2) {
212                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
213                                       IPSTATS_MIB_OUTDISCARDS);
214                         kfree_skb(skb);
215                         return -ENOBUFS;
216                 }
217                 if (skb->sk)
218                         skb_set_owner_w(skb2, skb->sk);
219                 consume_skb(skb);
220                 skb = skb2;
221         }
222
223         if (opt) {
224                 seg_len += opt->opt_nflen + opt->opt_flen;
225
226                 if (opt->opt_flen)
227                         ipv6_push_frag_opts(skb, opt, &proto);
228
229                 if (opt->opt_nflen)
230                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
231                                              &fl6->saddr);
232         }
233
234         skb_push(skb, sizeof(struct ipv6hdr));
235         skb_reset_network_header(skb);
236         hdr = ipv6_hdr(skb);
237
238         /*
239          *      Fill in the IPv6 header
240          */
241         if (np)
242                 hlimit = np->hop_limit;
243         if (hlimit < 0)
244                 hlimit = ip6_dst_hoplimit(dst);
245
246         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
247                                 ip6_autoflowlabel(net, np), fl6));
248
249         hdr->payload_len = htons(seg_len);
250         hdr->nexthdr = proto;
251         hdr->hop_limit = hlimit;
252
253         hdr->saddr = fl6->saddr;
254         hdr->daddr = *first_hop;
255
256         skb->protocol = htons(ETH_P_IPV6);
257         skb->priority = sk->sk_priority;
258         skb->mark = mark;
259
260         mtu = dst_mtu(dst);
261         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
262                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
263                               IPSTATS_MIB_OUT, skb->len);
264
265                 /* if egress device is enslaved to an L3 master device pass the
266                  * skb to its handler for processing
267                  */
268                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
269                 if (unlikely(!skb))
270                         return 0;
271
272                 /* hooks should never assume socket lock is held.
273                  * we promote our socket to non const
274                  */
275                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
276                                net, (struct sock *)sk, skb, NULL, dst->dev,
277                                dst_output);
278         }
279
280         skb->dev = dst->dev;
281         /* ipv6_local_error() does not require socket lock,
282          * we promote our socket to non const
283          */
284         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
285
286         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
287         kfree_skb(skb);
288         return -EMSGSIZE;
289 }
290 EXPORT_SYMBOL(ip6_xmit);
291
292 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
293 {
294         struct ip6_ra_chain *ra;
295         struct sock *last = NULL;
296
297         read_lock(&ip6_ra_lock);
298         for (ra = ip6_ra_chain; ra; ra = ra->next) {
299                 struct sock *sk = ra->sk;
300                 if (sk && ra->sel == sel &&
301                     (!sk->sk_bound_dev_if ||
302                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
303                         struct ipv6_pinfo *np = inet6_sk(sk);
304
305                         if (np && np->rtalert_isolate &&
306                             !net_eq(sock_net(sk), dev_net(skb->dev))) {
307                                 continue;
308                         }
309                         if (last) {
310                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
311                                 if (skb2)
312                                         rawv6_rcv(last, skb2);
313                         }
314                         last = sk;
315                 }
316         }
317
318         if (last) {
319                 rawv6_rcv(last, skb);
320                 read_unlock(&ip6_ra_lock);
321                 return 1;
322         }
323         read_unlock(&ip6_ra_lock);
324         return 0;
325 }
326
327 static int ip6_forward_proxy_check(struct sk_buff *skb)
328 {
329         struct ipv6hdr *hdr = ipv6_hdr(skb);
330         u8 nexthdr = hdr->nexthdr;
331         __be16 frag_off;
332         int offset;
333
334         if (ipv6_ext_hdr(nexthdr)) {
335                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
336                 if (offset < 0)
337                         return 0;
338         } else
339                 offset = sizeof(struct ipv6hdr);
340
341         if (nexthdr == IPPROTO_ICMPV6) {
342                 struct icmp6hdr *icmp6;
343
344                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
345                                          offset + 1 - skb->data)))
346                         return 0;
347
348                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
349
350                 switch (icmp6->icmp6_type) {
351                 case NDISC_ROUTER_SOLICITATION:
352                 case NDISC_ROUTER_ADVERTISEMENT:
353                 case NDISC_NEIGHBOUR_SOLICITATION:
354                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
355                 case NDISC_REDIRECT:
356                         /* For reaction involving unicast neighbor discovery
357                          * message destined to the proxied address, pass it to
358                          * input function.
359                          */
360                         return 1;
361                 default:
362                         break;
363                 }
364         }
365
366         /*
367          * The proxying router can't forward traffic sent to a link-local
368          * address, so signal the sender and discard the packet. This
369          * behavior is clarified by the MIPv6 specification.
370          */
371         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
372                 dst_link_failure(skb);
373                 return -1;
374         }
375
376         return 0;
377 }
378
379 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
380                                      struct sk_buff *skb)
381 {
382         struct dst_entry *dst = skb_dst(skb);
383
384         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
385         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
386
387 #ifdef CONFIG_NET_SWITCHDEV
388         if (skb->offload_l3_fwd_mark) {
389                 consume_skb(skb);
390                 return 0;
391         }
392 #endif
393
394         skb->tstamp = 0;
395         return dst_output(net, sk, skb);
396 }
397
398 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
399 {
400         if (skb->len <= mtu)
401                 return false;
402
403         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
404         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
405                 return true;
406
407         if (skb->ignore_df)
408                 return false;
409
410         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
411                 return false;
412
413         return true;
414 }
415
416 int ip6_forward(struct sk_buff *skb)
417 {
418         struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
419         struct dst_entry *dst = skb_dst(skb);
420         struct ipv6hdr *hdr = ipv6_hdr(skb);
421         struct inet6_skb_parm *opt = IP6CB(skb);
422         struct net *net = dev_net(dst->dev);
423         u32 mtu;
424
425         if (net->ipv6.devconf_all->forwarding == 0)
426                 goto error;
427
428         if (skb->pkt_type != PACKET_HOST)
429                 goto drop;
430
431         if (unlikely(skb->sk))
432                 goto drop;
433
434         if (skb_warn_if_lro(skb))
435                 goto drop;
436
437         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
438                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
439                 goto drop;
440         }
441
442         skb_forward_csum(skb);
443
444         /*
445          *      We DO NOT make any processing on
446          *      RA packets, pushing them to user level AS IS
447          *      without ane WARRANTY that application will be able
448          *      to interpret them. The reason is that we
449          *      cannot make anything clever here.
450          *
451          *      We are not end-node, so that if packet contains
452          *      AH/ESP, we cannot make anything.
453          *      Defragmentation also would be mistake, RA packets
454          *      cannot be fragmented, because there is no warranty
455          *      that different fragments will go along one path. --ANK
456          */
457         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
458                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
459                         return 0;
460         }
461
462         /*
463          *      check and decrement ttl
464          */
465         if (hdr->hop_limit <= 1) {
466                 /* Force OUTPUT device used as source address */
467                 skb->dev = dst->dev;
468                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
469                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
470
471                 kfree_skb(skb);
472                 return -ETIMEDOUT;
473         }
474
475         /* XXX: idev->cnf.proxy_ndp? */
476         if (net->ipv6.devconf_all->proxy_ndp &&
477             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
478                 int proxied = ip6_forward_proxy_check(skb);
479                 if (proxied > 0)
480                         return ip6_input(skb);
481                 else if (proxied < 0) {
482                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
483                         goto drop;
484                 }
485         }
486
487         if (!xfrm6_route_forward(skb)) {
488                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
489                 goto drop;
490         }
491         dst = skb_dst(skb);
492
493         /* IPv6 specs say nothing about it, but it is clear that we cannot
494            send redirects to source routed frames.
495            We don't send redirects to frames decapsulated from IPsec.
496          */
497         if (IP6CB(skb)->iif == dst->dev->ifindex &&
498             opt->srcrt == 0 && !skb_sec_path(skb)) {
499                 struct in6_addr *target = NULL;
500                 struct inet_peer *peer;
501                 struct rt6_info *rt;
502
503                 /*
504                  *      incoming and outgoing devices are the same
505                  *      send a redirect.
506                  */
507
508                 rt = (struct rt6_info *) dst;
509                 if (rt->rt6i_flags & RTF_GATEWAY)
510                         target = &rt->rt6i_gateway;
511                 else
512                         target = &hdr->daddr;
513
514                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
515
516                 /* Limit redirects both by destination (here)
517                    and by source (inside ndisc_send_redirect)
518                  */
519                 if (inet_peer_xrlim_allow(peer, 1*HZ))
520                         ndisc_send_redirect(skb, target);
521                 if (peer)
522                         inet_putpeer(peer);
523         } else {
524                 int addrtype = ipv6_addr_type(&hdr->saddr);
525
526                 /* This check is security critical. */
527                 if (addrtype == IPV6_ADDR_ANY ||
528                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
529                         goto error;
530                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
531                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
532                                     ICMPV6_NOT_NEIGHBOUR, 0);
533                         goto error;
534                 }
535         }
536
537         mtu = ip6_dst_mtu_forward(dst);
538         if (mtu < IPV6_MIN_MTU)
539                 mtu = IPV6_MIN_MTU;
540
541         if (ip6_pkt_too_big(skb, mtu)) {
542                 /* Again, force OUTPUT device used as source address */
543                 skb->dev = dst->dev;
544                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
545                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
546                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
547                                 IPSTATS_MIB_FRAGFAILS);
548                 kfree_skb(skb);
549                 return -EMSGSIZE;
550         }
551
552         if (skb_cow(skb, dst->dev->hard_header_len)) {
553                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
554                                 IPSTATS_MIB_OUTDISCARDS);
555                 goto drop;
556         }
557
558         hdr = ipv6_hdr(skb);
559
560         /* Mangling hops number delayed to point after skb COW */
561
562         hdr->hop_limit--;
563
564         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
565                        net, NULL, skb, skb->dev, dst->dev,
566                        ip6_forward_finish);
567
568 error:
569         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
570 drop:
571         kfree_skb(skb);
572         return -EINVAL;
573 }
574
575 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
576 {
577         to->pkt_type = from->pkt_type;
578         to->priority = from->priority;
579         to->protocol = from->protocol;
580         skb_dst_drop(to);
581         skb_dst_set(to, dst_clone(skb_dst(from)));
582         to->dev = from->dev;
583         to->mark = from->mark;
584
585         skb_copy_hash(to, from);
586
587 #ifdef CONFIG_NET_SCHED
588         to->tc_index = from->tc_index;
589 #endif
590         nf_copy(to, from);
591         skb_ext_copy(to, from);
592         skb_copy_secmark(to, from);
593 }
594
595 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
596                  int (*output)(struct net *, struct sock *, struct sk_buff *))
597 {
598         struct sk_buff *frag;
599         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
600         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
601                                 inet6_sk(skb->sk) : NULL;
602         struct ipv6hdr *tmp_hdr;
603         struct frag_hdr *fh;
604         unsigned int mtu, hlen, left, len;
605         int hroom, troom;
606         __be32 frag_id;
607         int ptr, offset = 0, err = 0;
608         u8 *prevhdr, nexthdr = 0;
609
610         err = ip6_find_1stfragopt(skb, &prevhdr);
611         if (err < 0)
612                 goto fail;
613         hlen = err;
614         nexthdr = *prevhdr;
615
616         mtu = ip6_skb_dst_mtu(skb);
617
618         /* We must not fragment if the socket is set to force MTU discovery
619          * or if the skb it not generated by a local socket.
620          */
621         if (unlikely(!skb->ignore_df && skb->len > mtu))
622                 goto fail_toobig;
623
624         if (IP6CB(skb)->frag_max_size) {
625                 if (IP6CB(skb)->frag_max_size > mtu)
626                         goto fail_toobig;
627
628                 /* don't send fragments larger than what we received */
629                 mtu = IP6CB(skb)->frag_max_size;
630                 if (mtu < IPV6_MIN_MTU)
631                         mtu = IPV6_MIN_MTU;
632         }
633
634         if (np && np->frag_size < mtu) {
635                 if (np->frag_size)
636                         mtu = np->frag_size;
637         }
638         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
639                 goto fail_toobig;
640         mtu -= hlen + sizeof(struct frag_hdr);
641
642         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
643                                     &ipv6_hdr(skb)->saddr);
644
645         if (skb->ip_summed == CHECKSUM_PARTIAL &&
646             (err = skb_checksum_help(skb)))
647                 goto fail;
648
649         hroom = LL_RESERVED_SPACE(rt->dst.dev);
650         if (skb_has_frag_list(skb)) {
651                 unsigned int first_len = skb_pagelen(skb);
652                 struct sk_buff *frag2;
653
654                 if (first_len - hlen > mtu ||
655                     ((first_len - hlen) & 7) ||
656                     skb_cloned(skb) ||
657                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
658                         goto slow_path;
659
660                 skb_walk_frags(skb, frag) {
661                         /* Correct geometry. */
662                         if (frag->len > mtu ||
663                             ((frag->len & 7) && frag->next) ||
664                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
665                                 goto slow_path_clean;
666
667                         /* Partially cloned skb? */
668                         if (skb_shared(frag))
669                                 goto slow_path_clean;
670
671                         BUG_ON(frag->sk);
672                         if (skb->sk) {
673                                 frag->sk = skb->sk;
674                                 frag->destructor = sock_wfree;
675                         }
676                         skb->truesize -= frag->truesize;
677                 }
678
679                 err = 0;
680                 offset = 0;
681                 /* BUILD HEADER */
682
683                 *prevhdr = NEXTHDR_FRAGMENT;
684                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
685                 if (!tmp_hdr) {
686                         err = -ENOMEM;
687                         goto fail;
688                 }
689                 frag = skb_shinfo(skb)->frag_list;
690                 skb_frag_list_init(skb);
691
692                 __skb_pull(skb, hlen);
693                 fh = __skb_push(skb, sizeof(struct frag_hdr));
694                 __skb_push(skb, hlen);
695                 skb_reset_network_header(skb);
696                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
697
698                 fh->nexthdr = nexthdr;
699                 fh->reserved = 0;
700                 fh->frag_off = htons(IP6_MF);
701                 fh->identification = frag_id;
702
703                 first_len = skb_pagelen(skb);
704                 skb->data_len = first_len - skb_headlen(skb);
705                 skb->len = first_len;
706                 ipv6_hdr(skb)->payload_len = htons(first_len -
707                                                    sizeof(struct ipv6hdr));
708
709                 for (;;) {
710                         /* Prepare header of the next frame,
711                          * before previous one went down. */
712                         if (frag) {
713                                 frag->ip_summed = CHECKSUM_NONE;
714                                 skb_reset_transport_header(frag);
715                                 fh = __skb_push(frag, sizeof(struct frag_hdr));
716                                 __skb_push(frag, hlen);
717                                 skb_reset_network_header(frag);
718                                 memcpy(skb_network_header(frag), tmp_hdr,
719                                        hlen);
720                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
721                                 fh->nexthdr = nexthdr;
722                                 fh->reserved = 0;
723                                 fh->frag_off = htons(offset);
724                                 if (frag->next)
725                                         fh->frag_off |= htons(IP6_MF);
726                                 fh->identification = frag_id;
727                                 ipv6_hdr(frag)->payload_len =
728                                                 htons(frag->len -
729                                                       sizeof(struct ipv6hdr));
730                                 ip6_copy_metadata(frag, skb);
731                         }
732
733                         err = output(net, sk, skb);
734                         if (!err)
735                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
736                                               IPSTATS_MIB_FRAGCREATES);
737
738                         if (err || !frag)
739                                 break;
740
741                         skb = frag;
742                         frag = skb->next;
743                         skb_mark_not_on_list(skb);
744                 }
745
746                 kfree(tmp_hdr);
747
748                 if (err == 0) {
749                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
750                                       IPSTATS_MIB_FRAGOKS);
751                         return 0;
752                 }
753
754                 kfree_skb_list(frag);
755
756                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
757                               IPSTATS_MIB_FRAGFAILS);
758                 return err;
759
760 slow_path_clean:
761                 skb_walk_frags(skb, frag2) {
762                         if (frag2 == frag)
763                                 break;
764                         frag2->sk = NULL;
765                         frag2->destructor = NULL;
766                         skb->truesize += frag2->truesize;
767                 }
768         }
769
770 slow_path:
771         left = skb->len - hlen;         /* Space per frame */
772         ptr = hlen;                     /* Where to start from */
773
774         /*
775          *      Fragment the datagram.
776          */
777
778         troom = rt->dst.dev->needed_tailroom;
779
780         /*
781          *      Keep copying data until we run out.
782          */
783         while (left > 0)        {
784                 u8 *fragnexthdr_offset;
785
786                 len = left;
787                 /* IF: it doesn't fit, use 'mtu' - the data space left */
788                 if (len > mtu)
789                         len = mtu;
790                 /* IF: we are not sending up to and including the packet end
791                    then align the next start on an eight byte boundary */
792                 if (len < left) {
793                         len &= ~7;
794                 }
795
796                 /* Allocate buffer */
797                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
798                                  hroom + troom, GFP_ATOMIC);
799                 if (!frag) {
800                         err = -ENOMEM;
801                         goto fail;
802                 }
803
804                 /*
805                  *      Set up data on packet
806                  */
807
808                 ip6_copy_metadata(frag, skb);
809                 skb_reserve(frag, hroom);
810                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
811                 skb_reset_network_header(frag);
812                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
813                 frag->transport_header = (frag->network_header + hlen +
814                                           sizeof(struct frag_hdr));
815
816                 /*
817                  *      Charge the memory for the fragment to any owner
818                  *      it might possess
819                  */
820                 if (skb->sk)
821                         skb_set_owner_w(frag, skb->sk);
822
823                 /*
824                  *      Copy the packet header into the new buffer.
825                  */
826                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
827
828                 fragnexthdr_offset = skb_network_header(frag);
829                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
830                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
831
832                 /*
833                  *      Build fragment header.
834                  */
835                 fh->nexthdr = nexthdr;
836                 fh->reserved = 0;
837                 fh->identification = frag_id;
838
839                 /*
840                  *      Copy a block of the IP datagram.
841                  */
842                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
843                                      len));
844                 left -= len;
845
846                 fh->frag_off = htons(offset);
847                 if (left > 0)
848                         fh->frag_off |= htons(IP6_MF);
849                 ipv6_hdr(frag)->payload_len = htons(frag->len -
850                                                     sizeof(struct ipv6hdr));
851
852                 ptr += len;
853                 offset += len;
854
855                 /*
856                  *      Put this fragment into the sending queue.
857                  */
858                 err = output(net, sk, frag);
859                 if (err)
860                         goto fail;
861
862                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
863                               IPSTATS_MIB_FRAGCREATES);
864         }
865         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
866                       IPSTATS_MIB_FRAGOKS);
867         consume_skb(skb);
868         return err;
869
870 fail_toobig:
871         if (skb->sk && dst_allfrag(skb_dst(skb)))
872                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
873
874         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
875         err = -EMSGSIZE;
876
877 fail:
878         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
879                       IPSTATS_MIB_FRAGFAILS);
880         kfree_skb(skb);
881         return err;
882 }
883
884 static inline int ip6_rt_check(const struct rt6key *rt_key,
885                                const struct in6_addr *fl_addr,
886                                const struct in6_addr *addr_cache)
887 {
888         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
889                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
890 }
891
892 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
893                                           struct dst_entry *dst,
894                                           const struct flowi6 *fl6)
895 {
896         struct ipv6_pinfo *np = inet6_sk(sk);
897         struct rt6_info *rt;
898
899         if (!dst)
900                 goto out;
901
902         if (dst->ops->family != AF_INET6) {
903                 dst_release(dst);
904                 return NULL;
905         }
906
907         rt = (struct rt6_info *)dst;
908         /* Yes, checking route validity in not connected
909          * case is not very simple. Take into account,
910          * that we do not support routing by source, TOS,
911          * and MSG_DONTROUTE            --ANK (980726)
912          *
913          * 1. ip6_rt_check(): If route was host route,
914          *    check that cached destination is current.
915          *    If it is network route, we still may
916          *    check its validity using saved pointer
917          *    to the last used address: daddr_cache.
918          *    We do not want to save whole address now,
919          *    (because main consumer of this service
920          *    is tcp, which has not this problem),
921          *    so that the last trick works only on connected
922          *    sockets.
923          * 2. oif also should be the same.
924          */
925         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
926 #ifdef CONFIG_IPV6_SUBTREES
927             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
928 #endif
929            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
930               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
931                 dst_release(dst);
932                 dst = NULL;
933         }
934
935 out:
936         return dst;
937 }
938
939 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
940                                struct dst_entry **dst, struct flowi6 *fl6)
941 {
942 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
943         struct neighbour *n;
944         struct rt6_info *rt;
945 #endif
946         int err;
947         int flags = 0;
948
949         /* The correct way to handle this would be to do
950          * ip6_route_get_saddr, and then ip6_route_output; however,
951          * the route-specific preferred source forces the
952          * ip6_route_output call _before_ ip6_route_get_saddr.
953          *
954          * In source specific routing (no src=any default route),
955          * ip6_route_output will fail given src=any saddr, though, so
956          * that's why we try it again later.
957          */
958         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
959                 struct fib6_info *from;
960                 struct rt6_info *rt;
961                 bool had_dst = *dst != NULL;
962
963                 if (!had_dst)
964                         *dst = ip6_route_output(net, sk, fl6);
965                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
966
967                 rcu_read_lock();
968                 from = rt ? rcu_dereference(rt->from) : NULL;
969                 err = ip6_route_get_saddr(net, from, &fl6->daddr,
970                                           sk ? inet6_sk(sk)->srcprefs : 0,
971                                           &fl6->saddr);
972                 rcu_read_unlock();
973
974                 if (err)
975                         goto out_err_release;
976
977                 /* If we had an erroneous initial result, pretend it
978                  * never existed and let the SA-enabled version take
979                  * over.
980                  */
981                 if (!had_dst && (*dst)->error) {
982                         dst_release(*dst);
983                         *dst = NULL;
984                 }
985
986                 if (fl6->flowi6_oif)
987                         flags |= RT6_LOOKUP_F_IFACE;
988         }
989
990         if (!*dst)
991                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
992
993         err = (*dst)->error;
994         if (err)
995                 goto out_err_release;
996
997 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
998         /*
999          * Here if the dst entry we've looked up
1000          * has a neighbour entry that is in the INCOMPLETE
1001          * state and the src address from the flow is
1002          * marked as OPTIMISTIC, we release the found
1003          * dst entry and replace it instead with the
1004          * dst entry of the nexthop router
1005          */
1006         rt = (struct rt6_info *) *dst;
1007         rcu_read_lock_bh();
1008         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1009                                       rt6_nexthop(rt, &fl6->daddr));
1010         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1011         rcu_read_unlock_bh();
1012
1013         if (err) {
1014                 struct inet6_ifaddr *ifp;
1015                 struct flowi6 fl_gw6;
1016                 int redirect;
1017
1018                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1019                                       (*dst)->dev, 1);
1020
1021                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1022                 if (ifp)
1023                         in6_ifa_put(ifp);
1024
1025                 if (redirect) {
1026                         /*
1027                          * We need to get the dst entry for the
1028                          * default router instead
1029                          */
1030                         dst_release(*dst);
1031                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1032                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1033                         *dst = ip6_route_output(net, sk, &fl_gw6);
1034                         err = (*dst)->error;
1035                         if (err)
1036                                 goto out_err_release;
1037                 }
1038         }
1039 #endif
1040         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1041             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1042                 err = -EAFNOSUPPORT;
1043                 goto out_err_release;
1044         }
1045
1046         return 0;
1047
1048 out_err_release:
1049         dst_release(*dst);
1050         *dst = NULL;
1051
1052         if (err == -ENETUNREACH)
1053                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1054         return err;
1055 }
1056
1057 /**
1058  *      ip6_dst_lookup - perform route lookup on flow
1059  *      @sk: socket which provides route info
1060  *      @dst: pointer to dst_entry * for result
1061  *      @fl6: flow to lookup
1062  *
1063  *      This function performs a route lookup on the given flow.
1064  *
1065  *      It returns zero on success, or a standard errno code on error.
1066  */
1067 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1068                    struct flowi6 *fl6)
1069 {
1070         *dst = NULL;
1071         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1072 }
1073 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1074
1075 /**
1076  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1077  *      @sk: socket which provides route info
1078  *      @fl6: flow to lookup
1079  *      @final_dst: final destination address for ipsec lookup
1080  *
1081  *      This function performs a route lookup on the given flow.
1082  *
1083  *      It returns a valid dst pointer on success, or a pointer encoded
1084  *      error code.
1085  */
1086 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1087                                       const struct in6_addr *final_dst)
1088 {
1089         struct dst_entry *dst = NULL;
1090         int err;
1091
1092         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1093         if (err)
1094                 return ERR_PTR(err);
1095         if (final_dst)
1096                 fl6->daddr = *final_dst;
1097
1098         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1099 }
1100 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1101
1102 /**
1103  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1104  *      @sk: socket which provides the dst cache and route info
1105  *      @fl6: flow to lookup
1106  *      @final_dst: final destination address for ipsec lookup
1107  *      @connected: whether @sk is connected or not
1108  *
1109  *      This function performs a route lookup on the given flow with the
1110  *      possibility of using the cached route in the socket if it is valid.
1111  *      It will take the socket dst lock when operating on the dst cache.
1112  *      As a result, this function can only be used in process context.
1113  *
1114  *      In addition, for a connected socket, cache the dst in the socket
1115  *      if the current cache is not valid.
1116  *
1117  *      It returns a valid dst pointer on success, or a pointer encoded
1118  *      error code.
1119  */
1120 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1121                                          const struct in6_addr *final_dst,
1122                                          bool connected)
1123 {
1124         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1125
1126         dst = ip6_sk_dst_check(sk, dst, fl6);
1127         if (dst)
1128                 return dst;
1129
1130         dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1131         if (connected && !IS_ERR(dst))
1132                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1133
1134         return dst;
1135 }
1136 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1137
1138 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1139                                                gfp_t gfp)
1140 {
1141         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1142 }
1143
1144 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1145                                                 gfp_t gfp)
1146 {
1147         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1148 }
1149
1150 static void ip6_append_data_mtu(unsigned int *mtu,
1151                                 int *maxfraglen,
1152                                 unsigned int fragheaderlen,
1153                                 struct sk_buff *skb,
1154                                 struct rt6_info *rt,
1155                                 unsigned int orig_mtu)
1156 {
1157         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1158                 if (!skb) {
1159                         /* first fragment, reserve header_len */
1160                         *mtu = orig_mtu - rt->dst.header_len;
1161
1162                 } else {
1163                         /*
1164                          * this fragment is not first, the headers
1165                          * space is regarded as data space.
1166                          */
1167                         *mtu = orig_mtu;
1168                 }
1169                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1170                               + fragheaderlen - sizeof(struct frag_hdr);
1171         }
1172 }
1173
1174 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1175                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1176                           struct rt6_info *rt, struct flowi6 *fl6)
1177 {
1178         struct ipv6_pinfo *np = inet6_sk(sk);
1179         unsigned int mtu;
1180         struct ipv6_txoptions *opt = ipc6->opt;
1181
1182         /*
1183          * setup for corking
1184          */
1185         if (opt) {
1186                 if (WARN_ON(v6_cork->opt))
1187                         return -EINVAL;
1188
1189                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1190                 if (unlikely(!v6_cork->opt))
1191                         return -ENOBUFS;
1192
1193                 v6_cork->opt->tot_len = sizeof(*opt);
1194                 v6_cork->opt->opt_flen = opt->opt_flen;
1195                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1196
1197                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1198                                                     sk->sk_allocation);
1199                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1200                         return -ENOBUFS;
1201
1202                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1203                                                     sk->sk_allocation);
1204                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1205                         return -ENOBUFS;
1206
1207                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1208                                                    sk->sk_allocation);
1209                 if (opt->hopopt && !v6_cork->opt->hopopt)
1210                         return -ENOBUFS;
1211
1212                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1213                                                     sk->sk_allocation);
1214                 if (opt->srcrt && !v6_cork->opt->srcrt)
1215                         return -ENOBUFS;
1216
1217                 /* need source address above miyazawa*/
1218         }
1219         dst_hold(&rt->dst);
1220         cork->base.dst = &rt->dst;
1221         cork->fl.u.ip6 = *fl6;
1222         v6_cork->hop_limit = ipc6->hlimit;
1223         v6_cork->tclass = ipc6->tclass;
1224         if (rt->dst.flags & DST_XFRM_TUNNEL)
1225                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1226                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1227         else
1228                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1229                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1230         if (np->frag_size < mtu) {
1231                 if (np->frag_size)
1232                         mtu = np->frag_size;
1233         }
1234         if (mtu < IPV6_MIN_MTU)
1235                 return -EINVAL;
1236         cork->base.fragsize = mtu;
1237         cork->base.gso_size = ipc6->gso_size;
1238         cork->base.tx_flags = 0;
1239         sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1240
1241         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1242                 cork->base.flags |= IPCORK_ALLFRAG;
1243         cork->base.length = 0;
1244
1245         cork->base.transmit_time = ipc6->sockc.transmit_time;
1246
1247         return 0;
1248 }
1249
/*
 *      __ip6_append_data - append data to a queue of pending skbs
 *
 *      Appends @length bytes to the skbs on @queue.  Payload bytes are
 *      fetched through @getfrag with @from as its first argument.  Data
 *      goes into the last skb on @queue while it has room; otherwise a
 *      new skb is allocated, filled and added to the tail of @queue.
 *
 *      @transhdrlen bytes are counted against @length but are not
 *      fetched through @getfrag: in the first skb allocated by a call
 *      the payload is written @transhdrlen bytes past the network
 *      headers (presumably so the caller can fill in the transport
 *      header later -- confirm against the callers).
 *
 *      Returns 0 on success.  -EMSGSIZE or -ENOBUFS may be returned
 *      before anything is queued.  Later failures go through the common
 *      error path, which subtracts the bytes not yet appended from
 *      cork->length, bumps IPSTATS_MIB_OUTDISCARDS and returns a
 *      negative errno.  skbs already on @queue are not removed here.
 */
1250 static int __ip6_append_data(struct sock *sk,
1251                              struct flowi6 *fl6,
1252                              struct sk_buff_head *queue,
1253                              struct inet_cork *cork,
1254                              struct inet6_cork *v6_cork,
1255                              struct page_frag *pfrag,
1256                              int getfrag(void *from, char *to, int offset,
1257                                          int len, int odd, struct sk_buff *skb),
1258                              void *from, int length, int transhdrlen,
1259                              unsigned int flags, struct ipcm6_cookie *ipc6)
1260 {
1261         struct sk_buff *skb, *skb_prev = NULL;
1262         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1263         struct ubuf_info *uarg = NULL;
1264         int exthdrlen = 0;
1265         int dst_exthdrlen = 0;
1266         int hh_len;
1267         int copy;
1268         int err;
1269         int offset = 0;
1270         u32 tskey = 0;
1271         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1272         struct ipv6_txoptions *opt = v6_cork->opt;
1273         int csummode = CHECKSUM_NONE;
1274         unsigned int maxnonfragsize, headersize;
1275         unsigned int wmem_alloc_delta = 0;
1276         bool paged, extra_uref;
1277
             /* Extension header room is only accounted for when the queue
              * is still empty, i.e. for the first skb.
              */
1278         skb = skb_peek_tail(queue);
1279         if (!skb) {
1280                 exthdrlen = opt ? opt->opt_flen : 0;
1281                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1282         }
1283
             /* With a gso_size set, skbs are built paged and IP6_MAX_MTU
              * replaces the cork's fragsize as the size limit.
              */
1284         paged = !!cork->gso_size;
1285         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1286         orig_mtu = mtu;
1287
1288         if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1289             sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1290                 tskey = sk->sk_tskey++;
1291
1292         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1293
1294         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1295                         (opt ? opt->opt_nflen : 0);
1296         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1297                      sizeof(struct frag_hdr);
1298
1299         headersize = sizeof(struct ipv6hdr) +
1300                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1301                      (dst_allfrag(&rt->dst) ?
1302                       sizeof(struct frag_hdr) : 0) +
1303                      rt->rt6i_nfheader_len;
1304
1305         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1306          * the first fragment
1307          */
1308         if (headersize + transhdrlen > mtu)
1309                 goto emsgsize;
1310
1311         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1312             (sk->sk_protocol == IPPROTO_UDP ||
1313              sk->sk_protocol == IPPROTO_RAW)) {
1314                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1315                                 sizeof(struct ipv6hdr));
1316                 goto emsgsize;
1317         }
1318
1319         if (ip6_sk_ignore_df(sk))
1320                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1321         else
1322                 maxnonfragsize = mtu;
1323
1324         if (cork->length + length > maxnonfragsize - headersize) {
1325 emsgsize:
1326                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1327                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1328                 return -EMSGSIZE;
1329         }
1330
1331         /* CHECKSUM_PARTIAL only with no extension headers and when
1332          * we are not going to fragment
1333          */
1334         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1335             headersize == sizeof(struct ipv6hdr) &&
1336             length <= mtu - headersize &&
1337             (!(flags & MSG_MORE) || cork->gso_size) &&
1338             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1339                 csummode = CHECKSUM_PARTIAL;
1340
1341         if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1342                 uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1343                 if (!uarg)
1344                         return -ENOBUFS;
1345                 extra_uref = true;
1346                 if (rt->dst.dev->features & NETIF_F_SG &&
1347                     csummode == CHECKSUM_PARTIAL) {
1348                         paged = true;
1349                 } else {
1350                         uarg->zerocopy = 0;
1351                         skb_zcopy_set(skb, uarg, &extra_uref);
1352                 }
1353         }
1354
1355         /*
1356          * Let's try using as much space as possible.
1357          * Use MTU if total length of the message fits into the MTU.
1358          * Otherwise, we need to reserve fragment header and
1359          * fragment alignment (= 8-15 octets, in total).
1360          *
1361          * Note that we may need to "move" the data from the tail
1362          * of the buffer to the new fragment when we split
1363          * the message.
1364          *
1365          * FIXME: It may be fragmented into multiple chunks
1366          *        at once if non-fragmentable extension headers
1367          *        are too large.
1368          * --yoshfuji
1369          */
1370
1371         cork->length += length;
1372         if (!skb)
1373                 goto alloc_new_skb;
1374
1375         while (length > 0) {
1376                 /* Check if the remaining data fits into current packet. */
1377                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1378                 if (copy < length)
1379                         copy = maxfraglen - skb->len;
1380
1381                 if (copy <= 0) {
1382                         char *data;
1383                         unsigned int datalen;
1384                         unsigned int fraglen;
1385                         unsigned int fraggap;
1386                         unsigned int alloclen;
1387                         unsigned int pagedlen;
1388 alloc_new_skb:
1389                         /* There's no room in the current skb */
1390                         if (skb)
1391                                 fraggap = skb->len - maxfraglen;
1392                         else
1393                                 fraggap = 0;
1394                         /* update mtu and maxfraglen if necessary */
1395                         if (!skb || !skb_prev)
1396                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1397                                                     fragheaderlen, skb, rt,
1398                                                     orig_mtu);
1399
1400                         skb_prev = skb;
1401
1402                         /*
1403                          * If remaining data exceeds the mtu,
1404                          * we know we need more fragment(s).
1405                          */
1406                         datalen = length + fraggap;
1407
1408                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1409                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1410                         fraglen = datalen + fragheaderlen;
1411                         pagedlen = 0;
1412
1413                         if ((flags & MSG_MORE) &&
1414                             !(rt->dst.dev->features&NETIF_F_SG))
1415                                 alloclen = mtu;
1416                         else if (!paged)
1417                                 alloclen = fraglen;
1418                         else {
1419                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1420                                 pagedlen = fraglen - alloclen;
1421                         }
1422
1423                         alloclen += dst_exthdrlen;
1424
1425                         if (datalen != length + fraggap) {
1426                                 /*
1427                                  * this is not the last fragment, the trailer
1428                                  * space is regarded as data space.
1429                                  */
1430                                 datalen += rt->dst.trailer_len;
1431                         }
1432
1433                         alloclen += rt->dst.trailer_len;
1434                         fraglen = datalen + fragheaderlen;
1435
1436                         /*
1437                          * We just reserve space for fragment header.
1438                          * Note: this may be overallocation if the message
1439                          * (without MSG_MORE) fits into the MTU.
1440                          */
1441                         alloclen += sizeof(struct frag_hdr);
1442
1443                         copy = datalen - transhdrlen - fraggap - pagedlen;
1444                         if (copy < 0) {
1445                                 err = -EINVAL;
1446                                 goto error;
1447                         }
1448                         if (transhdrlen) {
1449                                 skb = sock_alloc_send_skb(sk,
1450                                                 alloclen + hh_len,
1451                                                 (flags & MSG_DONTWAIT), &err);
1452                         } else {
1453                                 skb = NULL;
1454                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1455                                     2 * sk->sk_sndbuf)
1456                                         skb = alloc_skb(alloclen + hh_len,
1457                                                         sk->sk_allocation);
1458                                 if (unlikely(!skb))
1459                                         err = -ENOBUFS;
1460                         }
1461                         if (!skb)
1462                                 goto error;
1463                         /*
1464                          *      Fill in the control structures
1465                          */
1466                         skb->protocol = htons(ETH_P_IPV6);
1467                         skb->ip_summed = csummode;
1468                         skb->csum = 0;
1469                         /* reserve for fragmentation and ipsec header */
1470                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1471                                     dst_exthdrlen);
1472
1473                         /*
1474                          *      Find where to start putting bytes
1475                          */
1476                         data = skb_put(skb, fraglen - pagedlen);
1477                         skb_set_network_header(skb, exthdrlen);
1478                         data += fragheaderlen;
1479                         skb->transport_header = (skb->network_header +
1480                                                  fragheaderlen);
                             /* Move the bytes beyond maxfraglen from the previous
                              * skb into the new one, fix up both checksums and
                              * trim the previous skb back to maxfraglen.
                              */
1481                         if (fraggap) {
1482                                 skb->csum = skb_copy_and_csum_bits(
1483                                         skb_prev, maxfraglen,
1484                                         data + transhdrlen, fraggap, 0);
1485                                 skb_prev->csum = csum_sub(skb_prev->csum,
1486                                                           skb->csum);
1487                                 data += fraggap;
1488                                 pskb_trim_unique(skb_prev, maxfraglen);
1489                         }
1490                         if (copy > 0 &&
1491                             getfrag(from, data + transhdrlen, offset,
1492                                     copy, fraggap, skb) < 0) {
1493                                 err = -EFAULT;
1494                                 kfree_skb(skb);
1495                                 goto error;
1496                         }
1497
                             /* Header room only applies to the first skb
                              * allocated by this call, so clear it now.
                              */
1498                         offset += copy;
1499                         length -= copy + transhdrlen;
1500                         transhdrlen = 0;
1501                         exthdrlen = 0;
1502                         dst_exthdrlen = 0;
1503
1504                         /* Only the initial fragment is time stamped */
1505                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1506                         cork->tx_flags = 0;
1507                         skb_shinfo(skb)->tskey = tskey;
1508                         tskey = 0;
1509                         skb_zcopy_set(skb, uarg, &extra_uref);
1510
1511                         if ((flags & MSG_CONFIRM) && !skb_prev)
1512                                 skb_set_dst_pending_confirm(skb, 1);
1513
1514                         /*
1515                          * Put the packet on the pending queue
1516                          */
1517                         if (!skb->destructor) {
1518                                 skb->destructor = sock_wfree;
1519                                 skb->sk = sk;
1520                                 wmem_alloc_delta += skb->truesize;
1521                         }
1522                         __skb_queue_tail(queue, skb);
1523                         continue;
1524                 }
1525
1526                 if (copy > length)
1527                         copy = length;
1528
1529                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1530                     skb_tailroom(skb) >= copy) {
1531                         unsigned int off;
1532
1533                         off = skb->len;
1534                         if (getfrag(from, skb_put(skb, copy),
1535                                                 offset, copy, off, skb) < 0) {
1536                                 __skb_trim(skb, off);
1537                                 err = -EFAULT;
1538                                 goto error;
1539                         }
1540                 } else if (!uarg || !uarg->zerocopy) {
1541                         int i = skb_shinfo(skb)->nr_frags;
1542
1543                         err = -ENOMEM;
1544                         if (!sk_page_frag_refill(sk, pfrag))
1545                                 goto error;
1546
1547                         if (!skb_can_coalesce(skb, i, pfrag->page,
1548                                               pfrag->offset)) {
1549                                 err = -EMSGSIZE;
1550                                 if (i == MAX_SKB_FRAGS)
1551                                         goto error;
1552
1553                                 __skb_fill_page_desc(skb, i, pfrag->page,
1554                                                      pfrag->offset, 0);
1555                                 skb_shinfo(skb)->nr_frags = ++i;
1556                                 get_page(pfrag->page);
1557                         }
1558                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1559                         if (getfrag(from,
1560                                     page_address(pfrag->page) + pfrag->offset,
1561                                     offset, copy, skb->len, skb) < 0)
1562                                 goto error_efault;
1563
1564                         pfrag->offset += copy;
1565                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1566                         skb->len += copy;
1567                         skb->data_len += copy;
1568                         skb->truesize += copy;
1569                         wmem_alloc_delta += copy;
1570                 } else {
1571                         err = skb_zerocopy_iter_dgram(skb, from, copy);
1572                         if (err < 0)
1573                                 goto error;
1574                 }
1575                 offset += copy;
1576                 length -= copy;
1577         }
1578
             /* Charge everything accumulated above to the socket at once. */
1579         if (wmem_alloc_delta)
1580                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1581         return 0;
1582
1583 error_efault:
1584         err = -EFAULT;
1585 error:
1586         if (uarg)
1587                 sock_zerocopy_put_abort(uarg, extra_uref);
             /* Only the bytes that were not appended are taken back. */
1588         cork->length -= length;
1589         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1590         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1591         return err;
1592 }
1593
1594 int ip6_append_data(struct sock *sk,
1595                     int getfrag(void *from, char *to, int offset, int len,
1596                                 int odd, struct sk_buff *skb),
1597                     void *from, int length, int transhdrlen,
1598                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1599                     struct rt6_info *rt, unsigned int flags)
1600 {
1601         struct inet_sock *inet = inet_sk(sk);
1602         struct ipv6_pinfo *np = inet6_sk(sk);
1603         int exthdrlen;
1604         int err;
1605
1606         if (flags&MSG_PROBE)
1607                 return 0;
1608         if (skb_queue_empty(&sk->sk_write_queue)) {
1609                 /*
1610                  * setup for corking
1611                  */
1612                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1613                                      ipc6, rt, fl6);
1614                 if (err)
1615                         return err;
1616
1617                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1618                 length += exthdrlen;
1619                 transhdrlen += exthdrlen;
1620         } else {
1621                 fl6 = &inet->cork.fl.u.ip6;
1622                 transhdrlen = 0;
1623         }
1624
1625         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1626                                  &np->cork, sk_page_frag(sk), getfrag,
1627                                  from, length, transhdrlen, flags, ipc6);
1628 }
1629 EXPORT_SYMBOL_GPL(ip6_append_data);
1630
1631 static void ip6_cork_release(struct inet_cork_full *cork,
1632                              struct inet6_cork *v6_cork)
1633 {
1634         if (v6_cork->opt) {
1635                 kfree(v6_cork->opt->dst0opt);
1636                 kfree(v6_cork->opt->dst1opt);
1637                 kfree(v6_cork->opt->hopopt);
1638                 kfree(v6_cork->opt->srcrt);
1639                 kfree(v6_cork->opt);
1640                 v6_cork->opt = NULL;
1641         }
1642
1643         if (cork->base.dst) {
1644                 dst_release(cork->base.dst);
1645                 cork->base.dst = NULL;
1646                 cork->base.flags &= ~IPCORK_ALLFRAG;
1647         }
1648         memset(&cork->fl, 0, sizeof(cork->fl));
1649 }
1650
1651 struct sk_buff *__ip6_make_skb(struct sock *sk,
1652                                struct sk_buff_head *queue,
1653                                struct inet_cork_full *cork,
1654                                struct inet6_cork *v6_cork)
1655 {
1656         struct sk_buff *skb, *tmp_skb;
1657         struct sk_buff **tail_skb;
1658         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1659         struct ipv6_pinfo *np = inet6_sk(sk);
1660         struct net *net = sock_net(sk);
1661         struct ipv6hdr *hdr;
1662         struct ipv6_txoptions *opt = v6_cork->opt;
1663         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1664         struct flowi6 *fl6 = &cork->fl.u.ip6;
1665         unsigned char proto = fl6->flowi6_proto;
1666
1667         skb = __skb_dequeue(queue);
1668         if (!skb)
1669                 goto out;
1670         tail_skb = &(skb_shinfo(skb)->frag_list);
1671
1672         /* move skb->data to ip header from ext header */
1673         if (skb->data < skb_network_header(skb))
1674                 __skb_pull(skb, skb_network_offset(skb));
1675         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1676                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1677                 *tail_skb = tmp_skb;
1678                 tail_skb = &(tmp_skb->next);
1679                 skb->len += tmp_skb->len;
1680                 skb->data_len += tmp_skb->len;
1681                 skb->truesize += tmp_skb->truesize;
1682                 tmp_skb->destructor = NULL;
1683                 tmp_skb->sk = NULL;
1684         }
1685
1686         /* Allow local fragmentation. */
1687         skb->ignore_df = ip6_sk_ignore_df(sk);
1688
1689         *final_dst = fl6->daddr;
1690         __skb_pull(skb, skb_network_header_len(skb));
1691         if (opt && opt->opt_flen)
1692                 ipv6_push_frag_opts(skb, opt, &proto);
1693         if (opt && opt->opt_nflen)
1694                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1695
1696         skb_push(skb, sizeof(struct ipv6hdr));
1697         skb_reset_network_header(skb);
1698         hdr = ipv6_hdr(skb);
1699
1700         ip6_flow_hdr(hdr, v6_cork->tclass,
1701                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1702                                         ip6_autoflowlabel(net, np), fl6));
1703         hdr->hop_limit = v6_cork->hop_limit;
1704         hdr->nexthdr = proto;
1705         hdr->saddr = fl6->saddr;
1706         hdr->daddr = *final_dst;
1707
1708         skb->priority = sk->sk_priority;
1709         skb->mark = sk->sk_mark;
1710
1711         skb->tstamp = cork->base.transmit_time;
1712
1713         skb_dst_set(skb, dst_clone(&rt->dst));
1714         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1715         if (proto == IPPROTO_ICMPV6) {
1716                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1717
1718                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1719                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1720         }
1721
1722         ip6_cork_release(cork, v6_cork);
1723 out:
1724         return skb;
1725 }
1726
1727 int ip6_send_skb(struct sk_buff *skb)
1728 {
1729         struct net *net = sock_net(skb->sk);
1730         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1731         int err;
1732
1733         err = ip6_local_out(net, skb->sk, skb);
1734         if (err) {
1735                 if (err > 0)
1736                         err = net_xmit_errno(err);
1737                 if (err)
1738                         IP6_INC_STATS(net, rt->rt6i_idev,
1739                                       IPSTATS_MIB_OUTDISCARDS);
1740         }
1741
1742         return err;
1743 }
1744
/*
 * ip6_push_pending_frames - build and transmit whatever is pending on @sk
 * @sk: socket whose pending data should go out
 *
 * Returns 0 when ip6_finish_skb() yields no packet, otherwise the result
 * of ip6_send_skb() for the packet it produced.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *pending = ip6_finish_skb(sk);

	return pending ? ip6_send_skb(pending) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1756
1757 static void __ip6_flush_pending_frames(struct sock *sk,
1758                                        struct sk_buff_head *queue,
1759                                        struct inet_cork_full *cork,
1760                                        struct inet6_cork *v6_cork)
1761 {
1762         struct sk_buff *skb;
1763
1764         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1765                 if (skb_dst(skb))
1766                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1767                                       IPSTATS_MIB_OUTDISCARDS);
1768                 kfree_skb(skb);
1769         }
1770
1771         ip6_cork_release(cork, v6_cork);
1772 }
1773
1774 void ip6_flush_pending_frames(struct sock *sk)
1775 {
1776         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1777                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1778 }
1779 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1780
1781 struct sk_buff *ip6_make_skb(struct sock *sk,
1782                              int getfrag(void *from, char *to, int offset,
1783                                          int len, int odd, struct sk_buff *skb),
1784                              void *from, int length, int transhdrlen,
1785                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1786                              struct rt6_info *rt, unsigned int flags,
1787                              struct inet_cork_full *cork)
1788 {
1789         struct inet6_cork v6_cork;
1790         struct sk_buff_head queue;
1791         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1792         int err;
1793
1794         if (flags & MSG_PROBE)
1795                 return NULL;
1796
1797         __skb_queue_head_init(&queue);
1798
1799         cork->base.flags = 0;
1800         cork->base.addr = 0;
1801         cork->base.opt = NULL;
1802         cork->base.dst = NULL;
1803         v6_cork.opt = NULL;
1804         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1805         if (err) {
1806                 ip6_cork_release(cork, &v6_cork);
1807                 return ERR_PTR(err);
1808         }
1809         if (ipc6->dontfrag < 0)
1810                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1811
1812         err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1813                                 &current->task_frag, getfrag, from,
1814                                 length + exthdrlen, transhdrlen + exthdrlen,
1815                                 flags, ipc6);
1816         if (err) {
1817                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1818                 return ERR_PTR(err);
1819         }
1820
1821         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1822 }