/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

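/* Final stage of ip6_output(): loop multicast packets back to local
 * listeners when required, honour lightweight-tunnel xmit redirection,
 * then resolve (or create) the neighbour entry for the route nexthop
 * and hand the skb to neigh_output().
 */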
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb, false);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

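/* Run the egress cgroup BPF program, re-route packets that picked up an
 * XFRM policy after SNAT, and fragment anything that no longer fits the
 * path MTU before handing off to ip6_finish_output2().
 */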
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

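/* Entry point for locally generated packets (via dst_output): drop the
 * skb if IPv6 is administratively disabled on the device, otherwise run
 * the NF_INET_POST_ROUTING hook, skipping it for rerouted packets.
 */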
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

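/* Per-socket auto flow-label setting, falling back to the per-net
 * default when the socket never set IPV6_AUTOFLOWLABEL explicitly.
 */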
bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the socket
 * might still be modified by calls to skb_set_owner_w() and
 * ipv6_local_error(), which use proper atomic operations or spinlocks.
 */
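/* Illustrative only (assumed caller shape, not code from this file):
 * a typical connection-oriented caller routes the flow first and then
 * hands the fully built skb to ip6_xmit(), roughly:
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, NULL);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 *	skb_dst_set(skb, dst);
 *	err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, tclass);
 */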
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

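/* Deliver a Router Alert packet to every raw socket registered for the
 * matching RA selector; returns 1 if at least one socket consumed it.
 */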
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

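/* Decide what to do with a packet destined to a proxied address:
 * 1 - hand unicast NDISC messages to the input path,
 * 0 - forward normally,
 * -1 - signal link failure and drop (link-local destination).
 */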
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb->tstamp = 0;
	return dst_output(net, sk, skb);
}

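/* True if the packet must not be forwarded at this MTU; GSO packets
 * and skbs with ignore_df set are exempt unless conntrack defrag
 * recorded a larger original fragment size.
 */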
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

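/* Forwarding path proper: validate the hop limit, handle NDISC
 * proxying, consult XFRM policy, emit redirects where appropriate,
 * check the path MTU and finally run the NF_INET_FORWARD hook.
 */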
int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do NOT do any processing on Router Alert packets;
	 *	they are pushed to user level AS IS, without any guarantee
	 *	that an application will be able to interpret them. The
	 *	reason is that we cannot do anything clever here:
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP we cannot do anything.
	 *	Defragmentation would also be a mistake; RA packets
	 *	must not be fragmented, because there is no guarantee
	 *	that different fragments will follow the same path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	   cannot send redirects for source routed frames.
	   We don't send redirects for frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

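/* Propagate routing and bookkeeping state from the original packet to
 * a freshly allocated fragment.
 */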
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

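/* Fragment an oversized skb. The fast path re-uses an existing frag
 * list when its geometry already matches the MTU; otherwise the slow
 * path copies the payload into newly allocated fragments.
 */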
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len, nexthdr_offset;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb_mark_not_on_list(skb);
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

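/* Nonzero (route mismatch) unless the route is a host route whose key
 * equals the flow address, or the cached address still matches the
 * flow address.
 */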
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

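/* Validate the socket's cached dst against the flow; release it and
 * return NULL when it belongs to another family, another destination
 * or another interface.
 */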
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

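/* Core of the dst lookup helpers below: pick a source address when the
 * flow left it unspecified, retry the route lookup once a source is
 * known, and optionally steer traffic for an OPTIMISTIC source address
 * through the default router.
 */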
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

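/* Recompute mtu/maxfraglen for ip6_append_data() once the first
 * fragment has been set up; later fragments may use the header_len
 * reserve as ordinary data space.
 */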
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

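/* Initialise the cork state for a corked send: duplicate the tx
 * options, pin the route, and derive the effective MTU from the path
 * MTU discovery mode and any socket frag_size override.
 */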
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

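/* Workhorse behind ip6_append_data(): grows the queue of pending skbs,
 * copying user data via getfrag() and splitting packets at maxfraglen
 * while honouring corking, zerocopy and GSO constraints.
 */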
1252 static int __ip6_append_data(struct sock *sk,
1253                              struct flowi6 *fl6,
1254                              struct sk_buff_head *queue,
1255                              struct inet_cork *cork,
1256                              struct inet6_cork *v6_cork,
1257                              struct page_frag *pfrag,
1258                              int getfrag(void *from, char *to, int offset,
1259                                          int len, int odd, struct sk_buff *skb),
1260                              void *from, int length, int transhdrlen,
1261                              unsigned int flags, struct ipcm6_cookie *ipc6)
1262 {
1263         struct sk_buff *skb, *skb_prev = NULL;
1264         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1265         struct ubuf_info *uarg = NULL;
1266         int exthdrlen = 0;
1267         int dst_exthdrlen = 0;
1268         int hh_len;
1269         int copy;
1270         int err;
1271         int offset = 0;
1272         u32 tskey = 0;
1273         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1274         struct ipv6_txoptions *opt = v6_cork->opt;
1275         int csummode = CHECKSUM_NONE;
1276         unsigned int maxnonfragsize, headersize;
1277         unsigned int wmem_alloc_delta = 0;
1278         bool paged, extra_uref;
1279
1280         skb = skb_peek_tail(queue);
1281         if (!skb) {
1282                 exthdrlen = opt ? opt->opt_flen : 0;
1283                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1284         }
1285
1286         paged = !!cork->gso_size;
1287         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1288         orig_mtu = mtu;
1289
1290         if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1291             sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1292                 tskey = sk->sk_tskey++;
1293
1294         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1295
1296         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1297                         (opt ? opt->opt_nflen : 0);
1298         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1299                      sizeof(struct frag_hdr);
1300
1301         headersize = sizeof(struct ipv6hdr) +
1302                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1303                      (dst_allfrag(&rt->dst) ?
1304                       sizeof(struct frag_hdr) : 0) +
1305                      rt->rt6i_nfheader_len;
1306
1307         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1308          * the first fragment
1309          */
1310         if (headersize + transhdrlen > mtu)
1311                 goto emsgsize;
1312
1313         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1314             (sk->sk_protocol == IPPROTO_UDP ||
1315              sk->sk_protocol == IPPROTO_RAW)) {
1316                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1317                                 sizeof(struct ipv6hdr));
1318                 goto emsgsize;
1319         }
1320
1321         if (ip6_sk_ignore_df(sk))
1322                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1323         else
1324                 maxnonfragsize = mtu;
1325
1326         if (cork->length + length > maxnonfragsize - headersize) {
1327 emsgsize:
1328                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1329                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1330                 return -EMSGSIZE;
1331         }
1332
1333         /* CHECKSUM_PARTIAL only with no extension headers and when
1334          * we are not going to fragment
1335          */
1336         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1337             headersize == sizeof(struct ipv6hdr) &&
1338             length <= mtu - headersize &&
1339             (!(flags & MSG_MORE) || cork->gso_size) &&
1340             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1341                 csummode = CHECKSUM_PARTIAL;
1342
1343         if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1344                 uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1345                 if (!uarg)
1346                         return -ENOBUFS;
1347                 extra_uref = true;
1348                 if (rt->dst.dev->features & NETIF_F_SG &&
1349                     csummode == CHECKSUM_PARTIAL) {
1350                         paged = true;
1351                 } else {
1352                         uarg->zerocopy = 0;
1353                         skb_zcopy_set(skb, uarg, &extra_uref);
1354                 }
1355         }
1356
1357         /*
1358          * Let's try using as much space as possible.
1359          * Use MTU if total length of the message fits into the MTU.
1360          * Otherwise, we need to reserve fragment header and
1361          * fragment alignment (= 8-15 octects, in total).
1362          *
1363          * Note that we may need to "move" the data from the tail of
1364          * of the buffer to the new fragment when we split
1365          * the message.
1366          *
1367          * FIXME: It may be fragmented into multiple chunks
1368          *        at once if non-fragmentable extension headers
1369          *        are too large.
1370          * --yoshfuji
1371          */
1372
1373         cork->length += length;
1374         if (!skb)
1375                 goto alloc_new_skb;
1376
1377         while (length > 0) {
1378                 /* Check if the remaining data fits into current packet. */
1379                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1380                 if (copy < length)
1381                         copy = maxfraglen - skb->len;
1382
1383                 if (copy <= 0) {
1384                         char *data;
1385                         unsigned int datalen;
1386                         unsigned int fraglen;
1387                         unsigned int fraggap;
1388                         unsigned int alloclen;
1389                         unsigned int pagedlen;
1390 alloc_new_skb:
1391                         /* There's no room in the current skb */
1392                         if (skb)
1393                                 fraggap = skb->len - maxfraglen;
1394                         else
1395                                 fraggap = 0;
1396                         /* update mtu and maxfraglen if necessary */
1397                         if (!skb || !skb_prev)
1398                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1399                                                     fragheaderlen, skb, rt,
1400                                                     orig_mtu);
1401
1402                         skb_prev = skb;
1403
1404                         /*
1405                          * If remaining data exceeds the mtu,
1406                          * we know we need more fragment(s).
1407                          */
1408                         datalen = length + fraggap;
1409
1410                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1411                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1412                         fraglen = datalen + fragheaderlen;
1413                         pagedlen = 0;
1414
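                        /*
                         * Size the linear part of the new skb: with
                         * MSG_MORE and no scatter-gather, allocate a
                         * full MTU so later appends fit in the
                         * tailroom; in the paged case keep at most
                         * MAX_HEADER bytes linear and leave the rest
                         * for page frags; otherwise allocate exactly
                         * one fragment.
                         */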
1415                         if ((flags & MSG_MORE) &&
1416                             !(rt->dst.dev->features&NETIF_F_SG))
1417                                 alloclen = mtu;
1418                         else if (!paged)
1419                                 alloclen = fraglen;
1420                         else {
1421                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1422                                 pagedlen = fraglen - alloclen;
1423                         }
1424
1425                         alloclen += dst_exthdrlen;
1426
1427                         if (datalen != length + fraggap) {
1428                                 /*
1429                                  * This is not the last fragment; the trailer
1430                                  * space is regarded as data space.
1431                                  */
1432                                 datalen += rt->dst.trailer_len;
1433                         }
1434
1435                         alloclen += rt->dst.trailer_len;
1436                         fraglen = datalen + fragheaderlen;
1437
1438                         /*
1439                          * We just reserve space for the fragment header.
1440                          * Note: this may be an overallocation if the message
1441                          * (without MSG_MORE) fits into the MTU.
1442                          */
1443                         alloclen += sizeof(struct frag_hdr);
1444
1445                         copy = datalen - transhdrlen - fraggap - pagedlen;
1446                         if (copy < 0) {
1447                                 err = -EINVAL;
1448                                 goto error;
1449                         }
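                        /*
                         * The first skb of a message (transhdrlen != 0)
                         * is charged to the socket via
                         * sock_alloc_send_skb(), which may block on
                         * sndbuf space; follow-up fragments use plain
                         * alloc_skb(), capped at twice sk_sndbuf, and
                         * are accounted in one batch through
                         * wmem_alloc_delta.
                         */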
1450                         if (transhdrlen) {
1451                                 skb = sock_alloc_send_skb(sk,
1452                                                 alloclen + hh_len,
1453                                                 (flags & MSG_DONTWAIT), &err);
1454                         } else {
1455                                 skb = NULL;
1456                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1457                                     2 * sk->sk_sndbuf)
1458                                         skb = alloc_skb(alloclen + hh_len,
1459                                                         sk->sk_allocation);
1460                                 if (unlikely(!skb))
1461                                         err = -ENOBUFS;
1462                         }
1463                         if (!skb)
1464                                 goto error;
1465                         /*
1466                          *      Fill in the control structures
1467                          */
1468                         skb->protocol = htons(ETH_P_IPV6);
1469                         skb->ip_summed = csummode;
1470                         skb->csum = 0;
1471                         /* Reserve room for the fragment header and IPsec header. */
1472                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1473                                     dst_exthdrlen);
1474
1475                         /*
1476                          *      Find where to start putting bytes
1477                          */
1478                         data = skb_put(skb, fraglen - pagedlen);
1479                         skb_set_network_header(skb, exthdrlen);
1480                         data += fragheaderlen;
1481                         skb->transport_header = (skb->network_header +
1482                                                  fragheaderlen);
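                        /*
                         * fraggap bytes spilled past maxfraglen in the
                         * previous skb; move them into the new fragment
                         * and patch the checksums of both skbs to match.
                         */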
1483                         if (fraggap) {
1484                                 skb->csum = skb_copy_and_csum_bits(
1485                                         skb_prev, maxfraglen,
1486                                         data + transhdrlen, fraggap, 0);
1487                                 skb_prev->csum = csum_sub(skb_prev->csum,
1488                                                           skb->csum);
1489                                 data += fraggap;
1490                                 pskb_trim_unique(skb_prev, maxfraglen);
1491                         }
1492                         if (copy > 0 &&
1493                             getfrag(from, data + transhdrlen, offset,
1494                                     copy, fraggap, skb) < 0) {
1495                                 err = -EFAULT;
1496                                 kfree_skb(skb);
1497                                 goto error;
1498                         }
1499
1500                         offset += copy;
1501                         length -= copy + transhdrlen;
1502                         transhdrlen = 0;
1503                         exthdrlen = 0;
1504                         dst_exthdrlen = 0;
1505
1506                         /* Only the initial fragment is timestamped. */
1507                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1508                         cork->tx_flags = 0;
1509                         skb_shinfo(skb)->tskey = tskey;
1510                         tskey = 0;
1511                         skb_zcopy_set(skb, uarg, &extra_uref);
1512
1513                         if ((flags & MSG_CONFIRM) && !skb_prev)
1514                                 skb_set_dst_pending_confirm(skb, 1);
1515
1516                         /*
1517                          * Put the packet on the pending queue
1518                          */
1519                         if (!skb->destructor) {
1520                                 skb->destructor = sock_wfree;
1521                                 skb->sk = sk;
1522                                 wmem_alloc_delta += skb->truesize;
1523                         }
1524                         __skb_queue_tail(queue, skb);
1525                         continue;
1526                 }
1527
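                /*
                 * Room is left in the current skb: append via one of
                 * three paths - copy into the linear tailroom (no
                 * scatter-gather), copy into a shared page frag, or,
                 * for zerocopy, pin the user pages directly.
                 */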
1528                 if (copy > length)
1529                         copy = length;
1530
1531                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1532                     skb_tailroom(skb) >= copy) {
1533                         unsigned int off;
1534
1535                         off = skb->len;
1536                         if (getfrag(from, skb_put(skb, copy),
1537                                                 offset, copy, off, skb) < 0) {
1538                                 __skb_trim(skb, off);
1539                                 err = -EFAULT;
1540                                 goto error;
1541                         }
1542                 } else if (!uarg || !uarg->zerocopy) {
1543                         int i = skb_shinfo(skb)->nr_frags;
1544
1545                         err = -ENOMEM;
1546                         if (!sk_page_frag_refill(sk, pfrag))
1547                                 goto error;
1548
1549                         if (!skb_can_coalesce(skb, i, pfrag->page,
1550                                               pfrag->offset)) {
1551                                 err = -EMSGSIZE;
1552                                 if (i == MAX_SKB_FRAGS)
1553                                         goto error;
1554
1555                                 __skb_fill_page_desc(skb, i, pfrag->page,
1556                                                      pfrag->offset, 0);
1557                                 skb_shinfo(skb)->nr_frags = ++i;
1558                                 get_page(pfrag->page);
1559                         }
1560                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1561                         if (getfrag(from,
1562                                     page_address(pfrag->page) + pfrag->offset,
1563                                     offset, copy, skb->len, skb) < 0)
1564                                 goto error_efault;
1565
1566                         pfrag->offset += copy;
1567                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1568                         skb->len += copy;
1569                         skb->data_len += copy;
1570                         skb->truesize += copy;
1571                         wmem_alloc_delta += copy;
1572                 } else {
1573                         err = skb_zerocopy_iter_dgram(skb, from, copy);
1574                         if (err < 0)
1575                                 goto error;
1576                 }
1577                 offset += copy;
1578                 length -= copy;
1579         }
1580
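        /*
         * Commit the truesize of all newly queued skbs to sk_wmem_alloc
         * in a single atomic update rather than once per fragment.
         */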
1581         if (wmem_alloc_delta)
1582                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1583         return 0;
1584
1585 error_efault:
1586         err = -EFAULT;
1587 error:
1588         if (uarg)
1589                 sock_zerocopy_put_abort(uarg, extra_uref);
1590         cork->length -= length;
1591         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1592         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1593         return err;
1594 }
1595
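/*
 * Append data to an existing (or newly started) corked queue on
 * sk->sk_write_queue; the datagram is only built and sent once the
 * caller pushes it.  Illustrative caller pattern (a sketch with error
 * handling and flow/route setup elided, loosely modeled on the UDPv6
 * send path):
 *
 *      err = ip6_append_data(sk, getfrag, msg, ulen,
 *                            sizeof(struct udphdr), &ipc6,
 *                            &fl6, rt, msg->msg_flags);
 *      if (err)
 *              ip6_flush_pending_frames(sk);
 *      else if (!corkreq)
 *              err = ip6_push_pending_frames(sk);
 */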
1596 int ip6_append_data(struct sock *sk,
1597                     int getfrag(void *from, char *to, int offset, int len,
1598                                 int odd, struct sk_buff *skb),
1599                     void *from, int length, int transhdrlen,
1600                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1601                     struct rt6_info *rt, unsigned int flags)
1602 {
1603         struct inet_sock *inet = inet_sk(sk);
1604         struct ipv6_pinfo *np = inet6_sk(sk);
1605         int exthdrlen;
1606         int err;
1607
1608         if (flags&MSG_PROBE)
1609                 return 0;
1610         if (skb_queue_empty(&sk->sk_write_queue)) {
1611                 /*
1612                  * Set up for corking.
1613                  */
1614                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1615                                      ipc6, rt, fl6);
1616                 if (err)
1617                         return err;
1618
1619                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1620                 length += exthdrlen;
1621                 transhdrlen += exthdrlen;
1622         } else {
1623                 fl6 = &inet->cork.fl.u.ip6;
1624                 transhdrlen = 0;
1625         }
1626
1627         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1628                                  &np->cork, sk_page_frag(sk), getfrag,
1629                                  from, length, transhdrlen, flags, ipc6);
1630 }
1631 EXPORT_SYMBOL_GPL(ip6_append_data);
1632
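/*
 * Drop the per-cork state: free the duplicated extension-header options
 * and release the cached route.
 */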
1633 static void ip6_cork_release(struct inet_cork_full *cork,
1634                              struct inet6_cork *v6_cork)
1635 {
1636         if (v6_cork->opt) {
1637                 kfree(v6_cork->opt->dst0opt);
1638                 kfree(v6_cork->opt->dst1opt);
1639                 kfree(v6_cork->opt->hopopt);
1640                 kfree(v6_cork->opt->srcrt);
1641                 kfree(v6_cork->opt);
1642                 v6_cork->opt = NULL;
1643         }
1644
1645         if (cork->base.dst) {
1646                 dst_release(cork->base.dst);
1647                 cork->base.dst = NULL;
1648                 cork->base.flags &= ~IPCORK_ALLFRAG;
1649         }
1650         memset(&cork->fl, 0, sizeof(cork->fl));
1651 }
1652
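/*
 * Collapse the queued fragments into a single skb: the first skb becomes
 * the head and the rest are chained on its frag_list, then the extension
 * headers and the IPv6 header are pushed in front of the payload.
 */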
1653 struct sk_buff *__ip6_make_skb(struct sock *sk,
1654                                struct sk_buff_head *queue,
1655                                struct inet_cork_full *cork,
1656                                struct inet6_cork *v6_cork)
1657 {
1658         struct sk_buff *skb, *tmp_skb;
1659         struct sk_buff **tail_skb;
1660         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1661         struct ipv6_pinfo *np = inet6_sk(sk);
1662         struct net *net = sock_net(sk);
1663         struct ipv6hdr *hdr;
1664         struct ipv6_txoptions *opt = v6_cork->opt;
1665         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1666         struct flowi6 *fl6 = &cork->fl.u.ip6;
1667         unsigned char proto = fl6->flowi6_proto;
1668
1669         skb = __skb_dequeue(queue);
1670         if (!skb)
1671                 goto out;
1672         tail_skb = &(skb_shinfo(skb)->frag_list);
1673
1674         /* Move skb->data from the extension header back to the IP header. */
1675         if (skb->data < skb_network_header(skb))
1676                 __skb_pull(skb, skb_network_offset(skb));
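        /*
         * Chain the remaining queued skbs on the head skb's frag_list;
         * ip6_fragment() can later split the packet on exactly these
         * boundaries.
         */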
1677         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1678                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1679                 *tail_skb = tmp_skb;
1680                 tail_skb = &(tmp_skb->next);
1681                 skb->len += tmp_skb->len;
1682                 skb->data_len += tmp_skb->len;
1683                 skb->truesize += tmp_skb->truesize;
1684                 tmp_skb->destructor = NULL;
1685                 tmp_skb->sk = NULL;
1686         }
1687
1688         /* Allow local fragmentation. */
1689         skb->ignore_df = ip6_sk_ignore_df(sk);
1690
1691         *final_dst = fl6->daddr;
1692         __skb_pull(skb, skb_network_header_len(skb));
1693         if (opt && opt->opt_flen)
1694                 ipv6_push_frag_opts(skb, opt, &proto);
1695         if (opt && opt->opt_nflen)
1696                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1697
1698         skb_push(skb, sizeof(struct ipv6hdr));
1699         skb_reset_network_header(skb);
1700         hdr = ipv6_hdr(skb);
1701
1702         ip6_flow_hdr(hdr, v6_cork->tclass,
1703                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1704                                         ip6_autoflowlabel(net, np), fl6));
1705         hdr->hop_limit = v6_cork->hop_limit;
1706         hdr->nexthdr = proto;
1707         hdr->saddr = fl6->saddr;
1708         hdr->daddr = *final_dst;
1709
1710         skb->priority = sk->sk_priority;
1711         skb->mark = sk->sk_mark;
1712
1713         skb->tstamp = cork->base.transmit_time;
1714
1715         skb_dst_set(skb, dst_clone(&rt->dst));
1716         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1717         if (proto == IPPROTO_ICMPV6) {
1718                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1719
1720                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1721                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1722         }
1723
1724         ip6_cork_release(cork, v6_cork);
1725 out:
1726         return skb;
1727 }
1728
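/*
 * Hand a finished skb to ip6_local_out() and normalize positive
 * NET_XMIT return codes into errnos, counting discards on failure.
 */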
1729 int ip6_send_skb(struct sk_buff *skb)
1730 {
1731         struct net *net = sock_net(skb->sk);
1732         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1733         int err;
1734
1735         err = ip6_local_out(net, skb->sk, skb);
1736         if (err) {
1737                 if (err > 0)
1738                         err = net_xmit_errno(err);
1739                 if (err)
1740                         IP6_INC_STATS(net, rt->rt6i_idev,
1741                                       IPSTATS_MIB_OUTDISCARDS);
1742         }
1743
1744         return err;
1745 }
1746
1747 int ip6_push_pending_frames(struct sock *sk)
1748 {
1749         struct sk_buff *skb;
1750
1751         skb = ip6_finish_skb(sk);
1752         if (!skb)
1753                 return 0;
1754
1755         return ip6_send_skb(skb);
1756 }
1757 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1758
1759 static void __ip6_flush_pending_frames(struct sock *sk,
1760                                        struct sk_buff_head *queue,
1761                                        struct inet_cork_full *cork,
1762                                        struct inet6_cork *v6_cork)
1763 {
1764         struct sk_buff *skb;
1765
1766         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1767                 if (skb_dst(skb))
1768                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1769                                       IPSTATS_MIB_OUTDISCARDS);
1770                 kfree_skb(skb);
1771         }
1772
1773         ip6_cork_release(cork, v6_cork);
1774 }
1775
1776 void ip6_flush_pending_frames(struct sock *sk)
1777 {
1778         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1779                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1780 }
1781 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1782
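/*
 * Single-shot variant of the append/push sequence: build the whole
 * datagram on a private queue and hand back the finished skb without
 * touching sk->sk_write_queue, so no socket-level corking state is left
 * behind.  Illustrative use (a sketch, error handling elided):
 *
 *      skb = ip6_make_skb(sk, getfrag, msg, ulen,
 *                         sizeof(struct udphdr), &ipc6,
 *                         &fl6, rt, msg->msg_flags, &cork);
 *      if (!IS_ERR_OR_NULL(skb))
 *              err = ip6_send_skb(skb);
 */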
1783 struct sk_buff *ip6_make_skb(struct sock *sk,
1784                              int getfrag(void *from, char *to, int offset,
1785                                          int len, int odd, struct sk_buff *skb),
1786                              void *from, int length, int transhdrlen,
1787                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1788                              struct rt6_info *rt, unsigned int flags,
1789                              struct inet_cork_full *cork)
1790 {
1791         struct inet6_cork v6_cork;
1792         struct sk_buff_head queue;
1793         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1794         int err;
1795
1796         if (flags & MSG_PROBE)
1797                 return NULL;
1798
1799         __skb_queue_head_init(&queue);
1800
1801         cork->base.flags = 0;
1802         cork->base.addr = 0;
1803         cork->base.opt = NULL;
1804         cork->base.dst = NULL;
1805         v6_cork.opt = NULL;
1806         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1807         if (err) {
1808                 ip6_cork_release(cork, &v6_cork);
1809                 return ERR_PTR(err);
1810         }
1811         if (ipc6->dontfrag < 0)
1812                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1813
1814         err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1815                                 &current->task_frag, getfrag, from,
1816                                 length + exthdrlen, transhdrlen + exthdrlen,
1817                                 flags, ipc6);
1818         if (err) {
1819                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1820                 return ERR_PTR(err);
1821         }
1822
1823         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1824 }