ipv6: allow to cache dst for a connected sk in ip6_sk_dst_lookup_flow()
net/ipv6/ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64         struct dst_entry *dst = skb_dst(skb);
65         struct net_device *dev = dst->dev;
66         struct neighbour *neigh;
67         struct in6_addr *nexthop;
68         int ret;
69
70         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72
73                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74                     ((mroute6_is_socket(net, skb) &&
75                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77                                          &ipv6_hdr(skb)->saddr))) {
78                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79
80                         /* Do not check for IFF_ALLMULTI; multicast routing
81                            is not supported in any case.
82                          */
83                         if (newskb)
84                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85                                         net, sk, newskb, NULL, newskb->dev,
86                                         dev_loopback_xmit);
87
88                         if (ipv6_hdr(skb)->hop_limit == 0) {
89                                 IP6_INC_STATS(net, idev,
90                                               IPSTATS_MIB_OUTDISCARDS);
91                                 kfree_skb(skb);
92                                 return 0;
93                         }
94                 }
95
96                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97
98                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99                     IPV6_ADDR_SCOPE_NODELOCAL &&
100                     !(dev->flags & IFF_LOOPBACK)) {
101                         kfree_skb(skb);
102                         return 0;
103                 }
104         }
105
106         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107                 int res = lwtunnel_xmit(skb);
108
109                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110                         return res;
111         }
112
113         rcu_read_lock_bh();
114         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116         if (unlikely(!neigh))
117                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118         if (!IS_ERR(neigh)) {
119                 sock_confirm_neigh(skb, neigh);
120                 ret = neigh_output(neigh, skb);
121                 rcu_read_unlock_bh();
122                 return ret;
123         }
124         rcu_read_unlock_bh();
125
126         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127         kfree_skb(skb);
128         return -EINVAL;
129 }
130
131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132 {
133         int ret;
134
135         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136         if (ret) {
137                 kfree_skb(skb);
138                 return ret;
139         }
140
141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142         /* Policy lookup after SNAT yielded a new policy */
143         if (skb_dst(skb)->xfrm) {
144                 IP6CB(skb)->flags |= IP6SKB_REROUTED;
145                 return dst_output(net, sk, skb);
146         }
147 #endif
148
149         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150             dst_allfrag(skb_dst(skb)) ||
151             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
153         else
154                 return ip6_finish_output2(net, sk, skb);
155 }
156
157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158 {
159         struct net_device *dev = skb_dst(skb)->dev;
160         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161
162         skb->protocol = htons(ETH_P_IPV6);
163         skb->dev = dev;
164
165         if (unlikely(idev->cnf.disable_ipv6)) {
166                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167                 kfree_skb(skb);
168                 return 0;
169         }
170
171         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172                             net, sk, skb, NULL, dev,
173                             ip6_finish_output,
174                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175 }
176
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179         if (!np->autoflowlabel_set)
180                 return ip6_default_np_autolabel(net);
181         else
182                 return np->autoflowlabel;
183 }
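/* Precedence in ip6_autoflowlabel() above: a per-socket
 * IPV6_AUTOFLOWLABEL setsockopt (which sets np->autoflowlabel_set)
 * always wins; only sockets that never set it fall back to the
 * per-netns ip6_default_np_autolabel() default, i.e. the
 * auto_flowlabels sysctl.
 */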
184
185 /*
186  * xmit an sk_buff (used by TCP, SCTP and DCCP)
187  * Note : socket lock is not held for SYNACK packets, but the socket
188  * might still be modified by calls to skb_set_owner_w() and
189  * ipv6_local_error(), which use proper atomic operations or spinlocks.
190  */
191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192              __u32 mark, struct ipv6_txoptions *opt, int tclass)
193 {
194         struct net *net = sock_net(sk);
195         const struct ipv6_pinfo *np = inet6_sk(sk);
196         struct in6_addr *first_hop = &fl6->daddr;
197         struct dst_entry *dst = skb_dst(skb);
198         struct ipv6hdr *hdr;
199         u8  proto = fl6->flowi6_proto;
200         int seg_len = skb->len;
201         int hlimit = -1;
202         u32 mtu;
203
204         if (opt) {
205                 unsigned int head_room;
206
207                 /* First: exthdrs may take lots of space (~8K for now);
208                    MAX_HEADER is not enough.
209                  */
210                 head_room = opt->opt_nflen + opt->opt_flen;
211                 seg_len += head_room;
212                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
213
214                 if (skb_headroom(skb) < head_room) {
215                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
216                         if (!skb2) {
217                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
218                                               IPSTATS_MIB_OUTDISCARDS);
219                                 kfree_skb(skb);
220                                 return -ENOBUFS;
221                         }
222                         consume_skb(skb);
223                         skb = skb2;
224                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
225                          * it is safe to call in our context (socket lock not held)
226                          */
227                         skb_set_owner_w(skb, (struct sock *)sk);
228                 }
229                 if (opt->opt_flen)
230                         ipv6_push_frag_opts(skb, opt, &proto);
231                 if (opt->opt_nflen)
232                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
233                                              &fl6->saddr);
234         }
235
236         skb_push(skb, sizeof(struct ipv6hdr));
237         skb_reset_network_header(skb);
238         hdr = ipv6_hdr(skb);
239
240         /*
241          *      Fill in the IPv6 header
242          */
243         if (np)
244                 hlimit = np->hop_limit;
245         if (hlimit < 0)
246                 hlimit = ip6_dst_hoplimit(dst);
247
248         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
249                                 ip6_autoflowlabel(net, np), fl6));
250
251         hdr->payload_len = htons(seg_len);
252         hdr->nexthdr = proto;
253         hdr->hop_limit = hlimit;
254
255         hdr->saddr = fl6->saddr;
256         hdr->daddr = *first_hop;
257
258         skb->protocol = htons(ETH_P_IPV6);
259         skb->priority = sk->sk_priority;
260         skb->mark = mark;
261
262         mtu = dst_mtu(dst);
263         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
264                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
265                               IPSTATS_MIB_OUT, skb->len);
266
267                 /* if egress device is enslaved to an L3 master device pass the
268                  * skb to its handler for processing
269                  */
270                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
271                 if (unlikely(!skb))
272                         return 0;
273
274                 /* hooks should never assume socket lock is held.
275                  * we promote our socket to non const
276                  */
277                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
278                                net, (struct sock *)sk, skb, NULL, dst->dev,
279                                dst_output);
280         }
281
282         skb->dev = dst->dev;
283         /* ipv6_local_error() does not require socket lock,
284          * we promote our socket to non const
285          */
286         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
287
288         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
289         kfree_skb(skb);
290         return -EMSGSIZE;
291 }
292 EXPORT_SYMBOL(ip6_xmit);
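/* Usage sketch for ip6_xmit(): a minimal caller, modelled on a stream
 * transport transmitting an already-routed skb.  The setup of fl6 and
 * opt is assumed, not shown, and the names are illustrative rather
 * than a verbatim caller:
 *
 *	struct ipv6_pinfo *np = inet6_sk(sk);
 *	int err;
 *
 *	skb_dst_set(skb, dst);	(route attached beforehand)
 *	err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass);
 *	err = net_xmit_eval(err);	(fold local congestion codes)
 */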
293
294 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
295 {
296         struct ip6_ra_chain *ra;
297         struct sock *last = NULL;
298
299         read_lock(&ip6_ra_lock);
300         for (ra = ip6_ra_chain; ra; ra = ra->next) {
301                 struct sock *sk = ra->sk;
302                 if (sk && ra->sel == sel &&
303                     (!sk->sk_bound_dev_if ||
304                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
305                         if (last) {
306                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
307                                 if (skb2)
308                                         rawv6_rcv(last, skb2);
309                         }
310                         last = sk;
311                 }
312         }
313
314         if (last) {
315                 rawv6_rcv(last, skb);
316                 read_unlock(&ip6_ra_lock);
317                 return 1;
318         }
319         read_unlock(&ip6_ra_lock);
320         return 0;
321 }
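/* Delivery pattern in ip6_call_ra_chain() above: each matching Router
 * Alert listener except the last receives a clone, and the final
 * listener consumes the original skb; a return of 1 therefore means
 * the caller must not touch the skb again.
 */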
322
323 static int ip6_forward_proxy_check(struct sk_buff *skb)
324 {
325         struct ipv6hdr *hdr = ipv6_hdr(skb);
326         u8 nexthdr = hdr->nexthdr;
327         __be16 frag_off;
328         int offset;
329
330         if (ipv6_ext_hdr(nexthdr)) {
331                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
332                 if (offset < 0)
333                         return 0;
334         } else
335                 offset = sizeof(struct ipv6hdr);
336
337         if (nexthdr == IPPROTO_ICMPV6) {
338                 struct icmp6hdr *icmp6;
339
340                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
341                                          offset + 1 - skb->data)))
342                         return 0;
343
344                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
345
346                 switch (icmp6->icmp6_type) {
347                 case NDISC_ROUTER_SOLICITATION:
348                 case NDISC_ROUTER_ADVERTISEMENT:
349                 case NDISC_NEIGHBOUR_SOLICITATION:
350                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
351                 case NDISC_REDIRECT:
352                         /* For unicast neighbor discovery messages
353                          * destined to the proxied address, pass them to
354                          * the input function.
355                          */
356                         return 1;
357                 default:
358                         break;
359                 }
360         }
361
362         /*
363          * The proxying router can't forward traffic sent to a link-local
364          * address, so signal the sender and discard the packet. This
365          * behavior is clarified by the MIPv6 specification.
366          */
367         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
368                 dst_link_failure(skb);
369                 return -1;
370         }
371
372         return 0;
373 }
374
375 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
376                                      struct sk_buff *skb)
377 {
378         return dst_output(net, sk, skb);
379 }
380
381 unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
382 {
383         unsigned int mtu;
384         struct inet6_dev *idev;
385
386         if (dst_metric_locked(dst, RTAX_MTU)) {
387                 mtu = dst_metric_raw(dst, RTAX_MTU);
388                 if (mtu)
389                         return mtu;
390         }
391
392         mtu = IPV6_MIN_MTU;
393         rcu_read_lock();
394         idev = __in6_dev_get(dst->dev);
395         if (idev)
396                 mtu = idev->cnf.mtu6;
397         rcu_read_unlock();
398
399         return mtu;
400 }
401 EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);
402
403 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
404 {
405         if (skb->len <= mtu)
406                 return false;
407
408         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
409         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
410                 return true;
411
412         if (skb->ignore_df)
413                 return false;
414
415         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
416                 return false;
417
418         return true;
419 }
420
421 int ip6_forward(struct sk_buff *skb)
422 {
423         struct dst_entry *dst = skb_dst(skb);
424         struct ipv6hdr *hdr = ipv6_hdr(skb);
425         struct inet6_skb_parm *opt = IP6CB(skb);
426         struct net *net = dev_net(dst->dev);
427         u32 mtu;
428
429         if (net->ipv6.devconf_all->forwarding == 0)
430                 goto error;
431
432         if (skb->pkt_type != PACKET_HOST)
433                 goto drop;
434
435         if (unlikely(skb->sk))
436                 goto drop;
437
438         if (skb_warn_if_lro(skb))
439                 goto drop;
440
441         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
442                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
443                                 IPSTATS_MIB_INDISCARDS);
444                 goto drop;
445         }
446
447         skb_forward_csum(skb);
448
449         /*
450          *      We DO NOT do any processing on RA packets,
451          *      pushing them to user level AS IS, without any
452          *      warranty that the application will be able to
453          *      interpret them. The reason is that we cannot
454          *      do anything clever here.
455          *
456          *      We are not an end node, so if the packet contains
457          *      AH/ESP we cannot do anything.
458          *      Defragmentation would also be a mistake; RA packets
459          *      cannot be fragmented, because there is no warranty
460          *      that different fragments will go along one path. --ANK
461          */
462         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
463                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
464                         return 0;
465         }
466
467         /*
468          *      check and decrement hop limit
469          */
470         if (hdr->hop_limit <= 1) {
471                 /* Force OUTPUT device used as source address */
472                 skb->dev = dst->dev;
473                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
474                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
475                                 IPSTATS_MIB_INHDRERRORS);
476
477                 kfree_skb(skb);
478                 return -ETIMEDOUT;
479         }
480
481         /* XXX: idev->cnf.proxy_ndp? */
482         if (net->ipv6.devconf_all->proxy_ndp &&
483             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
484                 int proxied = ip6_forward_proxy_check(skb);
485                 if (proxied > 0)
486                         return ip6_input(skb);
487                 else if (proxied < 0) {
488                         __IP6_INC_STATS(net, ip6_dst_idev(dst),
489                                         IPSTATS_MIB_INDISCARDS);
490                         goto drop;
491                 }
492         }
493
494         if (!xfrm6_route_forward(skb)) {
495                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
496                                 IPSTATS_MIB_INDISCARDS);
497                 goto drop;
498         }
499         dst = skb_dst(skb);
500
501         /* IPv6 specs say nothing about it, but it is clear that we cannot
502            send redirects to source routed frames.
503            We don't send redirects to frames decapsulated from IPsec.
504          */
505         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
506                 struct in6_addr *target = NULL;
507                 struct inet_peer *peer;
508                 struct rt6_info *rt;
509
510                 /*
511                  *      incoming and outgoing devices are the same;
512                  *      send a redirect.
513                  */
514
515                 rt = (struct rt6_info *) dst;
516                 if (rt->rt6i_flags & RTF_GATEWAY)
517                         target = &rt->rt6i_gateway;
518                 else
519                         target = &hdr->daddr;
520
521                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
522
523                 /* Limit redirects both by destination (here)
524                    and by source (inside ndisc_send_redirect)
525                  */
526                 if (inet_peer_xrlim_allow(peer, 1*HZ))
527                         ndisc_send_redirect(skb, target);
528                 if (peer)
529                         inet_putpeer(peer);
530         } else {
531                 int addrtype = ipv6_addr_type(&hdr->saddr);
532
533                 /* This check is security critical. */
534                 if (addrtype == IPV6_ADDR_ANY ||
535                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
536                         goto error;
537                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
538                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
539                                     ICMPV6_NOT_NEIGHBOUR, 0);
540                         goto error;
541                 }
542         }
543
544         mtu = ip6_dst_mtu_forward(dst);
545         if (mtu < IPV6_MIN_MTU)
546                 mtu = IPV6_MIN_MTU;
547
548         if (ip6_pkt_too_big(skb, mtu)) {
549                 /* Again, force OUTPUT device used as source address */
550                 skb->dev = dst->dev;
551                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
552                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
553                                 IPSTATS_MIB_INTOOBIGERRORS);
554                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
555                                 IPSTATS_MIB_FRAGFAILS);
556                 kfree_skb(skb);
557                 return -EMSGSIZE;
558         }
559
560         if (skb_cow(skb, dst->dev->hard_header_len)) {
561                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
562                                 IPSTATS_MIB_OUTDISCARDS);
563                 goto drop;
564         }
565
566         hdr = ipv6_hdr(skb);
567
568         /* Decrementing the hop limit is delayed until after the skb COW */
569
570         hdr->hop_limit--;
571
572         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
573         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
574         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
575                        net, NULL, skb, skb->dev, dst->dev,
576                        ip6_forward_finish);
577
578 error:
579         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
580 drop:
581         kfree_skb(skb);
582         return -EINVAL;
583 }
584
585 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
586 {
587         to->pkt_type = from->pkt_type;
588         to->priority = from->priority;
589         to->protocol = from->protocol;
590         skb_dst_drop(to);
591         skb_dst_set(to, dst_clone(skb_dst(from)));
592         to->dev = from->dev;
593         to->mark = from->mark;
594
595 #ifdef CONFIG_NET_SCHED
596         to->tc_index = from->tc_index;
597 #endif
598         nf_copy(to, from);
599         skb_copy_secmark(to, from);
600 }
601
602 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
603                  int (*output)(struct net *, struct sock *, struct sk_buff *))
604 {
605         struct sk_buff *frag;
606         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
607         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
608                                 inet6_sk(skb->sk) : NULL;
609         struct ipv6hdr *tmp_hdr;
610         struct frag_hdr *fh;
611         unsigned int mtu, hlen, left, len;
612         int hroom, troom;
613         __be32 frag_id;
614         int ptr, offset = 0, err = 0;
615         u8 *prevhdr, nexthdr = 0;
616
617         err = ip6_find_1stfragopt(skb, &prevhdr);
618         if (err < 0)
619                 goto fail;
620         hlen = err;
621         nexthdr = *prevhdr;
622
623         mtu = ip6_skb_dst_mtu(skb);
624
625         /* We must not fragment if the socket is set to force MTU discovery
626          * or if the skb is not generated by a local socket.
627          */
628         if (unlikely(!skb->ignore_df && skb->len > mtu))
629                 goto fail_toobig;
630
631         if (IP6CB(skb)->frag_max_size) {
632                 if (IP6CB(skb)->frag_max_size > mtu)
633                         goto fail_toobig;
634
635                 /* don't send fragments larger than what we received */
636                 mtu = IP6CB(skb)->frag_max_size;
637                 if (mtu < IPV6_MIN_MTU)
638                         mtu = IPV6_MIN_MTU;
639         }
640
641         if (np && np->frag_size < mtu) {
642                 if (np->frag_size)
643                         mtu = np->frag_size;
644         }
645         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
646                 goto fail_toobig;
647         mtu -= hlen + sizeof(struct frag_hdr);
648
649         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
650                                     &ipv6_hdr(skb)->saddr);
651
652         if (skb->ip_summed == CHECKSUM_PARTIAL &&
653             (err = skb_checksum_help(skb)))
654                 goto fail;
655
656         hroom = LL_RESERVED_SPACE(rt->dst.dev);
657         if (skb_has_frag_list(skb)) {
658                 unsigned int first_len = skb_pagelen(skb);
659                 struct sk_buff *frag2;
660
661                 if (first_len - hlen > mtu ||
662                     ((first_len - hlen) & 7) ||
663                     skb_cloned(skb) ||
664                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
665                         goto slow_path;
666
667                 skb_walk_frags(skb, frag) {
668                         /* Correct geometry. */
669                         if (frag->len > mtu ||
670                             ((frag->len & 7) && frag->next) ||
671                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
672                                 goto slow_path_clean;
673
674                         /* Partially cloned skb? */
675                         if (skb_shared(frag))
676                                 goto slow_path_clean;
677
678                         BUG_ON(frag->sk);
679                         if (skb->sk) {
680                                 frag->sk = skb->sk;
681                                 frag->destructor = sock_wfree;
682                         }
683                         skb->truesize -= frag->truesize;
684                 }
685
686                 err = 0;
687                 offset = 0;
688                 /* BUILD HEADER */
689
690                 *prevhdr = NEXTHDR_FRAGMENT;
691                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
692                 if (!tmp_hdr) {
693                         err = -ENOMEM;
694                         goto fail;
695                 }
696                 frag = skb_shinfo(skb)->frag_list;
697                 skb_frag_list_init(skb);
698
699                 __skb_pull(skb, hlen);
700                 fh = __skb_push(skb, sizeof(struct frag_hdr));
701                 __skb_push(skb, hlen);
702                 skb_reset_network_header(skb);
703                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
704
705                 fh->nexthdr = nexthdr;
706                 fh->reserved = 0;
707                 fh->frag_off = htons(IP6_MF);
708                 fh->identification = frag_id;
709
710                 first_len = skb_pagelen(skb);
711                 skb->data_len = first_len - skb_headlen(skb);
712                 skb->len = first_len;
713                 ipv6_hdr(skb)->payload_len = htons(first_len -
714                                                    sizeof(struct ipv6hdr));
715
716                 for (;;) {
717                         /* Prepare header of the next frame,
718                          * before the previous one goes down. */
719                         if (frag) {
720                                 frag->ip_summed = CHECKSUM_NONE;
721                                 skb_reset_transport_header(frag);
722                                 fh = __skb_push(frag, sizeof(struct frag_hdr));
723                                 __skb_push(frag, hlen);
724                                 skb_reset_network_header(frag);
725                                 memcpy(skb_network_header(frag), tmp_hdr,
726                                        hlen);
727                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
728                                 fh->nexthdr = nexthdr;
729                                 fh->reserved = 0;
730                                 fh->frag_off = htons(offset);
731                                 if (frag->next)
732                                         fh->frag_off |= htons(IP6_MF);
733                                 fh->identification = frag_id;
734                                 ipv6_hdr(frag)->payload_len =
735                                                 htons(frag->len -
736                                                       sizeof(struct ipv6hdr));
737                                 ip6_copy_metadata(frag, skb);
738                         }
739
740                         err = output(net, sk, skb);
741                         if (!err)
742                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
743                                               IPSTATS_MIB_FRAGCREATES);
744
745                         if (err || !frag)
746                                 break;
747
748                         skb = frag;
749                         frag = skb->next;
750                         skb->next = NULL;
751                 }
752
753                 kfree(tmp_hdr);
754
755                 if (err == 0) {
756                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
757                                       IPSTATS_MIB_FRAGOKS);
758                         return 0;
759                 }
760
761                 kfree_skb_list(frag);
762
763                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
764                               IPSTATS_MIB_FRAGFAILS);
765                 return err;
766
767 slow_path_clean:
768                 skb_walk_frags(skb, frag2) {
769                         if (frag2 == frag)
770                                 break;
771                         frag2->sk = NULL;
772                         frag2->destructor = NULL;
773                         skb->truesize += frag2->truesize;
774                 }
775         }
776
777 slow_path:
778         left = skb->len - hlen;         /* Space per frame */
779         ptr = hlen;                     /* Where to start from */
780
781         /*
782          *      Fragment the datagram.
783          */
784
785         troom = rt->dst.dev->needed_tailroom;
786
787         /*
788          *      Keep copying data until we run out.
789          */
790         while (left > 0)        {
791                 u8 *fragnexthdr_offset;
792
793                 len = left;
794                 /* IF: it doesn't fit, use 'mtu' - the data space left */
795                 if (len > mtu)
796                         len = mtu;
797                 /* IF: we are not sending up to and including the packet end
798                    then align the next start on an eight byte boundary */
799                 if (len < left) {
800                         len &= ~7;
801                 }
802
803                 /* Allocate buffer */
804                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
805                                  hroom + troom, GFP_ATOMIC);
806                 if (!frag) {
807                         err = -ENOMEM;
808                         goto fail;
809                 }
810
811                 /*
812                  *      Set up data on packet
813                  */
814
815                 ip6_copy_metadata(frag, skb);
816                 skb_reserve(frag, hroom);
817                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
818                 skb_reset_network_header(frag);
819                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
820                 frag->transport_header = (frag->network_header + hlen +
821                                           sizeof(struct frag_hdr));
822
823                 /*
824                  *      Charge the memory for the fragment to any owner
825                  *      it might possess
826                  */
827                 if (skb->sk)
828                         skb_set_owner_w(frag, skb->sk);
829
830                 /*
831                  *      Copy the packet header into the new buffer.
832                  */
833                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
834
835                 fragnexthdr_offset = skb_network_header(frag);
836                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
837                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
838
839                 /*
840                  *      Build fragment header.
841                  */
842                 fh->nexthdr = nexthdr;
843                 fh->reserved = 0;
844                 fh->identification = frag_id;
845
846                 /*
847                  *      Copy a block of the IP datagram.
848                  */
849                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
850                                      len));
851                 left -= len;
852
853                 fh->frag_off = htons(offset);
854                 if (left > 0)
855                         fh->frag_off |= htons(IP6_MF);
856                 ipv6_hdr(frag)->payload_len = htons(frag->len -
857                                                     sizeof(struct ipv6hdr));
858
859                 ptr += len;
860                 offset += len;
861
862                 /*
863                  *      Put this fragment into the sending queue.
864                  */
865                 err = output(net, sk, frag);
866                 if (err)
867                         goto fail;
868
869                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
870                               IPSTATS_MIB_FRAGCREATES);
871         }
872         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
873                       IPSTATS_MIB_FRAGOKS);
874         consume_skb(skb);
875         return err;
876
877 fail_toobig:
878         if (skb->sk && dst_allfrag(skb_dst(skb)))
879                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
880
881         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
882         err = -EMSGSIZE;
883
884 fail:
885         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
886                       IPSTATS_MIB_FRAGFAILS);
887         kfree_skb(skb);
888         return err;
889 }
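/* Strategy recap for ip6_fragment() above: when the skb already carries
 * a frag_list whose geometry fits (every fragment 8-byte aligned and no
 * larger than the MTU), headers are prepended in place and the chain is
 * sent as-is; otherwise the slow path copies the payload into freshly
 * allocated fragments, 8-byte aligned except for the final one.
 */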
890
891 static inline int ip6_rt_check(const struct rt6key *rt_key,
892                                const struct in6_addr *fl_addr,
893                                const struct in6_addr *addr_cache)
894 {
895         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
896                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
897 }
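/* Reading aid for ip6_rt_check() above; it reports "stale" (nonzero)
 * only when neither check validates the cached route:
 *
 *	host_hit  = rt_key->plen == 128 && fl_addr == rt_key->addr
 *	cache_hit = addr_cache && fl_addr == *addr_cache
 *	stale     = !host_hit && !cache_hit
 */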
898
899 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
900                                           struct dst_entry *dst,
901                                           const struct flowi6 *fl6)
902 {
903         struct ipv6_pinfo *np = inet6_sk(sk);
904         struct rt6_info *rt;
905
906         if (!dst)
907                 goto out;
908
909         if (dst->ops->family != AF_INET6) {
910                 dst_release(dst);
911                 return NULL;
912         }
913
914         rt = (struct rt6_info *)dst;
915         /* Yes, checking route validity in the not connected
916          * case is not very simple. Take into account that
917          * we do not support routing by source, TOS,
918          * and MSG_DONTROUTE            --ANK (980726)
919          *
920          * 1. ip6_rt_check(): If the route was a host route,
921          *    check that the cached destination is current.
922          *    If it is a network route, we may still
923          *    check its validity using the saved pointer
924          *    to the last used address: daddr_cache.
925          *    We do not want to save the whole address now
926          *    (because the main consumer of this service
927          *    is TCP, which does not have this problem),
928          *    so the last trick works only on connected
929          *    sockets.
930          * 2. oif also should be the same.
931          */
932         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
933 #ifdef CONFIG_IPV6_SUBTREES
934             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
935 #endif
936            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
937               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
938                 dst_release(dst);
939                 dst = NULL;
940         }
941
942 out:
943         return dst;
944 }
945
946 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
947                                struct dst_entry **dst, struct flowi6 *fl6)
948 {
949 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
950         struct neighbour *n;
951         struct rt6_info *rt;
952 #endif
953         int err;
954         int flags = 0;
955
956         /* The correct way to handle this would be to do
957          * ip6_route_get_saddr, and then ip6_route_output; however,
958          * the route-specific preferred source forces the
959          * ip6_route_output call _before_ ip6_route_get_saddr.
960          *
961          * In source specific routing (no src=any default route),
962          * ip6_route_output will fail given src=any saddr, though, so
963          * that's why we try it again later.
964          */
965         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
966                 struct rt6_info *rt;
967                 bool had_dst = *dst != NULL;
968
969                 if (!had_dst)
970                         *dst = ip6_route_output(net, sk, fl6);
971                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
972                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
973                                           sk ? inet6_sk(sk)->srcprefs : 0,
974                                           &fl6->saddr);
975                 if (err)
976                         goto out_err_release;
977
978                 /* If we had an erroneous initial result, pretend it
979                  * never existed and let the SA-enabled version take
980                  * over.
981                  */
982                 if (!had_dst && (*dst)->error) {
983                         dst_release(*dst);
984                         *dst = NULL;
985                 }
986
987                 if (fl6->flowi6_oif)
988                         flags |= RT6_LOOKUP_F_IFACE;
989         }
990
991         if (!*dst)
992                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
993
994         err = (*dst)->error;
995         if (err)
996                 goto out_err_release;
997
998 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
999         /*
1000          * Here if the dst entry we've looked up
1001          * has a neighbour entry that is in the INCOMPLETE
1002          * state and the src address from the flow is
1003          * marked as OPTIMISTIC, we release the found
1004          * dst entry and replace it instead with the
1005          * dst entry of the nexthop router
1006          */
1007         rt = (struct rt6_info *) *dst;
1008         rcu_read_lock_bh();
1009         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1010                                       rt6_nexthop(rt, &fl6->daddr));
1011         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1012         rcu_read_unlock_bh();
1013
1014         if (err) {
1015                 struct inet6_ifaddr *ifp;
1016                 struct flowi6 fl_gw6;
1017                 int redirect;
1018
1019                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1020                                       (*dst)->dev, 1);
1021
1022                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1023                 if (ifp)
1024                         in6_ifa_put(ifp);
1025
1026                 if (redirect) {
1027                         /*
1028                          * We need to get the dst entry for the
1029                          * default router instead
1030                          */
1031                         dst_release(*dst);
1032                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1033                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1034                         *dst = ip6_route_output(net, sk, &fl_gw6);
1035                         err = (*dst)->error;
1036                         if (err)
1037                                 goto out_err_release;
1038                 }
1039         }
1040 #endif
1041         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1042             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1043                 err = -EAFNOSUPPORT;
1044                 goto out_err_release;
1045         }
1046
1047         return 0;
1048
1049 out_err_release:
1050         dst_release(*dst);
1051         *dst = NULL;
1052
1053         if (err == -ENETUNREACH)
1054                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1055         return err;
1056 }
1057
1058 /**
1059  *      ip6_dst_lookup - perform route lookup on flow
1060  *      @sk: socket which provides route info
1061  *      @dst: pointer to dst_entry * for result
1062  *      @fl6: flow to lookup
1063  *
1064  *      This function performs a route lookup on the given flow.
1065  *
1066  *      It returns zero on success, or a standard errno code on error.
1067  */
1068 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1069                    struct flowi6 *fl6)
1070 {
1071         *dst = NULL;
1072         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1073 }
1074 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
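/* Usage sketch for ip6_dst_lookup(): callers that do not need the xfrm
 * (IPsec) bundle can use it directly; errors come back as an errno, not
 * as an ERR_PTR-encoded dst (names below are illustrative):
 *
 *	struct dst_entry *dst;
 *	int err;
 *
 *	err = ip6_dst_lookup(net, sk, &dst, &fl6);
 *	if (err)
 *		goto out;
 */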
1075
1076 /**
1077  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1078  *      @sk: socket which provides route info
1079  *      @fl6: flow to lookup
1080  *      @final_dst: final destination address for ipsec lookup
1081  *
1082  *      This function performs a route lookup on the given flow.
1083  *
1084  *      It returns a valid dst pointer on success, or a pointer encoded
1085  *      error code.
1086  */
1087 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1088                                       const struct in6_addr *final_dst)
1089 {
1090         struct dst_entry *dst = NULL;
1091         int err;
1092
1093         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1094         if (err)
1095                 return ERR_PTR(err);
1096         if (final_dst)
1097                 fl6->daddr = *final_dst;
1098
1099         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1100 }
1101 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
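/* Usage sketch for ip6_dst_lookup_flow(): unlike ip6_dst_lookup(), the
 * result is ERR_PTR-encoded, so callers must test with IS_ERR().
 * Modelled loosely on a connect() path; final_p is assumed to come from
 * an earlier fl6_update_dst() call and is illustrative:
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 *	if (IS_ERR(dst)) {
 *		err = PTR_ERR(dst);
 *		goto failure;
 *	}
 */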
1102
1103 /**
1104  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1105  *      @sk: socket which provides the dst cache and route info
1106  *      @fl6: flow to lookup
1107  *      @final_dst: final destination address for ipsec lookup
1108  *      @connected: whether @sk is connected or not
1109  *
1110  *      This function performs a route lookup on the given flow with the
1111  *      possibility of using the cached route in the socket if it is valid.
1112  *      It will take the socket dst lock when operating on the dst cache.
1113  *      As a result, this function can only be used in process context.
1114  *
1115  *      In addition, for a connected socket, cache the dst in the socket
1116  *      if the current cache is not valid.
1117  *
1118  *      It returns a valid dst pointer on success, or a pointer encoded
1119  *      error code.
1120  */
1121 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1122                                          const struct in6_addr *final_dst,
1123                                          bool connected)
1124 {
1125         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1126
1127         dst = ip6_sk_dst_check(sk, dst, fl6);
1128         if (dst)
1129                 return dst;
1130
1131         dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1132         if (connected && !IS_ERR(dst))
1133                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1134
1135         return dst;
1136 }
1137 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
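/* Usage sketch for ip6_sk_dst_lookup_flow(), matching this commit's
 * intent: a datagram sendmsg path passes @connected so that a valid dst
 * gets cached on the socket and later sends can skip the full lookup.
 * The connected test shown is an assumption for illustration:
 *
 *	bool connected = sk->sk_state == TCP_ESTABLISHED && !msg->msg_name;
 *
 *	dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, connected);
 *	if (IS_ERR(dst)) {
 *		err = PTR_ERR(dst);
 *		dst = NULL;
 *		goto out;
 *	}
 */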
1138
1139 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1140                                                gfp_t gfp)
1141 {
1142         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1143 }
1144
1145 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1146                                                 gfp_t gfp)
1147 {
1148         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1149 }
1150
1151 static void ip6_append_data_mtu(unsigned int *mtu,
1152                                 int *maxfraglen,
1153                                 unsigned int fragheaderlen,
1154                                 struct sk_buff *skb,
1155                                 struct rt6_info *rt,
1156                                 unsigned int orig_mtu)
1157 {
1158         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1159                 if (!skb) {
1160                         /* first fragment, reserve header_len */
1161                         *mtu = orig_mtu - rt->dst.header_len;
1162
1163                 } else {
1164                         /*
1165                          * this fragment is not the first; the header
1166                          * space is regarded as data space.
1167                          */
1168                         *mtu = orig_mtu;
1169                 }
1170                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1171                               + fragheaderlen - sizeof(struct frag_hdr);
1172         }
1173 }
1174
1175 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1176                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1177                           struct rt6_info *rt, struct flowi6 *fl6)
1178 {
1179         struct ipv6_pinfo *np = inet6_sk(sk);
1180         unsigned int mtu;
1181         struct ipv6_txoptions *opt = ipc6->opt;
1182
1183         /*
1184          * setup for corking
1185          */
1186         if (opt) {
1187                 if (WARN_ON(v6_cork->opt))
1188                         return -EINVAL;
1189
1190                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1191                 if (unlikely(!v6_cork->opt))
1192                         return -ENOBUFS;
1193
1194                 v6_cork->opt->tot_len = sizeof(*opt);
1195                 v6_cork->opt->opt_flen = opt->opt_flen;
1196                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1197
1198                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1199                                                     sk->sk_allocation);
1200                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1201                         return -ENOBUFS;
1202
1203                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1204                                                     sk->sk_allocation);
1205                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1206                         return -ENOBUFS;
1207
1208                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1209                                                    sk->sk_allocation);
1210                 if (opt->hopopt && !v6_cork->opt->hopopt)
1211                         return -ENOBUFS;
1212
1213                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1214                                                     sk->sk_allocation);
1215                 if (opt->srcrt && !v6_cork->opt->srcrt)
1216                         return -ENOBUFS;
1217
1218                 /* need source address above miyazawa */
1219         }
1220         dst_hold(&rt->dst);
1221         cork->base.dst = &rt->dst;
1222         cork->fl.u.ip6 = *fl6;
1223         v6_cork->hop_limit = ipc6->hlimit;
1224         v6_cork->tclass = ipc6->tclass;
1225         if (rt->dst.flags & DST_XFRM_TUNNEL)
1226                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1227                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1228         else
1229                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1230                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1231         if (np->frag_size < mtu) {
1232                 if (np->frag_size)
1233                         mtu = np->frag_size;
1234         }
1235         if (mtu < IPV6_MIN_MTU)
1236                 return -EINVAL;
1237         cork->base.fragsize = mtu;
1238         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1239                 cork->base.flags |= IPCORK_ALLFRAG;
1240         cork->base.length = 0;
1241
1242         return 0;
1243 }
1244
1245 static int __ip6_append_data(struct sock *sk,
1246                              struct flowi6 *fl6,
1247                              struct sk_buff_head *queue,
1248                              struct inet_cork *cork,
1249                              struct inet6_cork *v6_cork,
1250                              struct page_frag *pfrag,
1251                              int getfrag(void *from, char *to, int offset,
1252                                          int len, int odd, struct sk_buff *skb),
1253                              void *from, int length, int transhdrlen,
1254                              unsigned int flags, struct ipcm6_cookie *ipc6,
1255                              const struct sockcm_cookie *sockc)
1256 {
1257         struct sk_buff *skb, *skb_prev = NULL;
1258         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1259         int exthdrlen = 0;
1260         int dst_exthdrlen = 0;
1261         int hh_len;
1262         int copy;
1263         int err;
1264         int offset = 0;
1265         __u8 tx_flags = 0;
1266         u32 tskey = 0;
1267         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1268         struct ipv6_txoptions *opt = v6_cork->opt;
1269         int csummode = CHECKSUM_NONE;
1270         unsigned int maxnonfragsize, headersize;
1271         unsigned int wmem_alloc_delta = 0;
1272
1273         skb = skb_peek_tail(queue);
1274         if (!skb) {
1275                 exthdrlen = opt ? opt->opt_flen : 0;
1276                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1277         }
1278
1279         mtu = cork->fragsize;
1280         orig_mtu = mtu;
1281
1282         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1283
1284         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1285                         (opt ? opt->opt_nflen : 0);
1286         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1287                      sizeof(struct frag_hdr);
1288
1289         headersize = sizeof(struct ipv6hdr) +
1290                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1291                      (dst_allfrag(&rt->dst) ?
1292                       sizeof(struct frag_hdr) : 0) +
1293                      rt->rt6i_nfheader_len;
1294
1295         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1296          * within the first fragment
1297          */
1298         if (headersize + transhdrlen > mtu)
1299                 goto emsgsize;
1300
1301         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1302             (sk->sk_protocol == IPPROTO_UDP ||
1303              sk->sk_protocol == IPPROTO_RAW)) {
1304                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1305                                 sizeof(struct ipv6hdr));
1306                 goto emsgsize;
1307         }
1308
1309         if (ip6_sk_ignore_df(sk))
1310                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1311         else
1312                 maxnonfragsize = mtu;
1313
1314         if (cork->length + length > maxnonfragsize - headersize) {
1315 emsgsize:
1316                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1317                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1318                 return -EMSGSIZE;
1319         }
1320
1321         /* CHECKSUM_PARTIAL only with no extension headers and when
1322          * we are not going to fragment
1323          */
1324         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1325             headersize == sizeof(struct ipv6hdr) &&
1326             length <= mtu - headersize &&
1327             !(flags & MSG_MORE) &&
1328             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1329                 csummode = CHECKSUM_PARTIAL;
1330
1331         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1332                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1333                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1334                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1335                         tskey = sk->sk_tskey++;
1336         }
1337
1338         /*
1339          * Let's try using as much space as possible.
1340          * Use MTU if total length of the message fits into the MTU.
1341          * Otherwise, we need to reserve fragment header and
1342          * fragment alignment (= 8-15 octets, in total).
1343          *
1344          * Note that we may need to "move" the data from the tail
1345          * of the buffer to the new fragment when we split
1346          * the message.
1347          *
1348          * FIXME: It may be fragmented into multiple chunks
1349          *        at once if non-fragmentable extension headers
1350          *        are too large.
1351          * --yoshfuji
1352          */
1353
1354         cork->length += length;
1355         if (!skb)
1356                 goto alloc_new_skb;
1357
1358         while (length > 0) {
1359                 /* Check if the remaining data fits into current packet. */
1360                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1361                 if (copy < length)
1362                         copy = maxfraglen - skb->len;
1363
1364                 if (copy <= 0) {
1365                         char *data;
1366                         unsigned int datalen;
1367                         unsigned int fraglen;
1368                         unsigned int fraggap;
1369                         unsigned int alloclen;
1370 alloc_new_skb:
1371                         /* There's no room in the current skb */
1372                         if (skb)
1373                                 fraggap = skb->len - maxfraglen;
1374                         else
1375                                 fraggap = 0;
1376                         /* update mtu and maxfraglen if necessary */
1377                         if (!skb || !skb_prev)
1378                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1379                                                     fragheaderlen, skb, rt,
1380                                                     orig_mtu);
1381
1382                         skb_prev = skb;
1383
1384                         /*
1385                          * If remaining data exceeds the mtu,
1386                          * we know we need more fragment(s).
1387                          */
1388                         datalen = length + fraggap;
1389
1390                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1391                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1392                         if ((flags & MSG_MORE) &&
1393                             !(rt->dst.dev->features&NETIF_F_SG))
1394                                 alloclen = mtu;
1395                         else
1396                                 alloclen = datalen + fragheaderlen;
1397
1398                         alloclen += dst_exthdrlen;
1399
1400                         if (datalen != length + fraggap) {
1401                                 /*
1402                          * this is not the last fragment; the trailer
1403                                  * space is regarded as data space.
1404                                  */
1405                                 datalen += rt->dst.trailer_len;
1406                         }
1407
1408                         alloclen += rt->dst.trailer_len;
1409                         fraglen = datalen + fragheaderlen;
1410
1411                         /*
1412                          * We just reserve space for fragment header.
1413                          * Note: this may be overallocation if the message
1414                          * (without MSG_MORE) fits into the MTU.
1415                          */
1416                         alloclen += sizeof(struct frag_hdr);
1417
1418                         copy = datalen - transhdrlen - fraggap;
1419                         if (copy < 0) {
1420                                 err = -EINVAL;
1421                                 goto error;
1422                         }
                        if (transhdrlen) {
                                /* First skb: charge the socket directly. */
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                /* Follow-up skbs: plain allocation, accounted
                                 * in bulk via wmem_alloc_delta below, bounded
                                 * by twice the send buffer.
                                 */
                                skb = NULL;
                                if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
                                    2 * sk->sk_sndbuf)
                                        skb = alloc_skb(alloclen + hh_len,
                                                        sk->sk_allocation);
                                if (unlikely(!skb))
                                        err = -ENOBUFS;
                        }
                        if (!skb)
                                goto error;
                        /*
                         *      Fill in the control structures
                         */
                        skb->protocol = htons(ETH_P_IPV6);
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation and IPsec header */
                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
                                    dst_exthdrlen);

                        /* Only the initial fragment is time stamped */
                        skb_shinfo(skb)->tx_flags = tx_flags;
                        tx_flags = 0;
                        skb_shinfo(skb)->tskey = tskey;
                        tskey = 0;

                        /*
                         *      Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen);
                        skb_set_network_header(skb, exthdrlen);
                        data += fragheaderlen;
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }
                        if (copy > 0 &&
                            getfrag(from, data + transhdrlen, offset,
                                    copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        dst_exthdrlen = 0;

                        if ((flags & MSG_CONFIRM) && !skb_prev)
                                skb_set_dst_pending_confirm(skb, 1);

                        /*
                         * Put the packet on the pending queue
                         */
                        if (!skb->destructor) {
                                skb->destructor = sock_wfree;
                                skb->sk = sk;
                                wmem_alloc_delta += skb->truesize;
                        }
                        __skb_queue_tail(queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->dst.dev->features&NETIF_F_SG)) {
                        /* No scatter-gather: copy linearly into the skb. */
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        /* Scatter-gather: land the bytes in the socket's
                         * page fragment and attach it as a page frag.
                         */
                        int i = skb_shinfo(skb)->nr_frags;

                        err = -ENOMEM;
                        if (!sk_page_frag_refill(sk, pfrag))
                                goto error;

                        if (!skb_can_coalesce(skb, i, pfrag->page,
                                              pfrag->offset)) {
                                err = -EMSGSIZE;
                                if (i == MAX_SKB_FRAGS)
                                        goto error;

                                __skb_fill_page_desc(skb, i, pfrag->page,
                                                     pfrag->offset, 0);
                                skb_shinfo(skb)->nr_frags = ++i;
                                get_page(pfrag->page);
                        }
                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
                        if (getfrag(from,
                                    page_address(pfrag->page) + pfrag->offset,
                                    offset, copy, skb->len, skb) < 0)
                                goto error_efault;

                        pfrag->offset += copy;
                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
                        wmem_alloc_delta += copy;
                }
                offset += copy;
                length -= copy;
        }

        refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
        return 0;

error_efault:
        err = -EFAULT;
error:
        cork->length -= length;
        IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
        refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
        return err;
}
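
/*
 * Illustrative only (not part of the original file): a minimal
 * getfrag callback of the shape __ip6_append_data() expects, modeled
 * on ip_generic_getfrag() in net/ipv4/ip_output.c.  Treating "from"
 * as a struct msghdr is this caller's convention, not a requirement;
 * "offset" is unused because the iov_iter advances itself; "odd" is
 * the destination offset inside the skb, which lets csum_block_add()
 * rotate the partial checksum correctly.  Assumes <linux/uio.h> for
 * the iterator helpers.
 */
static int example_getfrag(void *from, char *to, int offset, int len,
                           int odd, struct sk_buff *skb)
{
        struct msghdr *msg = from;

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                /* Hardware will checksum: a plain copy is enough. */
                if (!copy_from_iter_full(to, len, &msg->msg_iter))
                        return -EFAULT;
        } else {
                /* Copy and fold the checksum in a single pass. */
                __wsum csum = 0;

                if (!csum_and_copy_from_iter_full(to, len, &csum,
                                                  &msg->msg_iter))
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, odd);
        }
        return 0;
}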

int ip6_append_data(struct sock *sk,
                    int getfrag(void *from, char *to, int offset, int len,
                                int odd, struct sk_buff *skb),
                    void *from, int length, int transhdrlen,
                    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
                    struct rt6_info *rt, unsigned int flags,
                    const struct sockcm_cookie *sockc)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        int exthdrlen;
        int err;

        if (flags&MSG_PROBE)
                return 0;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                err = ip6_setup_cork(sk, &inet->cork, &np->cork,
                                     ipc6, rt, fl6);
                if (err)
                        return err;

                exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                fl6 = &inet->cork.fl.u.ip6;
                transhdrlen = 0;
        }

        return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
                                 &np->cork, sk_page_frag(sk), getfrag,
                                 from, length, transhdrlen, flags, ipc6, sockc);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
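
/*
 * Illustrative usage sketch (not part of the original file): the
 * corked transmit pattern that datagram protocols such as UDPv6 and
 * raw sockets build on top of ip6_append_data().  Everything except
 * the ip6_* calls and example_getfrag() from the sketch above is
 * hypothetical, and cork/route setup is elided; see udpv6_sendmsg()
 * in net/ipv6/udp.c for the real thing.
 */
static int example_corked_send(struct sock *sk, struct msghdr *msg,
                               size_t len, struct ipcm6_cookie *ipc6,
                               struct flowi6 *fl6, struct rt6_info *rt,
                               const struct sockcm_cookie *sockc)
{
        int err;

        lock_sock(sk);
        err = ip6_append_data(sk, example_getfrag, msg, len,
                              0 /* transhdrlen */, ipc6, fl6, rt,
                              msg->msg_flags, sockc);
        if (err)
                /* Drop everything queued so far and release the cork. */
                ip6_flush_pending_frames(sk);
        else if (!(msg->msg_flags & MSG_MORE))
                /* Assemble one datagram from the queue and send it. */
                err = ip6_push_pending_frames(sk);
        release_sock(sk);
        return err;
}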

static void ip6_cork_release(struct inet_cork_full *cork,
                             struct inet6_cork *v6_cork)
{
        if (v6_cork->opt) {
                kfree(v6_cork->opt->dst0opt);
                kfree(v6_cork->opt->dst1opt);
                kfree(v6_cork->opt->hopopt);
                kfree(v6_cork->opt->srcrt);
                kfree(v6_cork->opt);
                v6_cork->opt = NULL;
        }

        if (cork->base.dst) {
                dst_release(cork->base.dst);
                cork->base.dst = NULL;
                cork->base.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&cork->fl, 0, sizeof(cork->fl));
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
                               struct sk_buff_head *queue,
                               struct inet_cork_full *cork,
                               struct inet6_cork *v6_cork)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct net *net = sock_net(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = v6_cork->opt;
        struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
        struct flowi6 *fl6 = &cork->fl.u.ip6;
        unsigned char proto = fl6->flowi6_proto;

        skb = __skb_dequeue(queue);
        if (!skb)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to the IP header, past the extension headers */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        /* Chain any remaining queued skbs onto the first one's frag_list
         * so the cork drains into a single super-packet.
         */
        while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Allow local fragmentation. */
        skb->ignore_df = ip6_sk_ignore_df(sk);

        *final_dst = fl6->daddr;
        __skb_pull(skb, skb_network_header_len(skb));
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        ip6_flow_hdr(hdr, v6_cork->tclass,
                     ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                        ip6_autoflowlabel(net, np), fl6));
        hdr->hop_limit = v6_cork->hop_limit;
        hdr->nexthdr = proto;
        hdr->saddr = fl6->saddr;
        hdr->daddr = *final_dst;

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        skb_dst_set(skb, dst_clone(&rt->dst));
        IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
        if (proto == IPPROTO_ICMPV6) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
        }

        ip6_cork_release(cork, v6_cork);
out:
        return skb;
}
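
/*
 * Illustrative note (not in the original source): after the loop in
 * __ip6_make_skb() above, a cork queue of three buffers
 *
 *      skb0 -> skb1 -> skb2
 *
 * has become a single skb whose frag_list carries the follow-ons:
 *
 *      skb0 { len = len0 + len1 + len2, frag_list: skb1 -> skb2 }
 *
 * When the packet exceeds the path MTU, ip6_fragment() can then take
 * its fast path and emit each frag_list element as one fragment,
 * provided every element already fits within the MTU.
 */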

int ip6_send_skb(struct sk_buff *skb)
{
        struct net *net = sock_net(skb->sk);
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        int err;

        err = ip6_local_out(net, skb->sk, skb);
        if (err) {
                if (err > 0)
                        err = net_xmit_errno(err);
                if (err)
                        IP6_INC_STATS(net, rt->rt6i_idev,
                                      IPSTATS_MIB_OUTDISCARDS);
        }

        return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb;

        skb = ip6_finish_skb(sk);
        if (!skb)
                return 0;

        return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
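
/*
 * Reading aid (believed to match the definition in include/net/ipv6.h
 * for this tree; the header remains authoritative): ip6_finish_skb()
 * used above is simply __ip6_make_skb() applied to the socket's own
 * write queue and cork state.
 *
 *      static inline struct sk_buff *ip6_finish_skb(struct sock *sk)
 *      {
 *              return __ip6_make_skb(sk, &sk->sk_write_queue,
 *                                    &inet_sk(sk)->cork,
 *                                    &inet6_sk(sk)->cork);
 *      }
 */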

static void __ip6_flush_pending_frames(struct sock *sk,
                                       struct sk_buff_head *queue,
                                       struct inet_cork_full *cork,
                                       struct inet6_cork *v6_cork)
{
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(queue)) != NULL) {
                if (skb_dst(skb))
                        IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }

        ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
        __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
                                   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, int length, int transhdrlen,
                             struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
                             struct rt6_info *rt, unsigned int flags,
                             const struct sockcm_cookie *sockc)
{
        struct inet_cork_full cork;
        struct inet6_cork v6_cork;
        struct sk_buff_head queue;
        int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
        int err;

        if (flags & MSG_PROBE)
                return NULL;

        __skb_queue_head_init(&queue);

        cork.base.flags = 0;
        cork.base.addr = 0;
        cork.base.opt = NULL;
        cork.base.dst = NULL;
        v6_cork.opt = NULL;
        err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
        if (err) {
                ip6_cork_release(&cork, &v6_cork);
                return ERR_PTR(err);
        }
        if (ipc6->dontfrag < 0)
                ipc6->dontfrag = inet6_sk(sk)->dontfrag;

        err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
                                &current->task_frag, getfrag, from,
                                length + exthdrlen, transhdrlen + exthdrlen,
                                flags, ipc6, sockc);
        if (err) {
                __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
                return ERR_PTR(err);
        }

        return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}
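
/*
 * Illustrative usage sketch (not part of the original file): the
 * uncorked fast path pairs ip6_make_skb() with ip6_send_skb(),
 * assembling the whole datagram on a private queue without touching
 * the socket cork, the shape udpv6_sendmsg() uses when nothing is
 * pending and MSG_MORE is not set.  Identifiers other than the
 * ip6_* calls and example_getfrag() from the earlier sketch are
 * hypothetical; transport header and checksum setup is elided
 * (cf. udp_v6_send_skb() in net/ipv6/udp.c).
 */
static int example_uncorked_send(struct sock *sk, struct msghdr *msg,
                                 size_t len, struct ipcm6_cookie *ipc6,
                                 struct flowi6 *fl6, struct rt6_info *rt,
                                 const struct sockcm_cookie *sockc)
{
        struct sk_buff *skb;

        skb = ip6_make_skb(sk, example_getfrag, msg, len,
                           0 /* transhdrlen */, ipc6, fl6, rt,
                           msg->msg_flags, sockc);
        if (IS_ERR_OR_NULL(skb))
                /* NULL means MSG_PROBE: nothing to send, no error. */
                return PTR_ERR_OR_ZERO(skb);

        return ip6_send_skb(skb);
}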