Merge tag 'mm-hotfixes-stable-2022-09-26' of git://git.kernel.org/pub/scm/linux/kerne...
[linux-2.6-microblaze.git] / net / ipv6 / ip6_output.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *      IPv6 output functions
4  *      Linux INET6 implementation
5  *
6  *      Authors:
7  *      Pedro Roque             <roque@di.fc.ul.pt>
8  *
9  *      Based on linux/net/ipv4/ip_output.c
10  *
11  *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
13  *                              extension headers are implemented.
14  *                              route changes now work.
15  *                              ip6_forward does not confuse sniffers.
16  *                              etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *      Imran Patel     :       frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *                      :       add ip6_append_data and related functions
22  *                              for datagram xmit
23  */
24
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41
42 #include <net/sock.h>
43 #include <net/snmp.h>
44
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
58
/* Last step of the IPv6 output path: loop back multicast copies where
 * required, optionally hand the packet to a lightweight tunnel, then
 * resolve the next-hop neighbour and queue the skb on the device.
 * Consumes the skb on every path; returns 0 or a negative errno.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		/* Grow headroom for the link-layer header; may replace skb,
		 * and frees the original on failure.
		 */
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (ipv6_addr_is_multicast(daddr)) {
		/* Loop a copy back to local listeners when the socket asked
		 * for multicast loopback and either a multicast-router socket
		 * exists (and the packet was not already forwarded) or this
		 * host is a member of the destination group on the device.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* hop_limit 0: the looped-back copy is delivered,
			 * but the packet never goes on the wire.
			 */
			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		/* Node-local scope multicast must never leave the node. */
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		/* Tunnel consumed the packet (or failed); nothing left to do. */
		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

	if (unlikely(IS_ERR_OR_NULL(neigh))) {
		if (unlikely(!neigh))
			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
		if (IS_ERR(neigh)) {
			rcu_read_unlock_bh();
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
			return -EINVAL;
		}
	}
	sock_confirm_neigh(skb, neigh);
	ret = neigh_output(neigh, skb, false);
	rcu_read_unlock_bh();
	return ret;
}
138
/* GSO packet whose segments exceed the egress MTU: segment it in
 * software and push each resulting packet through ip6_fragment().
 * Returns 0, or the first error encountered while still attempting the
 * remaining segments.  (@mtu is currently unused here — fragmentation
 * re-derives the MTU from the dst; kept for symmetry with the caller.)
 */
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	/* Mask out GSO features so the result is fully segmented skbs. */
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	/* Original skb is no longer needed once segmentation succeeded. */
	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
		/* Remember the first failure but keep sending segments. */
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}
171
/* Output step after POST_ROUTING: reroute through xfrm when SNAT yielded
 * a new policy, otherwise fragment or transmit based on the path MTU.
 */
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	/* GSO packet whose segments would not fit the MTU (and which is
	 * not a fake jumbogram): take the software-segmentation slow path.
	 */
	if (skb_is_gso(skb) &&
	    !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
	    !skb_gso_validate_network_len(skb, mtu))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	/* Fragment for: oversized non-GSO packets, dsts that always want
	 * fragmentation, or when conntrack recorded a smaller inbound
	 * fragment size we must not exceed.
	 */
	if ((skb->len > mtu && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}
197
/* Run the cgroup BPF egress program, then continue output unless the
 * program rejected the packet.
 */
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	switch (ret) {
	case NET_XMIT_SUCCESS:
	case NET_XMIT_CN:
		/* On successful output, propagate a possible CN verdict. */
		return __ip6_finish_output(net, sk, skb) ? : ret;
	default:
		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
		return ret;
	}
}
212
/* dst_output() entry point for IPv6: drop if IPv6 is administratively
 * disabled on the egress device, otherwise run the POST_ROUTING hook
 * (skipped for packets already rerouted through xfrm) and finish output.
 */
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, indev, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
EXPORT_SYMBOL(ip6_output);
233
234 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
235 {
236         if (!np->autoflowlabel_set)
237                 return ip6_default_np_autolabel(net);
238         else
239                 return np->autoflowlabel;
240 }
241
/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 *
 * Pushes extension headers and the IPv6 header onto @skb (inserting a
 * hop-by-hop jumbo option when the payload exceeds IPV6_MAXPLEN), then
 * sends it through the LOCAL_OUT netfilter hook.  Returns 0 on success,
 * -EMSGSIZE when the packet exceeds the path MTU and cannot be sent,
 * or -ENOBUFS on headroom-allocation failure.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	struct hop_jumbo_hdr *hop_jumbo;
	int hoplen = sizeof(*hop_jumbo);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	/* Worst-case headroom: IPv6 header, jumbo HBH option, device
	 * link-layer header, plus any extension headers requested.
	 */
	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		/* May replace skb; frees the original and returns NULL
		 * on failure.
		 */
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOBUFS;
		}
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		/* Fragmentable part (destination options) first ... */
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		/* ... then the unfragmentable part, which may rewrite
		 * first_hop when a routing header is present.
		 */
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	if (unlikely(seg_len > IPV6_MAXPLEN)) {
		/* Payload does not fit the 16-bit length field: insert a
		 * hop-by-hop jumbogram option and use payload_len == 0.
		 */
		hop_jumbo = skb_push(skb, hoplen);

		hop_jumbo->nexthdr = proto;
		hop_jumbo->hdrlen = 0;
		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
		hop_jumbo->tlv_len = 4;
		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);

		proto = IPPROTO_HOPOPTS;
		seg_len = 0;
		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dev,
			       dst_output);
	}

	/* Packet is too big and may not be fragmented/ignored: report
	 * EMSGSIZE to the local sender and drop.
	 */
	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
359
/* Deliver a Router Alert packet to every raw socket registered for alert
 * value @sel.  Earlier matches get a clone; the last match receives the
 * original skb.  Returns 1 when at least one socket took the packet (skb
 * consumed), 0 otherwise (caller keeps ownership).
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		/* Match the alert value, honoring any device binding. */
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			/* Optionally confine RA delivery to sockets in the
			 * receiving device's netns.
			 */
			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
394
/* Decide how to treat a packet whose destination matched a proxy-NDP
 * entry.  Returns 1 to deliver locally (unicast NDISC messages for the
 * proxied address), 0 to continue forwarding, -1 to drop (link-local
 * destination that cannot be proxied).
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Walk any extension headers to find the upper-layer protocol. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Need at least the ICMPv6 type byte in linear data. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
446
/* Final forwarding step after NF_INET_FORWARD: bump forwarding stats and
 * hand the packet to the output path, unless hardware already forwarded it.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	/* Packet was already forwarded by an L3-offloading device. */
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	/* Receive timestamp must not leak into the transmit path. */
	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}
465
466 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
467 {
468         if (skb->len <= mtu)
469                 return false;
470
471         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
472         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
473                 return true;
474
475         if (skb->ignore_df)
476                 return false;
477
478         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
479                 return false;
480
481         return true;
482 }
483
/* Forward a received IPv6 packet: validate it, decrement the hop limit,
 * possibly emit Redirect or Packet Too Big messages, then hand the skb
 * to the FORWARD netfilter hook.  Consumes the skb on every path.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	/* idev of the *input* interface, used for stats and policy checks */
	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	/* Locally owned skbs must never be forwarded. */
	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!net->ipv6.devconf_all->disable_policy &&
	    (!idev || !idev->cnf.disable_policy) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0) {
			/* Proxied NDISC message: deliver locally instead. */
			hdr->hop_limit--;
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	/* xfrm6_route_forward() may have replaced the route */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* skb_cow() may have reallocated the header */
	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}
647
648 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
649 {
650         to->pkt_type = from->pkt_type;
651         to->priority = from->priority;
652         to->protocol = from->protocol;
653         skb_dst_drop(to);
654         skb_dst_set(to, dst_clone(skb_dst(from)));
655         to->dev = from->dev;
656         to->mark = from->mark;
657
658         skb_copy_hash(to, from);
659
660 #ifdef CONFIG_NET_SCHED
661         to->tc_index = from->tc_index;
662 #endif
663         nf_copy(to, from);
664         skb_ext_copy(to, from);
665         skb_copy_secmark(to, from);
666 }
667
/* Fast-path fragmentation setup over an existing frag_list: save a copy
 * of the network headers, detach the frag list into @iter, and rewrite
 * the first skb in place as fragment 0 with a fragment header inserted.
 * Returns 0, or -ENOMEM; on success the caller owns iter->tmp_hdr and
 * must kfree() it when iteration is finished.
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	/* Take over the frag list; each entry becomes its own fragment. */
	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	/* Open a gap after the unfragmentable headers, place the fragment
	 * header there, then restore the saved headers in front of it.
	 */
	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);	/* first fragment: offset 0, MF set */
	fh->identification = frag_id;

	/* Trim the first skb down to its own head + page data. */
	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);
708
/* Prepare the next queued fragment in @iter for transmission: prepend
 * the saved network headers and a fragment header, fill in the offset
 * and MF flag, and copy metadata from @skb (the fragment just sent).
 */
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	/* Advance the offset by the payload size of the previous fragment. */
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	/* MF is set on every fragment except the last in the list. */
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);
733
734 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
735                    unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
736                    u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
737 {
738         state->prevhdr = prevhdr;
739         state->nexthdr = nexthdr;
740         state->frag_id = frag_id;
741
742         state->hlen = hlen;
743         state->mtu = mtu;
744
745         state->left = skb->len - hlen;  /* Space per frame */
746         state->ptr = hlen;              /* Where to start from */
747
748         state->hroom = hdr_room;
749         state->troom = needed_tailroom;
750
751         state->offset = 0;
752 }
753 EXPORT_SYMBOL(ip6_frag_init);
754
/* Slow-path fragmentation: allocate and build the next fragment of @skb
 * as described by @state, advancing the state's cursor.  Returns the new
 * fragment, or ERR_PTR(-ENOMEM) on allocation failure.
 */
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	/* Patch the copied headers so the previous next-header field now
	 * points at the fragment header.
	 */
	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	/* MF is set while any payload remains. */
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);
830
/* ip6_fragment - split @skb into fragments that fit the path MTU and pass
 * each one to @output.  Takes the zero-copy "fast path" when @skb already
 * carries a frag list with correctly sized fragments; otherwise falls back
 * to the copying slow path via ip6_frag_init()/ip6_frag_next().
 * Consumes @skb on both success and failure; returns 0 or negative errno.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	/* Only honour per-socket fragmentation settings for locally
	 * generated traffic; on a recursive/tunnel xmit path skb->sk may
	 * not be an IPv6 socket, so leave np NULL there.
	 */
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	bool mono_delivery_time = skb->mono_delivery_time;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	/* Remember prevhdr as an offset: skb_checksum_help() below may
	 * reallocate the header and invalidate the raw pointer.
	 */
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	/* A nonzero np->frag_size lowers the fragment size further. */
	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	/* Need room for the unfragmentable part, a fragment header and at
	 * least 8 octets of payload per fragment.
	 */
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	/* From here on, mtu is the per-fragment payload budget. */
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	/* Partial checksums cannot survive fragmentation; finish them now. */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	/* Re-derive prevhdr from the (possibly moved) header. */
	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		/* Fast path requires: first chunk fits the MTU, all but the
		 * last fragment are multiples of 8, nothing is cloned, and
		 * there is headroom for link layer + fragment header.
		 */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			/* Move socket ownership onto each fragment so wmem
			 * accounting follows them individually; undone in
			 * slow_path_clean on bail-out.
			 */
			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		/* Output failed mid-stream: drop the unsent fragments. */
		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		/* Undo the ownership/truesize transfer for the fragments
		 * we already walked before bailing to the slow path.
		 */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	/* Slow path copied everything out; the original skb is done. */
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_gso_disable(skb->sk);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
1014
1015 static inline int ip6_rt_check(const struct rt6key *rt_key,
1016                                const struct in6_addr *fl_addr,
1017                                const struct in6_addr *addr_cache)
1018 {
1019         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1020                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1021 }
1022
/* Validate the socket's cached @dst against flow @fl6.  Returns @dst (with
 * its reference kept) when still usable, or NULL after dropping the
 * reference when it is stale or not an IPv6 route.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	/* A dual-stack socket may have cached an IPv4 route; discard it. */
	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
1068
/* Core of the route lookup: fill *@dst for flow @fl6, selecting a source
 * address when the flow has none.  On success returns 0 with a referenced
 * *@dst; on failure returns a negative errno with *@dst set to NULL.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		/* rt->from is RCU-protected; hold the read lock across
		 * the source-address selection that dereferences it.
		 */
		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	/* Retry (or do the first lookup) now that saddr is known. */
	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* Reject mixing a v4-mapped source with a real IPv6 destination. */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1184
1185 /**
1186  *      ip6_dst_lookup - perform route lookup on flow
1187  *      @net: Network namespace to perform lookup in
1188  *      @sk: socket which provides route info
1189  *      @dst: pointer to dst_entry * for result
1190  *      @fl6: flow to lookup
1191  *
1192  *      This function performs a route lookup on the given flow.
1193  *
1194  *      It returns zero on success, or a standard errno code on error.
1195  */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	/* Start from a clean slate; the tail helper fills *dst on success
	 * and guarantees *dst == NULL on failure.
	 */
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1203
1204 /**
1205  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1206  *      @net: Network namespace to perform lookup in
1207  *      @sk: socket which provides route info
1208  *      @fl6: flow to lookup
1209  *      @final_dst: final destination address for ipsec lookup
1210  *
1211  *      This function performs a route lookup on the given flow.
1212  *
1213  *      It returns a valid dst pointer on success, or a pointer encoded
1214  *      error code.
1215  */
1216 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1217                                       const struct in6_addr *final_dst)
1218 {
1219         struct dst_entry *dst = NULL;
1220         int err;
1221
1222         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1223         if (err)
1224                 return ERR_PTR(err);
1225         if (final_dst)
1226                 fl6->daddr = *final_dst;
1227
1228         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1229 }
1230 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1231
1232 /**
1233  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1234  *      @sk: socket which provides the dst cache and route info
1235  *      @fl6: flow to lookup
1236  *      @final_dst: final destination address for ipsec lookup
1237  *      @connected: whether @sk is connected or not
1238  *
1239  *      This function performs a route lookup on the given flow with the
1240  *      possibility of using the cached route in the socket if it is valid.
1241  *      It will take the socket dst lock when operating on the dst cache.
1242  *      As a result, this function can only be used in process context.
1243  *
1244  *      In addition, for a connected socket, cache the dst in the socket
1245  *      if the current cache is not valid.
1246  *
1247  *      It returns a valid dst pointer on success, or a pointer encoded
1248  *      error code.
1249  */
1250 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1251                                          const struct in6_addr *final_dst,
1252                                          bool connected)
1253 {
1254         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1255
1256         dst = ip6_sk_dst_check(sk, dst, fl6);
1257         if (dst)
1258                 return dst;
1259
1260         dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1261         if (connected && !IS_ERR(dst))
1262                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1263
1264         return dst;
1265 }
1266 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1267
1268 /**
1269  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1270  *      @skb: Packet for which lookup is done
1271  *      @dev: Tunnel device
1272  *      @net: Network namespace of tunnel device
1273  *      @sock: Socket which provides route info
1274  *      @saddr: Memory to store the src ip address
1275  *      @info: Tunnel information
1276  *      @protocol: IP protocol
 *      @use_cache: Flag to enable cache usage
 *
 *      This function performs a route lookup on a tunnel.
1279  *
1280  *      It returns a valid dst pointer and stores src address to be used in
1281  *      tunnel in param saddr on success, else a pointer encoded error code.
1282  */
1283
struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
					struct net_device *dev,
					struct net *net,
					struct socket *sock,
					struct in6_addr *saddr,
					const struct ip_tunnel_info *info,
					u8 protocol,
					bool use_cache)
{
	struct dst_entry *dst = NULL;
#ifdef CONFIG_DST_CACHE
	struct dst_cache *dst_cache;
#endif
	struct flowi6 fl6;
	__u8 prio;

#ifdef CONFIG_DST_CACHE
	/* Cast away const: the per-tunnel dst cache is mutable state
	 * embedded in the otherwise read-only tunnel info.
	 */
	dst_cache = (struct dst_cache *)&info->dst_cache;
	if (use_cache) {
		dst = dst_cache_get_ip6(dst_cache, saddr);
		if (dst)
			return dst;
	}
#endif
	/* Build the flow from the tunnel key (outer addresses, TOS, label). */
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = protocol;
	fl6.daddr = info->key.u.ipv6.dst;
	fl6.saddr = info->key.u.ipv6.src;
	prio = info->key.tos;
	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);

	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
					      NULL);
	if (IS_ERR(dst)) {
		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
		return ERR_PTR(-ENETUNREACH);
	}
	/* A route pointing back at the tunnel device would loop packets. */
	if (dst->dev == dev) { /* is this necessary? */
		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
		dst_release(dst);
		return ERR_PTR(-ELOOP);
	}
#ifdef CONFIG_DST_CACHE
	if (use_cache)
		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
#endif
	/* Report the source address chosen during the lookup. */
	*saddr = fl6.saddr;
	return dst;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1335
1336 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1337                                                gfp_t gfp)
1338 {
1339         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1340 }
1341
1342 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1343                                                 gfp_t gfp)
1344 {
1345         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1346 }
1347
1348 static void ip6_append_data_mtu(unsigned int *mtu,
1349                                 int *maxfraglen,
1350                                 unsigned int fragheaderlen,
1351                                 struct sk_buff *skb,
1352                                 struct rt6_info *rt,
1353                                 unsigned int orig_mtu)
1354 {
1355         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1356                 if (!skb) {
1357                         /* first fragment, reserve header_len */
1358                         *mtu = orig_mtu - rt->dst.header_len;
1359
1360                 } else {
1361                         /*
1362                          * this fragment is not first, the headers
1363                          * space is regarded as data space.
1364                          */
1365                         *mtu = orig_mtu;
1366                 }
1367                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1368                               + fragheaderlen - sizeof(struct frag_hdr);
1369         }
1370 }
1371
/* Initialise the cork state for a corked send: duplicate the caller's
 * tx options into @v6_cork and record mtu/flags in @cork.  Returns 0 or a
 * negative errno; on partial option-duplication failure the already
 * allocated pieces are left for ip6_cork_release() to free.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *nopt, *opt = ipc6->opt;

	/* callers pass dst together with a reference, set it first so
	 * ip6_cork_release() can put it down even in case of an error.
	 */
	cork->base.dst = &rt->dst;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!nopt))
			return -ENOBUFS;

		nopt->tot_len = sizeof(*opt);
		nopt->opt_flen = opt->opt_flen;
		nopt->opt_nflen = opt->opt_nflen;

		/* Each dup below may fail individually; successful ones are
		 * reachable via v6_cork->opt and freed by the cork teardown.
		 */
		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
		if (opt->dst0opt && !nopt->dst0opt)
			return -ENOBUFS;

		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
		if (opt->dst1opt && !nopt->dst1opt)
			return -ENOBUFS;

		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
		if (opt->hopopt && !nopt->hopopt)
			return -ENOBUFS;

		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
		if (opt->srcrt && !nopt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	/* With PMTU probing use the device mtu; otherwise the dst mtu.
	 * For non-tunnel dsts, take the mtu of the xfrm path's final dst.
	 */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	/* A nonzero per-socket frag_size lowers the fragment size further. */
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}
1444
1445 static int __ip6_append_data(struct sock *sk,
1446                              struct sk_buff_head *queue,
1447                              struct inet_cork_full *cork_full,
1448                              struct inet6_cork *v6_cork,
1449                              struct page_frag *pfrag,
1450                              int getfrag(void *from, char *to, int offset,
1451                                          int len, int odd, struct sk_buff *skb),
1452                              void *from, size_t length, int transhdrlen,
1453                              unsigned int flags, struct ipcm6_cookie *ipc6)
1454 {
1455         struct sk_buff *skb, *skb_prev = NULL;
1456         struct inet_cork *cork = &cork_full->base;
1457         struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1458         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1459         struct ubuf_info *uarg = NULL;
1460         int exthdrlen = 0;
1461         int dst_exthdrlen = 0;
1462         int hh_len;
1463         int copy;
1464         int err;
1465         int offset = 0;
1466         bool zc = false;
1467         u32 tskey = 0;
1468         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1469         struct ipv6_txoptions *opt = v6_cork->opt;
1470         int csummode = CHECKSUM_NONE;
1471         unsigned int maxnonfragsize, headersize;
1472         unsigned int wmem_alloc_delta = 0;
1473         bool paged, extra_uref = false;
1474
1475         skb = skb_peek_tail(queue);
1476         if (!skb) {
1477                 exthdrlen = opt ? opt->opt_flen : 0;
1478                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1479         }
1480
1481         paged = !!cork->gso_size;
1482         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1483         orig_mtu = mtu;
1484
1485         if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1486             sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1487                 tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1488
1489         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1490
1491         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1492                         (opt ? opt->opt_nflen : 0);
1493
1494         headersize = sizeof(struct ipv6hdr) +
1495                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1496                      (dst_allfrag(&rt->dst) ?
1497                       sizeof(struct frag_hdr) : 0) +
1498                      rt->rt6i_nfheader_len;
1499
1500         if (mtu <= fragheaderlen ||
1501             ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1502                 goto emsgsize;
1503
1504         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1505                      sizeof(struct frag_hdr);
1506
1507         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1508          * the first fragment
1509          */
1510         if (headersize + transhdrlen > mtu)
1511                 goto emsgsize;
1512
1513         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1514             (sk->sk_protocol == IPPROTO_UDP ||
1515              sk->sk_protocol == IPPROTO_ICMPV6 ||
1516              sk->sk_protocol == IPPROTO_RAW)) {
1517                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1518                                 sizeof(struct ipv6hdr));
1519                 goto emsgsize;
1520         }
1521
1522         if (ip6_sk_ignore_df(sk))
1523                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1524         else
1525                 maxnonfragsize = mtu;
1526
1527         if (cork->length + length > maxnonfragsize - headersize) {
1528 emsgsize:
1529                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1530                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1531                 return -EMSGSIZE;
1532         }
1533
1534         /* CHECKSUM_PARTIAL only with no extension headers and when
1535          * we are not going to fragment
1536          */
1537         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1538             headersize == sizeof(struct ipv6hdr) &&
1539             length <= mtu - headersize &&
1540             (!(flags & MSG_MORE) || cork->gso_size) &&
1541             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1542                 csummode = CHECKSUM_PARTIAL;
1543
1544         if ((flags & MSG_ZEROCOPY) && length) {
1545                 struct msghdr *msg = from;
1546
1547                 if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1548                         if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1549                                 return -EINVAL;
1550
1551                         /* Leave uarg NULL if can't zerocopy, callers should
1552                          * be able to handle it.
1553                          */
1554                         if ((rt->dst.dev->features & NETIF_F_SG) &&
1555                             csummode == CHECKSUM_PARTIAL) {
1556                                 paged = true;
1557                                 zc = true;
1558                                 uarg = msg->msg_ubuf;
1559                         }
1560                 } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1561                         uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1562                         if (!uarg)
1563                                 return -ENOBUFS;
1564                         extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1565                         if (rt->dst.dev->features & NETIF_F_SG &&
1566                             csummode == CHECKSUM_PARTIAL) {
1567                                 paged = true;
1568                                 zc = true;
1569                         } else {
1570                                 uarg->zerocopy = 0;
1571                                 skb_zcopy_set(skb, uarg, &extra_uref);
1572                         }
1573                 }
1574         }
1575
1576         /*
1577          * Let's try using as much space as possible.
1578          * Use MTU if total length of the message fits into the MTU.
1579          * Otherwise, we need to reserve fragment header and
1580          * fragment alignment (= 8-15 octects, in total).
1581          *
1582          * Note that we may need to "move" the data from the tail
1583          * of the buffer to the new fragment when we split
1584          * the message.
1585          *
1586          * FIXME: It may be fragmented into multiple chunks
1587          *        at once if non-fragmentable extension headers
1588          *        are too large.
1589          * --yoshfuji
1590          */
1591
1592         cork->length += length;
1593         if (!skb)
1594                 goto alloc_new_skb;
1595
1596         while (length > 0) {
1597                 /* Check if the remaining data fits into current packet. */
1598                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1599                 if (copy < length)
1600                         copy = maxfraglen - skb->len;
1601
1602                 if (copy <= 0) {
1603                         char *data;
1604                         unsigned int datalen;
1605                         unsigned int fraglen;
1606                         unsigned int fraggap;
1607                         unsigned int alloclen, alloc_extra;
1608                         unsigned int pagedlen;
1609 alloc_new_skb:
1610                         /* There's no room in the current skb */
1611                         if (skb)
1612                                 fraggap = skb->len - maxfraglen;
1613                         else
1614                                 fraggap = 0;
1615                         /* update mtu and maxfraglen if necessary */
1616                         if (!skb || !skb_prev)
1617                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1618                                                     fragheaderlen, skb, rt,
1619                                                     orig_mtu);
1620
1621                         skb_prev = skb;
1622
1623                         /*
1624                          * If remaining data exceeds the mtu,
1625                          * we know we need more fragment(s).
1626                          */
1627                         datalen = length + fraggap;
1628
1629                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1630                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1631                         fraglen = datalen + fragheaderlen;
1632                         pagedlen = 0;
1633
1634                         alloc_extra = hh_len;
1635                         alloc_extra += dst_exthdrlen;
1636                         alloc_extra += rt->dst.trailer_len;
1637
1638                         /* We just reserve space for fragment header.
1639                          * Note: this may be overallocation if the message
1640                          * (without MSG_MORE) fits into the MTU.
1641                          */
1642                         alloc_extra += sizeof(struct frag_hdr);
1643
1644                         if ((flags & MSG_MORE) &&
1645                             !(rt->dst.dev->features&NETIF_F_SG))
1646                                 alloclen = mtu;
1647                         else if (!paged &&
1648                                  (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1649                                   !(rt->dst.dev->features & NETIF_F_SG)))
1650                                 alloclen = fraglen;
1651                         else if (!zc) {
1652                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1653                                 pagedlen = fraglen - alloclen;
1654                         } else {
1655                                 alloclen = fragheaderlen + transhdrlen;
1656                                 pagedlen = datalen - transhdrlen;
1657                         }
1658                         alloclen += alloc_extra;
1659
1660                         if (datalen != length + fraggap) {
1661                                 /*
1662                                  * this is not the last fragment, the trailer
1663                                  * space is regarded as data space.
1664                                  */
1665                                 datalen += rt->dst.trailer_len;
1666                         }
1667
1668                         fraglen = datalen + fragheaderlen;
1669
1670                         copy = datalen - transhdrlen - fraggap - pagedlen;
1671                         if (copy < 0) {
1672                                 err = -EINVAL;
1673                                 goto error;
1674                         }
1675                         if (transhdrlen) {
1676                                 skb = sock_alloc_send_skb(sk, alloclen,
1677                                                 (flags & MSG_DONTWAIT), &err);
1678                         } else {
1679                                 skb = NULL;
1680                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1681                                     2 * sk->sk_sndbuf)
1682                                         skb = alloc_skb(alloclen,
1683                                                         sk->sk_allocation);
1684                                 if (unlikely(!skb))
1685                                         err = -ENOBUFS;
1686                         }
1687                         if (!skb)
1688                                 goto error;
1689                         /*
1690                          *      Fill in the control structures
1691                          */
1692                         skb->protocol = htons(ETH_P_IPV6);
1693                         skb->ip_summed = csummode;
1694                         skb->csum = 0;
1695                         /* reserve for fragmentation and ipsec header */
1696                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1697                                     dst_exthdrlen);
1698
1699                         /*
1700                          *      Find where to start putting bytes
1701                          */
1702                         data = skb_put(skb, fraglen - pagedlen);
1703                         skb_set_network_header(skb, exthdrlen);
1704                         data += fragheaderlen;
1705                         skb->transport_header = (skb->network_header +
1706                                                  fragheaderlen);
1707                         if (fraggap) {
1708                                 skb->csum = skb_copy_and_csum_bits(
1709                                         skb_prev, maxfraglen,
1710                                         data + transhdrlen, fraggap);
1711                                 skb_prev->csum = csum_sub(skb_prev->csum,
1712                                                           skb->csum);
1713                                 data += fraggap;
1714                                 pskb_trim_unique(skb_prev, maxfraglen);
1715                         }
1716                         if (copy > 0 &&
1717                             getfrag(from, data + transhdrlen, offset,
1718                                     copy, fraggap, skb) < 0) {
1719                                 err = -EFAULT;
1720                                 kfree_skb(skb);
1721                                 goto error;
1722                         }
1723
1724                         offset += copy;
1725                         length -= copy + transhdrlen;
1726                         transhdrlen = 0;
1727                         exthdrlen = 0;
1728                         dst_exthdrlen = 0;
1729
1730                         /* Only the initial fragment is time stamped */
1731                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1732                         cork->tx_flags = 0;
1733                         skb_shinfo(skb)->tskey = tskey;
1734                         tskey = 0;
1735                         skb_zcopy_set(skb, uarg, &extra_uref);
1736
1737                         if ((flags & MSG_CONFIRM) && !skb_prev)
1738                                 skb_set_dst_pending_confirm(skb, 1);
1739
1740                         /*
1741                          * Put the packet on the pending queue
1742                          */
1743                         if (!skb->destructor) {
1744                                 skb->destructor = sock_wfree;
1745                                 skb->sk = sk;
1746                                 wmem_alloc_delta += skb->truesize;
1747                         }
1748                         __skb_queue_tail(queue, skb);
1749                         continue;
1750                 }
1751
1752                 if (copy > length)
1753                         copy = length;
1754
1755                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1756                     skb_tailroom(skb) >= copy) {
1757                         unsigned int off;
1758
1759                         off = skb->len;
1760                         if (getfrag(from, skb_put(skb, copy),
1761                                                 offset, copy, off, skb) < 0) {
1762                                 __skb_trim(skb, off);
1763                                 err = -EFAULT;
1764                                 goto error;
1765                         }
1766                 } else if (!zc) {
1767                         int i = skb_shinfo(skb)->nr_frags;
1768
1769                         err = -ENOMEM;
1770                         if (!sk_page_frag_refill(sk, pfrag))
1771                                 goto error;
1772
1773                         skb_zcopy_downgrade_managed(skb);
1774                         if (!skb_can_coalesce(skb, i, pfrag->page,
1775                                               pfrag->offset)) {
1776                                 err = -EMSGSIZE;
1777                                 if (i == MAX_SKB_FRAGS)
1778                                         goto error;
1779
1780                                 __skb_fill_page_desc(skb, i, pfrag->page,
1781                                                      pfrag->offset, 0);
1782                                 skb_shinfo(skb)->nr_frags = ++i;
1783                                 get_page(pfrag->page);
1784                         }
1785                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1786                         if (getfrag(from,
1787                                     page_address(pfrag->page) + pfrag->offset,
1788                                     offset, copy, skb->len, skb) < 0)
1789                                 goto error_efault;
1790
1791                         pfrag->offset += copy;
1792                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1793                         skb->len += copy;
1794                         skb->data_len += copy;
1795                         skb->truesize += copy;
1796                         wmem_alloc_delta += copy;
1797                 } else {
1798                         err = skb_zerocopy_iter_dgram(skb, from, copy);
1799                         if (err < 0)
1800                                 goto error;
1801                 }
1802                 offset += copy;
1803                 length -= copy;
1804         }
1805
1806         if (wmem_alloc_delta)
1807                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1808         return 0;
1809
1810 error_efault:
1811         err = -EFAULT;
1812 error:
1813         net_zcopy_put_abort(uarg, extra_uref);
1814         cork->length -= length;
1815         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1816         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1817         return err;
1818 }
1819
/*
 * ip6_append_data - queue user data for transmission on a corked socket.
 * @sk:          socket the data belongs to
 * @getfrag:     callback that copies user data into the skb
 * @from:        opaque cookie passed through to @getfrag
 * @length:      number of payload bytes to append
 * @transhdrlen: transport header length (non-zero only for the first call)
 * @ipc6:        per-call IPv6 control info (hop limit, tclass, options, ...)
 * @fl6:         flow describing the destination
 * @rt:          route to use; a reference is taken when corking starts
 * @flags:       MSG_* flags from the caller (MSG_MORE, MSG_PROBE, ...)
 *
 * First call with an empty write queue sets up the cork state from
 * @ipc6/@fl6/@rt; later calls just append more data.  Returns 0 on
 * success or a negative errno.
 */
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	/* MSG_PROBE means "path MTU probe only" - nothing is queued. */
	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		/*
		 * Fragmentable extension headers are emitted only in the
		 * first fragment, so fold their length into both the total
		 * length and the transport header length for this call.
		 */
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		/* Appending to an existing cork: headers already counted. */
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
1857
1858 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1859 {
1860         struct dst_entry *dst = cork->base.dst;
1861
1862         cork->base.dst = NULL;
1863         cork->base.flags &= ~IPCORK_ALLFRAG;
1864         skb_dst_set(skb, dst);
1865 }
1866
1867 static void ip6_cork_release(struct inet_cork_full *cork,
1868                              struct inet6_cork *v6_cork)
1869 {
1870         if (v6_cork->opt) {
1871                 struct ipv6_txoptions *opt = v6_cork->opt;
1872
1873                 kfree(opt->dst0opt);
1874                 kfree(opt->dst1opt);
1875                 kfree(opt->hopopt);
1876                 kfree(opt->srcrt);
1877                 kfree(opt);
1878                 v6_cork->opt = NULL;
1879         }
1880
1881         if (cork->base.dst) {
1882                 dst_release(cork->base.dst);
1883                 cork->base.dst = NULL;
1884                 cork->base.flags &= ~IPCORK_ALLFRAG;
1885         }
1886 }
1887
/*
 * __ip6_make_skb - collapse the pending queue into one skb and prepend
 * the IPv6 header (plus any queued extension headers).
 *
 * Dequeues every skb from @queue, chains the tail skbs onto the head's
 * frag_list, pushes extension headers and the ipv6hdr taken from the
 * cork/flow state, updates output stats, and releases the cork.
 *
 * Returns the finished skb ready for ip6_send_skb(), or NULL if the
 * queue was empty.  Ownership of the cork's dst moves to the skb.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/*
	 * Chain the remaining queued skbs onto the head's frag_list,
	 * accounting their sizes into the head and detaching them from
	 * the socket so only the head is charged/freed via sock_wfree.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	final_dst = &fl6->daddr;
	/*
	 * Push extension headers in reverse wire order; a routing header
	 * in the non-fragmentable set may rewrite final_dst/proto.
	 */
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = cork->base.mark;
	skb->tstamp = cork->base.transmit_time;

	/* The skb now owns the route; cork keeps no dst reference. */
	ip6_cork_steal_dst(skb, cork);
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
1962
1963 int ip6_send_skb(struct sk_buff *skb)
1964 {
1965         struct net *net = sock_net(skb->sk);
1966         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1967         int err;
1968
1969         err = ip6_local_out(net, skb->sk, skb);
1970         if (err) {
1971                 if (err > 0)
1972                         err = net_xmit_errno(err);
1973                 if (err)
1974                         IP6_INC_STATS(net, rt->rt6i_idev,
1975                                       IPSTATS_MIB_OUTDISCARDS);
1976         }
1977
1978         return err;
1979 }
1980
/*
 * Finalise whatever is corked on @sk's write queue and transmit it.
 * An empty queue is not an error; returns 0 or a negative errno.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1992
1993 static void __ip6_flush_pending_frames(struct sock *sk,
1994                                        struct sk_buff_head *queue,
1995                                        struct inet_cork_full *cork,
1996                                        struct inet6_cork *v6_cork)
1997 {
1998         struct sk_buff *skb;
1999
2000         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2001                 if (skb_dst(skb))
2002                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2003                                       IPSTATS_MIB_OUTDISCARDS);
2004                 kfree_skb(skb);
2005         }
2006
2007         ip6_cork_release(cork, v6_cork);
2008 }
2009
2010 void ip6_flush_pending_frames(struct sock *sk)
2011 {
2012         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2013                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2014 }
2015 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2016
/*
 * ip6_make_skb - uncorked fast path: build a complete IPv6 skb from user
 * data in a single call, using caller-provided cork storage instead of
 * the socket's write queue.
 *
 * Takes ownership of the reference on @rt->dst in all cases (released on
 * MSG_PROBE and on error, transferred to the skb on success).  Returns
 * the finished skb, NULL for MSG_PROBE, or an ERR_PTR on failure.
 */
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	/* Fragmentable ext-header bytes ride in the first fragment only. */
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE) {
		/* Probe only: consume the route reference, queue nothing. */
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
	if (err) {
		/* ip6_setup_cork may have partly populated the cork. */
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		/* Frees queued skbs and releases the cork (incl. dst). */
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}