GNU Linux-libre 4.9.337-gnu1
[releases.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 #include <net/l3mdev.h>
59 #include <net/lwtunnel.h>
60
/* Final IPv6 transmit step: resolve the route's nexthop neighbour and
 * hand the skb to it.
 *
 * Multicast destinations are special-cased first: when local delivery is
 * wanted (sk_mc_loop() plus either an mrouter socket on a not-yet-forwarded
 * skb, or a local member of the destination group) a clone is looped back
 * through POST_ROUTING to dev_loopback_xmit(); a hop_limit of 0 then
 * discards the original.  Node-local scoped multicast is never transmitted
 * on a non-loopback device.  If the dst carries an lwtunnel xmit redirect,
 * lwtunnel_xmit() may consume the skb before neighbour resolution.
 *
 * Returns the neighbour output result, 0 when the skb was consumed here,
 * a lwtunnel status, or -EINVAL (counted as OUTNOROUTES) when no
 * neighbour entry could be found or created.
 */
61 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
62 {
63         struct dst_entry *dst = skb_dst(skb);
64         struct net_device *dev = dst->dev;
65         struct neighbour *neigh;
66         struct in6_addr *nexthop;
67         int ret;
68
69         skb->protocol = htons(ETH_P_IPV6);
70         skb->dev = dev;
71
72         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
73                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
74
75                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
76                     ((mroute6_socket(net, skb) &&
77                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
78                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
79                                          &ipv6_hdr(skb)->saddr))) {
80                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
81
82                         /* Do not check for IFF_ALLMULTI; multicast routing
83                            is not supported in any case.
84                          */
                        /* Loop a copy back to local listeners; clone failure
                         * is silently tolerated (best effort).
                         */
85                         if (newskb)
86                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
87                                         net, sk, newskb, NULL, newskb->dev,
88                                         dev_loopback_xmit);
89
90                         if (ipv6_hdr(skb)->hop_limit == 0) {
91                                 IP6_INC_STATS(net, idev,
92                                               IPSTATS_MIB_OUTDISCARDS);
93                                 kfree_skb(skb);
94                                 return 0;
95                         }
96                 }
97
98                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
99
                /* Node-local scope never leaves the host on a real device. */
100                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
101                     IPV6_ADDR_SCOPE_NODELOCAL &&
102                     !(dev->flags & IFF_LOOPBACK)) {
103                         kfree_skb(skb);
104                         return 0;
105                 }
106         }
107
        /* Lightweight tunnel may take over (or fail) delivery entirely. */
108         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
109                 int res = lwtunnel_xmit(skb);
110
111                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
112                         return res;
113         }
114
        /* Neighbour lookup runs under rcu_read_lock_bh();
         * __neigh_create() is the slow path when no entry is cached.
         */
115         rcu_read_lock_bh();
116         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
117         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
118         if (unlikely(!neigh))
119                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
120         if (!IS_ERR(neigh)) {
121                 ret = dst_neigh_output(dst, neigh, skb);
122                 rcu_read_unlock_bh();
123                 return ret;
124         }
125         rcu_read_unlock_bh();
126
127         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
128         kfree_skb(skb);
129         return -EINVAL;
130 }
131
132 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
133 {
134         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
135             dst_allfrag(skb_dst(skb)) ||
136             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
137                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
138         else
139                 return ip6_finish_output2(net, sk, skb);
140 }
141
/* dst output hook for IPv6: drop (counting OUTDISCARDS) when IPv6 is
 * administratively disabled on the egress device, otherwise run the
 * NF_INET_POST_ROUTING hook and continue in ip6_finish_output().
 * The hook is skipped for skbs netfilter already rerouted
 * (IP6SKB_REROUTED).
 */
142 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
143 {
144         struct net_device *dev = skb_dst(skb)->dev;
145         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
146
147         if (unlikely(idev->cnf.disable_ipv6)) {
148                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
149                 kfree_skb(skb);
150                 return 0;
151         }
152
153         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
154                             net, sk, skb, NULL, dev,
155                             ip6_finish_output,
156                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
157 }
158
159 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
160 {
161         if (!np->autoflowlabel_set)
162                 return ip6_default_np_autolabel(net);
163         else
164                 return np->autoflowlabel;
165 }
166
167 /*
168  * xmit an sk_buff (used by TCP, SCTP and DCCP)
169  * Note : socket lock is not held for SYNACK packets, but might be modified
170  * by calls to skb_set_owner_w() and ipv6_local_error(),
171  * which are using proper atomic operations or spinlocks.
172  */
/* Build the IPv6 header (plus any extension headers from @opt) on @skb
 * and send it through NF_INET_LOCAL_OUT / dst_output().  Used by
 * connection-oriented protocols; see the comment above about socket
 * locking for SYNACKs.  Returns 0 on success, -ENOBUFS when headroom
 * reallocation fails, -EMSGSIZE when the packet exceeds the path MTU
 * and may not be fragmented.
 */
173 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
174              __u32 mark, struct ipv6_txoptions *opt, int tclass)
175 {
176         struct net *net = sock_net(sk);
177         const struct ipv6_pinfo *np = inet6_sk(sk);
178         struct in6_addr *first_hop = &fl6->daddr;
179         struct dst_entry *dst = skb_dst(skb);
180         unsigned int head_room;
181         struct ipv6hdr *hdr;
182         u8  proto = fl6->flowi6_proto;
183         int seg_len = skb->len;
184         int hlimit = -1;
185         u32 mtu;
186
        /* Headroom needed: base IPv6 header, device link-layer space,
         * plus any extension headers we are about to push.
         */
187         head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
188         if (opt)
189                 head_room += opt->opt_nflen + opt->opt_flen;
190
191         if (unlikely(skb_headroom(skb) < head_room)) {
192                 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
193                 if (!skb2) {
194                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
195                                       IPSTATS_MIB_OUTDISCARDS);
196                         kfree_skb(skb);
197                         return -ENOBUFS;
198                 }
                /* Keep socket memory accounting attached to the new skb. */
199                 if (skb->sk)
200                         skb_set_owner_w(skb2, skb->sk);
201                 consume_skb(skb);
202                 skb = skb2;
203         }
204
        /* Push extension headers; these may rewrite proto (header chain)
         * and first_hop (routing header).
         */
205         if (opt) {
206                 seg_len += opt->opt_nflen + opt->opt_flen;
207
208                 if (opt->opt_flen)
209                         ipv6_push_frag_opts(skb, opt, &proto);
210
211                 if (opt->opt_nflen)
212                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
213         }
214
215         skb_push(skb, sizeof(struct ipv6hdr));
216         skb_reset_network_header(skb);
217         hdr = ipv6_hdr(skb);
218
219         /*
220          *      Fill in the IPv6 header
221          */
222         if (np)
223                 hlimit = np->hop_limit;
        /* Socket hop_limit of -1 means "use the route's default". */
224         if (hlimit < 0)
225                 hlimit = ip6_dst_hoplimit(dst);
226
227         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
228                                 ip6_autoflowlabel(net, np), fl6));
229
230         hdr->payload_len = htons(seg_len);
231         hdr->nexthdr = proto;
232         hdr->hop_limit = hlimit;
233
234         hdr->saddr = fl6->saddr;
235         hdr->daddr = *first_hop;
236
237         skb->protocol = htons(ETH_P_IPV6);
238         skb->priority = sk->sk_priority;
239         skb->mark = mark;
240
241         mtu = dst_mtu(dst);
242         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
243                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
244                               IPSTATS_MIB_OUT, skb->len);
245
246                 /* if egress device is enslaved to an L3 master device pass the
247                  * skb to its handler for processing
248                  */
249                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
250                 if (unlikely(!skb))
251                         return 0;
252
253                 /* hooks should never assume socket lock is held.
254                  * we promote our socket to non const
255                  */
256                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
257                                net, (struct sock *)sk, skb, NULL, dst->dev,
258                                dst_output);
259         }
260
        /* Too big and fragmentation not allowed: report EMSGSIZE locally. */
261         skb->dev = dst->dev;
262         /* ipv6_local_error() does not require socket lock,
263          * we promote our socket to non const
264          */
265         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
266
267         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
268         kfree_skb(skb);
269         return -EMSGSIZE;
270 }
271 EXPORT_SYMBOL(ip6_xmit);
272
/* Deliver a Router-Alert packet to every raw socket registered (via
 * IPV6_ROUTER_ALERT) for alert value @sel and bound to the ingress
 * device (or unbound).  Each matching socket but the last receives a
 * clone; the last consumes @skb itself, so the original is handed over
 * without an extra copy.
 *
 * Returns 1 when at least one socket consumed the packet (caller must
 * not touch @skb afterwards), 0 when no socket matched.
 */
273 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
274 {
275         struct ip6_ra_chain *ra;
276         struct sock *last = NULL;
277
278         read_lock(&ip6_ra_lock);
279         for (ra = ip6_ra_chain; ra; ra = ra->next) {
280                 struct sock *sk = ra->sk;
281                 if (sk && ra->sel == sel &&
282                     (!sk->sk_bound_dev_if ||
283                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        /* Previous match gets a clone; clone failure is
                         * tolerated (best-effort delivery).
                         */
284                         if (last) {
285                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
286                                 if (skb2)
287                                         rawv6_rcv(last, skb2);
288                         }
289                         last = sk;
290                 }
291         }
292
293         if (last) {
294                 rawv6_rcv(last, skb);
295                 read_unlock(&ip6_ra_lock);
296                 return 1;
297         }
298         read_unlock(&ip6_ra_lock);
299         return 0;
300 }
301
/* Classify a packet whose destination is an NDP-proxied address.
 *
 * Returns:
 *   1  - unicast neighbour-discovery ICMPv6 message; caller should pass
 *        it to local input instead of forwarding,
 *   0  - ordinary packet, keep forwarding,
 *  -1  - destination is link-local (unforwardable by a proxy); the
 *        sender has been signalled via dst_link_failure() and the
 *        caller should drop the packet.
 */
302 static int ip6_forward_proxy_check(struct sk_buff *skb)
303 {
304         struct ipv6hdr *hdr = ipv6_hdr(skb);
305         u8 nexthdr = hdr->nexthdr;
306         __be16 frag_off;
307         int offset;
308
        /* Walk the extension-header chain to the upper-layer header;
         * a malformed chain (offset < 0) is treated as "just forward".
         */
309         if (ipv6_ext_hdr(nexthdr)) {
310                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
311                 if (offset < 0)
312                         return 0;
313         } else
314                 offset = sizeof(struct ipv6hdr);
315
316         if (nexthdr == IPPROTO_ICMPV6) {
317                 struct icmp6hdr *icmp6;
318
                /* Need at least the one-byte ICMPv6 type in linear data. */
319                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
320                                          offset + 1 - skb->data)))
321                         return 0;
322
323                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
324
325                 switch (icmp6->icmp6_type) {
326                 case NDISC_ROUTER_SOLICITATION:
327                 case NDISC_ROUTER_ADVERTISEMENT:
328                 case NDISC_NEIGHBOUR_SOLICITATION:
329                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
330                 case NDISC_REDIRECT:
331                         /* For reaction involving unicast neighbor discovery
332                          * message destined to the proxied address, pass it to
333                          * input function.
334                          */
335                         return 1;
336                 default:
337                         break;
338                 }
339         }
340
341         /*
342          * The proxying router can't forward traffic sent to a link-local
343          * address, so signal the sender and discard the packet. This
344          * behavior is clarified by the MIPv6 specification.
345          */
346         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
347                 dst_link_failure(skb);
348                 return -1;
349         }
350
351         return 0;
352 }
353
354 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
355                                      struct sk_buff *skb)
356 {
357         struct dst_entry *dst = skb_dst(skb);
358
359         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
360         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
361
362         return dst_output(net, sk, skb);
363 }
364
365 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
366 {
367         unsigned int mtu;
368         struct inet6_dev *idev;
369
370         if (dst_metric_locked(dst, RTAX_MTU)) {
371                 mtu = dst_metric_raw(dst, RTAX_MTU);
372                 if (mtu)
373                         return mtu;
374         }
375
376         mtu = IPV6_MIN_MTU;
377         rcu_read_lock();
378         idev = __in6_dev_get(dst->dev);
379         if (idev)
380                 mtu = idev->cnf.mtu6;
381         rcu_read_unlock();
382
383         return mtu;
384 }
385
386 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
387 {
388         if (skb->len <= mtu)
389                 return false;
390
391         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
392         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
393                 return true;
394
395         if (skb->ignore_df)
396                 return false;
397
398         if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
399                 return false;
400
401         return true;
402 }
403
/* Forward an IPv6 packet received on one interface towards its route.
 *
 * Checks, in order: forwarding enabled, packet addressed to us at L2
 * (PACKET_HOST), no local socket owner, not LRO-merged, XFRM forward
 * policy.  Router-Alert packets go to registered raw sockets instead of
 * being forwarded.  hop_limit <= 1 triggers ICMPv6 Time Exceeded; NDP
 * proxying may divert the packet to local input; leaving on the ingress
 * interface may trigger a rate-limited ICMPv6 Redirect; oversize packets
 * get ICMPv6 Packet Too Big.  Finally hop_limit is decremented (after
 * skb_cow, so a shared header is not mangled) and the packet goes
 * through the NF_INET_FORWARD hook to ip6_forward_finish().
 *
 * Returns the hook/continuation result, or a negative errno when the
 * packet was dropped here.
 */
404 int ip6_forward(struct sk_buff *skb)
405 {
406         struct dst_entry *dst = skb_dst(skb);
407         struct ipv6hdr *hdr = ipv6_hdr(skb);
408         struct inet6_skb_parm *opt = IP6CB(skb);
409         struct net *net = dev_net(dst->dev);
410         u32 mtu;
411
412         if (net->ipv6.devconf_all->forwarding == 0)
413                 goto error;
414
415         if (skb->pkt_type != PACKET_HOST)
416                 goto drop;
417
418         if (unlikely(skb->sk))
419                 goto drop;
420
421         if (skb_warn_if_lro(skb))
422                 goto drop;
423
424         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
425                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
426                                 IPSTATS_MIB_INDISCARDS);
427                 goto drop;
428         }
429
430         skb_forward_csum(skb);
431
432         /*
433          *      We DO NOT make any processing on
434          *      RA packets, pushing them to user level AS IS
435          *      without any WARRANTY that application will be able
436          *      to interpret them. The reason is that we
437          *      cannot make anything clever here.
438          *
439          *      We are not end-node, so that if packet contains
440          *      AH/ESP, we cannot make anything.
441          *      Defragmentation also would be mistake, RA packets
442          *      cannot be fragmented, because there is no warranty
443          *      that different fragments will go along one path. --ANK
444          */
445         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
446                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
447                         return 0;
448         }
449
450         /*
451          *      check and decrement ttl
452          */
453         if (hdr->hop_limit <= 1) {
454                 /* Force OUTPUT device used as source address */
455                 skb->dev = dst->dev;
456                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
457                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
458                                 IPSTATS_MIB_INHDRERRORS);
459
460                 kfree_skb(skb);
461                 return -ETIMEDOUT;
462         }
463
464         /* XXX: idev->cnf.proxy_ndp? */
465         if (net->ipv6.devconf_all->proxy_ndp &&
466             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                /* proxied > 0: NDP message, deliver locally;
                 * proxied < 0: unforwardable link-local target, drop.
                 */
467                 int proxied = ip6_forward_proxy_check(skb);
468                 if (proxied > 0)
469                         return ip6_input(skb);
470                 else if (proxied < 0) {
471                         __IP6_INC_STATS(net, ip6_dst_idev(dst),
472                                         IPSTATS_MIB_INDISCARDS);
473                         goto drop;
474                 }
475         }
476
477         if (!xfrm6_route_forward(skb)) {
478                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
479                                 IPSTATS_MIB_INDISCARDS);
480                 goto drop;
481         }
        /* xfrm6_route_forward() may have switched the dst; reload it. */
482         dst = skb_dst(skb);
483
484         /* IPv6 specs say nothing about it, but it is clear that we cannot
485            send redirects to source routed frames.
486            We don't send redirects to frames decapsulated from IPsec.
487          */
488         if (IP6CB(skb)->iif == dst->dev->ifindex &&
489             opt->srcrt == 0 && !skb_sec_path(skb)) {
490                 struct in6_addr *target = NULL;
491                 struct inet_peer *peer;
492                 struct rt6_info *rt;
493
494                 /*
495                  *      incoming and outgoing devices are the same
496                  *      send a redirect.
497                  */
498
499                 rt = (struct rt6_info *) dst;
500                 if (rt->rt6i_flags & RTF_GATEWAY)
501                         target = &rt->rt6i_gateway;
502                 else
503                         target = &hdr->daddr;
504
505                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
506
507                 /* Limit redirects both by destination (here)
508                    and by source (inside ndisc_send_redirect)
509                  */
510                 if (inet_peer_xrlim_allow(peer, 1*HZ))
511                         ndisc_send_redirect(skb, target);
512                 if (peer)
513                         inet_putpeer(peer);
514         } else {
515                 int addrtype = ipv6_addr_type(&hdr->saddr);
516
517                 /* This check is security critical. */
518                 if (addrtype == IPV6_ADDR_ANY ||
519                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
520                         goto error;
521                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
522                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
523                                     ICMPV6_NOT_NEIGHBOUR, 0);
524                         goto error;
525                 }
526         }
527
528         mtu = ip6_dst_mtu_forward(dst);
529         if (mtu < IPV6_MIN_MTU)
530                 mtu = IPV6_MIN_MTU;
531
532         if (ip6_pkt_too_big(skb, mtu)) {
533                 /* Again, force OUTPUT device used as source address */
534                 skb->dev = dst->dev;
535                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
536                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
537                                 IPSTATS_MIB_INTOOBIGERRORS);
538                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
539                                 IPSTATS_MIB_FRAGFAILS);
540                 kfree_skb(skb);
541                 return -EMSGSIZE;
542         }
543
        /* Unshare the header before writing hop_limit below. */
544         if (skb_cow(skb, dst->dev->hard_header_len)) {
545                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
546                                 IPSTATS_MIB_OUTDISCARDS);
547                 goto drop;
548         }
549
550         hdr = ipv6_hdr(skb);
551
552         /* Mangling hops number delayed to point after skb COW */
553
554         hdr->hop_limit--;
555
556         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
557                        net, NULL, skb, skb->dev, dst->dev,
558                        ip6_forward_finish);
559
560 error:
561         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
562 drop:
563         kfree_skb(skb);
564         return -EINVAL;
565 }
566
/* Copy per-packet metadata from @from onto a freshly built fragment @to
 * so the fragment is routed and classified exactly like the original:
 * packet type, priority, protocol, dst reference, device, mark, flow
 * hash, traffic-control index, netfilter state and security mark.
 */
567 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
568 {
569         to->pkt_type = from->pkt_type;
570         to->priority = from->priority;
571         to->protocol = from->protocol;
        /* Drop any stale dst before taking a reference on from's dst. */
572         skb_dst_drop(to);
573         skb_dst_set(to, dst_clone(skb_dst(from)));
574         to->dev = from->dev;
575         to->mark = from->mark;
576
577         skb_copy_hash(to, from);
578
579 #ifdef CONFIG_NET_SCHED
580         to->tc_index = from->tc_index;
581 #endif
582         nf_copy(to, from);
583         skb_copy_secmark(to, from);
584 }
585
586 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
587                  int (*output)(struct net *, struct sock *, struct sk_buff *))
588 {
589         struct sk_buff *frag;
590         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
591         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
592                                 inet6_sk(skb->sk) : NULL;
593         struct ipv6hdr *tmp_hdr;
594         struct frag_hdr *fh;
595         unsigned int mtu, hlen, left, len, nexthdr_offset;
596         int hroom, troom;
597         __be32 frag_id;
598         int ptr, offset = 0, err = 0;
599         u8 *prevhdr, nexthdr = 0;
600
601         err = ip6_find_1stfragopt(skb, &prevhdr);
602         if (err < 0)
603                 goto fail;
604         hlen = err;
605         nexthdr = *prevhdr;
606         nexthdr_offset = prevhdr - skb_network_header(skb);
607
608         mtu = ip6_skb_dst_mtu(skb);
609
610         /* We must not fragment if the socket is set to force MTU discovery
611          * or if the skb it not generated by a local socket.
612          */
613         if (unlikely(!skb->ignore_df && skb->len > mtu))
614                 goto fail_toobig;
615
616         if (IP6CB(skb)->frag_max_size) {
617                 if (IP6CB(skb)->frag_max_size > mtu)
618                         goto fail_toobig;
619
620                 /* don't send fragments larger than what we received */
621                 mtu = IP6CB(skb)->frag_max_size;
622                 if (mtu < IPV6_MIN_MTU)
623                         mtu = IPV6_MIN_MTU;
624         }
625
626         if (np && np->frag_size < mtu) {
627                 if (np->frag_size)
628                         mtu = np->frag_size;
629         }
630         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
631                 goto fail_toobig;
632         mtu -= hlen + sizeof(struct frag_hdr);
633
634         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
635                                     &ipv6_hdr(skb)->saddr);
636
637         if (skb->ip_summed == CHECKSUM_PARTIAL &&
638             (err = skb_checksum_help(skb)))
639                 goto fail;
640
641         prevhdr = skb_network_header(skb) + nexthdr_offset;
642         hroom = LL_RESERVED_SPACE(rt->dst.dev);
643         if (skb_has_frag_list(skb)) {
644                 int first_len = skb_pagelen(skb);
645                 struct sk_buff *frag2;
646
647                 if (first_len - hlen > mtu ||
648                     ((first_len - hlen) & 7) ||
649                     skb_cloned(skb) ||
650                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
651                         goto slow_path;
652
653                 skb_walk_frags(skb, frag) {
654                         /* Correct geometry. */
655                         if (frag->len > mtu ||
656                             ((frag->len & 7) && frag->next) ||
657                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
658                                 goto slow_path_clean;
659
660                         /* Partially cloned skb? */
661                         if (skb_shared(frag))
662                                 goto slow_path_clean;
663
664                         BUG_ON(frag->sk);
665                         if (skb->sk) {
666                                 frag->sk = skb->sk;
667                                 frag->destructor = sock_wfree;
668                         }
669                         skb->truesize -= frag->truesize;
670                 }
671
672                 err = 0;
673                 offset = 0;
674                 /* BUILD HEADER */
675
676                 *prevhdr = NEXTHDR_FRAGMENT;
677                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
678                 if (!tmp_hdr) {
679                         err = -ENOMEM;
680                         goto fail;
681                 }
682                 frag = skb_shinfo(skb)->frag_list;
683                 skb_frag_list_init(skb);
684
685                 __skb_pull(skb, hlen);
686                 fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
687                 __skb_push(skb, hlen);
688                 skb_reset_network_header(skb);
689                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
690
691                 fh->nexthdr = nexthdr;
692                 fh->reserved = 0;
693                 fh->frag_off = htons(IP6_MF);
694                 fh->identification = frag_id;
695
696                 first_len = skb_pagelen(skb);
697                 skb->data_len = first_len - skb_headlen(skb);
698                 skb->len = first_len;
699                 ipv6_hdr(skb)->payload_len = htons(first_len -
700                                                    sizeof(struct ipv6hdr));
701
702                 dst_hold(&rt->dst);
703
704                 for (;;) {
705                         /* Prepare header of the next frame,
706                          * before previous one went down. */
707                         if (frag) {
708                                 frag->ip_summed = CHECKSUM_NONE;
709                                 skb_reset_transport_header(frag);
710                                 fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
711                                 __skb_push(frag, hlen);
712                                 skb_reset_network_header(frag);
713                                 memcpy(skb_network_header(frag), tmp_hdr,
714                                        hlen);
715                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
716                                 fh->nexthdr = nexthdr;
717                                 fh->reserved = 0;
718                                 fh->frag_off = htons(offset);
719                                 if (frag->next)
720                                         fh->frag_off |= htons(IP6_MF);
721                                 fh->identification = frag_id;
722                                 ipv6_hdr(frag)->payload_len =
723                                                 htons(frag->len -
724                                                       sizeof(struct ipv6hdr));
725                                 ip6_copy_metadata(frag, skb);
726                         }
727
728                         err = output(net, sk, skb);
729                         if (!err)
730                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
731                                               IPSTATS_MIB_FRAGCREATES);
732
733                         if (err || !frag)
734                                 break;
735
736                         skb = frag;
737                         frag = skb->next;
738                         skb->next = NULL;
739                 }
740
741                 kfree(tmp_hdr);
742
743                 if (err == 0) {
744                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
745                                       IPSTATS_MIB_FRAGOKS);
746                         ip6_rt_put(rt);
747                         return 0;
748                 }
749
750                 kfree_skb_list(frag);
751
752                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
753                               IPSTATS_MIB_FRAGFAILS);
754                 ip6_rt_put(rt);
755                 return err;
756
757 slow_path_clean:
758                 skb_walk_frags(skb, frag2) {
759                         if (frag2 == frag)
760                                 break;
761                         frag2->sk = NULL;
762                         frag2->destructor = NULL;
763                         skb->truesize += frag2->truesize;
764                 }
765         }
766
767 slow_path:
768         left = skb->len - hlen;         /* Space per frame */
769         ptr = hlen;                     /* Where to start from */
770
771         /*
772          *      Fragment the datagram.
773          */
774
775         troom = rt->dst.dev->needed_tailroom;
776
777         /*
778          *      Keep copying data until we run out.
779          */
780         while (left > 0)        {
781                 u8 *fragnexthdr_offset;
782
783                 len = left;
784                 /* IF: it doesn't fit, use 'mtu' - the data space left */
785                 if (len > mtu)
786                         len = mtu;
787                 /* IF: we are not sending up to and including the packet end
788                    then align the next start on an eight byte boundary */
789                 if (len < left) {
790                         len &= ~7;
791                 }
792
793                 /* Allocate buffer */
794                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
795                                  hroom + troom, GFP_ATOMIC);
796                 if (!frag) {
797                         err = -ENOMEM;
798                         goto fail;
799                 }
800
801                 /*
802                  *      Set up data on packet
803                  */
804
805                 ip6_copy_metadata(frag, skb);
806                 skb_reserve(frag, hroom);
807                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
808                 skb_reset_network_header(frag);
809                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
810                 frag->transport_header = (frag->network_header + hlen +
811                                           sizeof(struct frag_hdr));
812
813                 /*
814                  *      Charge the memory for the fragment to any owner
815                  *      it might possess
816                  */
817                 if (skb->sk)
818                         skb_set_owner_w(frag, skb->sk);
819
820                 /*
821                  *      Copy the packet header into the new buffer.
822                  */
823                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
824
825                 fragnexthdr_offset = skb_network_header(frag);
826                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
827                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
828
829                 /*
830                  *      Build fragment header.
831                  */
832                 fh->nexthdr = nexthdr;
833                 fh->reserved = 0;
834                 fh->identification = frag_id;
835
836                 /*
837                  *      Copy a block of the IP datagram.
838                  */
839                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
840                                      len));
841                 left -= len;
842
843                 fh->frag_off = htons(offset);
844                 if (left > 0)
845                         fh->frag_off |= htons(IP6_MF);
846                 ipv6_hdr(frag)->payload_len = htons(frag->len -
847                                                     sizeof(struct ipv6hdr));
848
849                 ptr += len;
850                 offset += len;
851
852                 /*
853                  *      Put this fragment into the sending queue.
854                  */
855                 err = output(net, sk, frag);
856                 if (err)
857                         goto fail;
858
859                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
860                               IPSTATS_MIB_FRAGCREATES);
861         }
862         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
863                       IPSTATS_MIB_FRAGOKS);
864         consume_skb(skb);
865         return err;
866
867 fail_toobig:
868         if (skb->sk && dst_allfrag(skb_dst(skb)))
869                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
870
871         skb->dev = skb_dst(skb)->dev;
872         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
873         err = -EMSGSIZE;
874
875 fail:
876         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
877                       IPSTATS_MIB_FRAGFAILS);
878         kfree_skb(skb);
879         return err;
880 }
881
882 static inline int ip6_rt_check(const struct rt6key *rt_key,
883                                const struct in6_addr *fl_addr,
884                                const struct in6_addr *addr_cache)
885 {
886         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
887                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
888 }
889
890 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
891                                           struct dst_entry *dst,
892                                           const struct flowi6 *fl6)
893 {
894         struct ipv6_pinfo *np = inet6_sk(sk);
895         struct rt6_info *rt;
896
897         if (!dst)
898                 goto out;
899
900         if (dst->ops->family != AF_INET6) {
901                 dst_release(dst);
902                 return NULL;
903         }
904
905         rt = (struct rt6_info *)dst;
906         /* Yes, checking route validity in not connected
907          * case is not very simple. Take into account,
908          * that we do not support routing by source, TOS,
909          * and MSG_DONTROUTE            --ANK (980726)
910          *
911          * 1. ip6_rt_check(): If route was host route,
912          *    check that cached destination is current.
913          *    If it is network route, we still may
914          *    check its validity using saved pointer
915          *    to the last used address: daddr_cache.
916          *    We do not want to save whole address now,
917          *    (because main consumer of this service
918          *    is tcp, which has not this problem),
919          *    so that the last trick works only on connected
920          *    sockets.
921          * 2. oif also should be the same.
922          */
923         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
924 #ifdef CONFIG_IPV6_SUBTREES
925             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
926 #endif
927            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
928               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
929                 dst_release(dst);
930                 dst = NULL;
931         }
932
933 out:
934         return dst;
935 }
936
/* Resolve the route and the source address for flow @fl6.
 *
 * On entry *dst may already hold a candidate route, or be NULL.
 * On success returns 0 with *dst holding a held route; on failure a
 * negative errno is returned, *dst is released and reset to NULL.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                struct rt6_info *rt;
                bool had_dst = *dst != NULL;

                if (!had_dst)
                        *dst = ip6_route_output(net, sk, fl6);
                /* An erroneous lookup result must not be used for saddr
                 * selection; pass NULL so a device-independent address
                 * is picked.
                 */
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if (!had_dst && (*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        /* Retry the lookup now that a source address is filled in. */
        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        /* -EINVAL here only flags "neighbour not yet valid"; it may be
         * cleared again below if no redirect is needed.
         */
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        /* A v4-mapped source address is only meaningful with a v4-mapped
         * (or unspecified) destination; reject mixed address families.
         */
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}
1048
1049 /**
1050  *      ip6_dst_lookup - perform route lookup on flow
1051  *      @sk: socket which provides route info
1052  *      @dst: pointer to dst_entry * for result
1053  *      @fl6: flow to lookup
1054  *
1055  *      This function performs a route lookup on the given flow.
1056  *
1057  *      It returns zero on success, or a standard errno code on error.
1058  */
1059 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1060                    struct flowi6 *fl6)
1061 {
1062         *dst = NULL;
1063         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1064 }
1065 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1066
1067 /**
1068  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1069  *      @sk: socket which provides route info
1070  *      @fl6: flow to lookup
1071  *      @final_dst: final destination address for ipsec lookup
1072  *
1073  *      This function performs a route lookup on the given flow.
1074  *
1075  *      It returns a valid dst pointer on success, or a pointer encoded
1076  *      error code.
1077  */
1078 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1079                                       const struct in6_addr *final_dst)
1080 {
1081         struct dst_entry *dst = NULL;
1082         int err;
1083
1084         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1085         if (err)
1086                 return ERR_PTR(err);
1087         if (final_dst)
1088                 fl6->daddr = *final_dst;
1089
1090         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1091 }
1092 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1093
1094 /**
1095  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1096  *      @sk: socket which provides the dst cache and route info
1097  *      @fl6: flow to lookup
1098  *      @final_dst: final destination address for ipsec lookup
1099  *
1100  *      This function performs a route lookup on the given flow with the
1101  *      possibility of using the cached route in the socket if it is valid.
1102  *      It will take the socket dst lock when operating on the dst cache.
1103  *      As a result, this function can only be used in process context.
1104  *
1105  *      It returns a valid dst pointer on success, or a pointer encoded
1106  *      error code.
1107  */
1108 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1109                                          const struct in6_addr *final_dst)
1110 {
1111         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1112
1113         dst = ip6_sk_dst_check(sk, dst, fl6);
1114         if (!dst)
1115                 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1116
1117         return dst;
1118 }
1119 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1120
/* Append @length bytes of datagram data as a single UFO (UDP fragmentation
 * offload) skb on @queue, creating the skb on first use.
 *
 * Returns 0 on success or a negative errno.
 */
static inline int ip6_ufo_append_data(struct sock *sk,
                        struct sk_buff_head *queue,
                        int getfrag(void *from, char *to, int offset, int len,
                        int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int exthdrlen, int transhdrlen, int mtu,
                        unsigned int flags, const struct flowi6 *fl6)

{
        struct sk_buff *skb;
        int err;

        /* There is support for UDP large send offload by network
         * device, so create one single skb packet containing complete
         * udp datagram
         */
        skb = skb_peek_tail(queue);
        if (!skb) {
                /* NOTE(review): the extra 20 bytes look like slack for
                 * later header growth -- confirm before changing.
                 */
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);
                if (!skb)
                        return err;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb_set_network_header(skb, exthdrlen);

                /* initialize protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->protocol = htons(ETH_P_IPV6);
                skb->csum = 0;

                __skb_queue_tail(queue, skb);
        } else if (skb_is_gso(skb)) {
                /* GSO state was already set up on a previous call. */
                goto append;
        }

        skb->ip_summed = CHECKSUM_PARTIAL;
        /* Specify the length of each IPv6 datagram fragment.
         * It has to be a multiple of 8.
         */
        skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
                                     sizeof(struct frag_hdr)) & ~7;
        skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        /* Pre-select the fragment ID so the device can build the
         * fragment headers itself.
         */
        skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
                                                         &fl6->daddr,
                                                         &fl6->saddr);

append:
        return skb_append_datato_frags(sk, skb, getfrag, from,
                                       (length - transhdrlen));
}
1180
1181 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1182                                                gfp_t gfp)
1183 {
1184         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1185 }
1186
1187 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1188                                                 gfp_t gfp)
1189 {
1190         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1191 }
1192
1193 static void ip6_append_data_mtu(unsigned int *mtu,
1194                                 int *maxfraglen,
1195                                 unsigned int fragheaderlen,
1196                                 struct sk_buff *skb,
1197                                 struct rt6_info *rt,
1198                                 unsigned int orig_mtu)
1199 {
1200         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1201                 if (!skb) {
1202                         /* first fragment, reserve header_len */
1203                         *mtu = orig_mtu - rt->dst.header_len;
1204
1205                 } else {
1206                         /*
1207                          * this fragment is not first, the headers
1208                          * space is regarded as data space.
1209                          */
1210                         *mtu = orig_mtu;
1211                 }
1212                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1213                               + fragheaderlen - sizeof(struct frag_hdr);
1214         }
1215 }
1216
/* Initialize the cork state (options copy, dst, flow, MTU) for a
 * pending-data transmission on @sk.
 *
 * Returns 0 on success or a negative errno.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt, struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *opt = ipc6->opt;

        /*
         * setup for corking
         */
        if (opt) {
                /* Options must not already be corked. */
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
                if (unlikely(!v6_cork->opt))
                        return -ENOBUFS;

                v6_cork->opt->tot_len = sizeof(*opt);
                v6_cork->opt->opt_flen = opt->opt_flen;
                v6_cork->opt->opt_nflen = opt->opt_nflen;

                /* NOTE(review): on a failed dup below, the partially
                 * duplicated options stay in v6_cork->opt; presumably the
                 * caller's cork release path frees them -- confirm.
                 */
                v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                    sk->sk_allocation);
                if (opt->dst0opt && !v6_cork->opt->dst0opt)
                        return -ENOBUFS;

                v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                    sk->sk_allocation);
                if (opt->dst1opt && !v6_cork->opt->dst1opt)
                        return -ENOBUFS;

                v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                   sk->sk_allocation);
                if (opt->hopopt && !v6_cork->opt->hopopt)
                        return -ENOBUFS;

                v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                    sk->sk_allocation);
                if (opt->srcrt && !v6_cork->opt->srcrt)
                        return -ENOBUFS;

                /* need source address above miyazawa*/
        }
        /* The cork takes its own reference on the route. */
        dst_hold(&rt->dst);
        cork->base.dst = &rt->dst;
        cork->fl.u.ip6 = *fl6;
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        /* With PMTU probing use the device MTU, otherwise the path MTU
         * (the inner dst for XFRM tunnels, the path dst otherwise).
         */
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
        /* A configured per-socket frag_size caps the effective MTU. */
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        cork->base.fragsize = mtu;
        if (dst_allfrag(rt->dst.path))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        return 0;
}
1284
1285 static int __ip6_append_data(struct sock *sk,
1286                              struct flowi6 *fl6,
1287                              struct sk_buff_head *queue,
1288                              struct inet_cork *cork,
1289                              struct inet6_cork *v6_cork,
1290                              struct page_frag *pfrag,
1291                              int getfrag(void *from, char *to, int offset,
1292                                          int len, int odd, struct sk_buff *skb),
1293                              void *from, int length, int transhdrlen,
1294                              unsigned int flags, struct ipcm6_cookie *ipc6,
1295                              const struct sockcm_cookie *sockc)
1296 {
1297         struct sk_buff *skb, *skb_prev = NULL;
1298         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1299         int exthdrlen = 0;
1300         int dst_exthdrlen = 0;
1301         int hh_len;
1302         int copy;
1303         int err;
1304         int offset = 0;
1305         __u8 tx_flags = 0;
1306         u32 tskey = 0;
1307         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1308         struct ipv6_txoptions *opt = v6_cork->opt;
1309         int csummode = CHECKSUM_NONE;
1310         unsigned int maxnonfragsize, headersize;
1311
1312         skb = skb_peek_tail(queue);
1313         if (!skb) {
1314                 exthdrlen = opt ? opt->opt_flen : 0;
1315                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1316         }
1317
1318         mtu = cork->fragsize;
1319         orig_mtu = mtu;
1320
1321         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1322
1323         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1324                         (opt ? opt->opt_nflen : 0);
1325
1326         headersize = sizeof(struct ipv6hdr) +
1327                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1328                      (dst_allfrag(&rt->dst) ?
1329                       sizeof(struct frag_hdr) : 0) +
1330                      rt->rt6i_nfheader_len;
1331
1332         if (mtu <= fragheaderlen ||
1333             ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1334                 goto emsgsize;
1335
1336         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1337                      sizeof(struct frag_hdr);
1338
1339         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1340          * the first fragment
1341          */
1342         if (headersize + transhdrlen > mtu)
1343                 goto emsgsize;
1344
1345         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1346             (sk->sk_protocol == IPPROTO_UDP ||
1347              sk->sk_protocol == IPPROTO_RAW)) {
1348                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1349                                 sizeof(struct ipv6hdr));
1350                 goto emsgsize;
1351         }
1352
1353         if (ip6_sk_ignore_df(sk))
1354                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1355         else
1356                 maxnonfragsize = mtu;
1357
1358         if (cork->length + length > maxnonfragsize - headersize) {
1359 emsgsize:
1360                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1361                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1362                 return -EMSGSIZE;
1363         }
1364
1365         /* CHECKSUM_PARTIAL only with no extension headers and when
1366          * we are not going to fragment
1367          */
1368         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1369             headersize == sizeof(struct ipv6hdr) &&
1370             length < mtu - headersize &&
1371             !(flags & MSG_MORE) &&
1372             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1373                 csummode = CHECKSUM_PARTIAL;
1374
1375         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1376                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1377                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1378                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1379                         tskey = sk->sk_tskey++;
1380         }
1381
1382         /*
1383          * Let's try using as much space as possible.
1384          * Use MTU if total length of the message fits into the MTU.
1385          * Otherwise, we need to reserve fragment header and
1386          * fragment alignment (= 8-15 octects, in total).
1387          *
1388          * Note that we may need to "move" the data from the tail of
1389          * of the buffer to the new fragment when we split
1390          * the message.
1391          *
1392          * FIXME: It may be fragmented into multiple chunks
1393          *        at once if non-fragmentable extension headers
1394          *        are too large.
1395          * --yoshfuji
1396          */
1397
1398         cork->length += length;
1399         if ((skb && skb_is_gso(skb)) ||
1400             (((length + fragheaderlen) > mtu) &&
1401             (skb_queue_len(queue) <= 1) &&
1402             (sk->sk_protocol == IPPROTO_UDP) &&
1403             (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
1404             (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk))) {
1405                 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1406                                           hh_len, fragheaderlen, exthdrlen,
1407                                           transhdrlen, mtu, flags, fl6);
1408                 if (err)
1409                         goto error;
1410                 return 0;
1411         }
1412
1413         if (!skb)
1414                 goto alloc_new_skb;
1415
1416         while (length > 0) {
1417                 /* Check if the remaining data fits into current packet. */
1418                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1419                 if (copy < length)
1420                         copy = maxfraglen - skb->len;
1421
1422                 if (copy <= 0) {
1423                         char *data;
1424                         unsigned int datalen;
1425                         unsigned int fraglen;
1426                         unsigned int fraggap;
1427                         unsigned int alloclen;
1428 alloc_new_skb:
1429                         /* There's no room in the current skb */
1430                         if (skb)
1431                                 fraggap = skb->len - maxfraglen;
1432                         else
1433                                 fraggap = 0;
1434                         /* update mtu and maxfraglen if necessary */
1435                         if (!skb || !skb_prev)
1436                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1437                                                     fragheaderlen, skb, rt,
1438                                                     orig_mtu);
1439
1440                         skb_prev = skb;
1441
1442                         /*
1443                          * If remaining data exceeds the mtu,
1444                          * we know we need more fragment(s).
1445                          */
1446                         datalen = length + fraggap;
1447
1448                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1449                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1450                         if ((flags & MSG_MORE) &&
1451                             !(rt->dst.dev->features&NETIF_F_SG))
1452                                 alloclen = mtu;
1453                         else
1454                                 alloclen = datalen + fragheaderlen;
1455
1456                         alloclen += dst_exthdrlen;
1457
1458                         if (datalen != length + fraggap) {
1459                                 /*
1460                                  * this is not the last fragment, the trailer
1461                                  * space is regarded as data space.
1462                                  */
1463                                 datalen += rt->dst.trailer_len;
1464                         }
1465
1466                         alloclen += rt->dst.trailer_len;
1467                         fraglen = datalen + fragheaderlen;
1468
1469                         /*
1470                          * We just reserve space for fragment header.
1471                          * Note: this may be overallocation if the message
1472                          * (without MSG_MORE) fits into the MTU.
1473                          */
1474                         alloclen += sizeof(struct frag_hdr);
1475
1476                         copy = datalen - transhdrlen - fraggap;
1477                         if (copy < 0) {
1478                                 err = -EINVAL;
1479                                 goto error;
1480                         }
1481                         if (transhdrlen) {
1482                                 skb = sock_alloc_send_skb(sk,
1483                                                 alloclen + hh_len,
1484                                                 (flags & MSG_DONTWAIT), &err);
1485                         } else {
1486                                 skb = NULL;
1487                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1488                                     2 * sk->sk_sndbuf)
1489                                         skb = sock_wmalloc(sk,
1490                                                            alloclen + hh_len, 1,
1491                                                            sk->sk_allocation);
1492                                 if (unlikely(!skb))
1493                                         err = -ENOBUFS;
1494                         }
1495                         if (!skb)
1496                                 goto error;
1497                         /*
1498                          *      Fill in the control structures
1499                          */
1500                         skb->protocol = htons(ETH_P_IPV6);
1501                         skb->ip_summed = csummode;
1502                         skb->csum = 0;
1503                         /* reserve for fragmentation and ipsec header */
1504                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1505                                     dst_exthdrlen);
1506
1507                         /* Only the initial fragment is time stamped */
1508                         skb_shinfo(skb)->tx_flags = tx_flags;
1509                         tx_flags = 0;
1510                         skb_shinfo(skb)->tskey = tskey;
1511                         tskey = 0;
1512
1513                         /*
1514                          *      Find where to start putting bytes
1515                          */
1516                         data = skb_put(skb, fraglen);
1517                         skb_set_network_header(skb, exthdrlen);
1518                         data += fragheaderlen;
1519                         skb->transport_header = (skb->network_header +
1520                                                  fragheaderlen);
1521                         if (fraggap) {
1522                                 skb->csum = skb_copy_and_csum_bits(
1523                                         skb_prev, maxfraglen,
1524                                         data + transhdrlen, fraggap, 0);
1525                                 skb_prev->csum = csum_sub(skb_prev->csum,
1526                                                           skb->csum);
1527                                 data += fraggap;
1528                                 pskb_trim_unique(skb_prev, maxfraglen);
1529                         }
1530                         if (copy > 0 &&
1531                             getfrag(from, data + transhdrlen, offset,
1532                                     copy, fraggap, skb) < 0) {
1533                                 err = -EFAULT;
1534                                 kfree_skb(skb);
1535                                 goto error;
1536                         }
1537
1538                         offset += copy;
1539                         length -= datalen - fraggap;
1540                         transhdrlen = 0;
1541                         exthdrlen = 0;
1542                         dst_exthdrlen = 0;
1543
1544                         /*
1545                          * Put the packet on the pending queue
1546                          */
1547                         __skb_queue_tail(queue, skb);
1548                         continue;
1549                 }
1550
1551                 if (copy > length)
1552                         copy = length;
1553
1554                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1555                     skb_tailroom(skb) >= copy) {
1556                         unsigned int off;
1557
1558                         off = skb->len;
1559                         if (getfrag(from, skb_put(skb, copy),
1560                                                 offset, copy, off, skb) < 0) {
1561                                 __skb_trim(skb, off);
1562                                 err = -EFAULT;
1563                                 goto error;
1564                         }
1565                 } else {
1566                         int i = skb_shinfo(skb)->nr_frags;
1567
1568                         err = -ENOMEM;
1569                         if (!sk_page_frag_refill(sk, pfrag))
1570                                 goto error;
1571
1572                         if (!skb_can_coalesce(skb, i, pfrag->page,
1573                                               pfrag->offset)) {
1574                                 err = -EMSGSIZE;
1575                                 if (i == MAX_SKB_FRAGS)
1576                                         goto error;
1577
1578                                 __skb_fill_page_desc(skb, i, pfrag->page,
1579                                                      pfrag->offset, 0);
1580                                 skb_shinfo(skb)->nr_frags = ++i;
1581                                 get_page(pfrag->page);
1582                         }
1583                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1584                         if (getfrag(from,
1585                                     page_address(pfrag->page) + pfrag->offset,
1586                                     offset, copy, skb->len, skb) < 0)
1587                                 goto error_efault;
1588
1589                         pfrag->offset += copy;
1590                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1591                         skb->len += copy;
1592                         skb->data_len += copy;
1593                         skb->truesize += copy;
1594                         atomic_add(copy, &sk->sk_wmem_alloc);
1595                 }
1596                 offset += copy;
1597                 length -= copy;
1598         }
1599
1600         return 0;
1601
1602 error_efault:
1603         err = -EFAULT;
1604 error:
1605         cork->length -= length;
1606         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1607         return err;
1608 }
1609
1610 int ip6_append_data(struct sock *sk,
1611                     int getfrag(void *from, char *to, int offset, int len,
1612                                 int odd, struct sk_buff *skb),
1613                     void *from, int length, int transhdrlen,
1614                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1615                     struct rt6_info *rt, unsigned int flags,
1616                     const struct sockcm_cookie *sockc)
1617 {
1618         struct inet_sock *inet = inet_sk(sk);
1619         struct ipv6_pinfo *np = inet6_sk(sk);
1620         int exthdrlen;
1621         int err;
1622
1623         if (flags&MSG_PROBE)
1624                 return 0;
1625         if (skb_queue_empty(&sk->sk_write_queue)) {
1626                 /*
1627                  * setup for corking
1628                  */
1629                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1630                                      ipc6, rt, fl6);
1631                 if (err)
1632                         return err;
1633
1634                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1635                 length += exthdrlen;
1636                 transhdrlen += exthdrlen;
1637         } else {
1638                 fl6 = &inet->cork.fl.u.ip6;
1639                 transhdrlen = 0;
1640         }
1641
1642         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1643                                  &np->cork, sk_page_frag(sk), getfrag,
1644                                  from, length, transhdrlen, flags, ipc6, sockc);
1645 }
1646 EXPORT_SYMBOL_GPL(ip6_append_data);
1647
1648 static void ip6_cork_release(struct inet_cork_full *cork,
1649                              struct inet6_cork *v6_cork)
1650 {
1651         if (v6_cork->opt) {
1652                 kfree(v6_cork->opt->dst0opt);
1653                 kfree(v6_cork->opt->dst1opt);
1654                 kfree(v6_cork->opt->hopopt);
1655                 kfree(v6_cork->opt->srcrt);
1656                 kfree(v6_cork->opt);
1657                 v6_cork->opt = NULL;
1658         }
1659
1660         if (cork->base.dst) {
1661                 dst_release(cork->base.dst);
1662                 cork->base.dst = NULL;
1663                 cork->base.flags &= ~IPCORK_ALLFRAG;
1664         }
1665         memset(&cork->fl, 0, sizeof(cork->fl));
1666 }
1667
/*
 *	__ip6_make_skb - collapse the queue of pending fragments into a single
 *	skb and prepend the corked extension headers plus the IPv6 header.
 *
 *	Takes ownership of every skb on @queue: the first dequeued skb becomes
 *	the head; the rest are chained onto its frag_list with their length and
 *	truesize accounting transferred to the head.  Releases the cork state
 *	on success.  Returns the finished skb, or NULL if @queue was empty.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Splice the remaining queued skbs onto the head's frag_list; the
	 * head absorbs their len/data_len/truesize, and their destructor/sk
	 * are cleared so only the head carries the socket accounting.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	/* Push any corked extension headers in front of the payload.
	 * ipv6_push_nfrag_opts() is handed &final_dst and may update it
	 * (e.g. when a routing header changes the on-wire destination).
	 */
	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	/* Finally prepend the fixed IPv6 header and fill it in from the
	 * cork (tclass, hop limit) and the flow (addresses, flowlabel).
	 */
	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
1741
1742 int ip6_send_skb(struct sk_buff *skb)
1743 {
1744         struct net *net = sock_net(skb->sk);
1745         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1746         int err;
1747
1748         err = ip6_local_out(net, skb->sk, skb);
1749         if (err) {
1750                 if (err > 0)
1751                         err = net_xmit_errno(err);
1752                 if (err)
1753                         IP6_INC_STATS(net, rt->rt6i_idev,
1754                                       IPSTATS_MIB_OUTDISCARDS);
1755         }
1756
1757         return err;
1758 }
1759
/*
 *	ip6_push_pending_frames - finalize the socket's pending write queue
 *	into one skb and transmit it.  An empty queue is not an error.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1771
1772 static void __ip6_flush_pending_frames(struct sock *sk,
1773                                        struct sk_buff_head *queue,
1774                                        struct inet_cork_full *cork,
1775                                        struct inet6_cork *v6_cork)
1776 {
1777         struct sk_buff *skb;
1778
1779         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1780                 if (skb_dst(skb))
1781                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1782                                       IPSTATS_MIB_OUTDISCARDS);
1783                 kfree_skb(skb);
1784         }
1785
1786         ip6_cork_release(cork, v6_cork);
1787 }
1788
1789 void ip6_flush_pending_frames(struct sock *sk)
1790 {
1791         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1792                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1793 }
1794 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1795
1796 struct sk_buff *ip6_make_skb(struct sock *sk,
1797                              int getfrag(void *from, char *to, int offset,
1798                                          int len, int odd, struct sk_buff *skb),
1799                              void *from, int length, int transhdrlen,
1800                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1801                              struct rt6_info *rt, unsigned int flags,
1802                              const struct sockcm_cookie *sockc)
1803 {
1804         struct inet_cork_full cork;
1805         struct inet6_cork v6_cork;
1806         struct sk_buff_head queue;
1807         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1808         int err;
1809
1810         if (flags & MSG_PROBE)
1811                 return NULL;
1812
1813         __skb_queue_head_init(&queue);
1814
1815         cork.base.flags = 0;
1816         cork.base.addr = 0;
1817         cork.base.opt = NULL;
1818         cork.base.dst = NULL;
1819         v6_cork.opt = NULL;
1820         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1821         if (err) {
1822                 ip6_cork_release(&cork, &v6_cork);
1823                 return ERR_PTR(err);
1824         }
1825         if (ipc6->dontfrag < 0)
1826                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1827
1828         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1829                                 &current->task_frag, getfrag, from,
1830                                 length + exthdrlen, transhdrlen + exthdrlen,
1831                                 flags, ipc6, sockc);
1832         if (err) {
1833                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1834                 return ERR_PTR(err);
1835         }
1836
1837         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1838 }