/* net/ipv6/ip6_output.c (GNU Linux-libre 4.14.290-gnu1) */
/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;
        struct in6_addr *nexthop;
        int ret;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_socket(net, skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

                if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
                    IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res < 0 || res == LWTUNNEL_XMIT_DONE)
                        return res;
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
        neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
        if (!IS_ERR(neigh)) {
                sock_confirm_neigh(skb, neigh);
                ret = neigh_output(neigh, skb);
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();

        IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}
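
/*
 * Hedged illustration (added commentary, not part of the original file):
 * the sk_mc_loop(sk) test at the top of ip6_finish_output2() is what a
 * userspace sender toggles with the IPV6_MULTICAST_LOOP socket option.
 * A minimal sketch, assuming the all-nodes group ff02::1 on interface
 * index 1 and port 9999 (both arbitrary):
 */
#if 0   /* userspace example, kept out of the kernel build */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int fd = socket(AF_INET6, SOCK_DGRAM, 0);
        unsigned int loop = 0;  /* 0: do not loop our packets back locally */
        struct sockaddr_in6 grp;

        if (fd < 0)
                return 1;
        /* clearing this makes sk_mc_loop() return false for this socket */
        setsockopt(fd, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, &loop, sizeof(loop));

        memset(&grp, 0, sizeof(grp));
        grp.sin6_family = AF_INET6;
        grp.sin6_port = htons(9999);
        inet_pton(AF_INET6, "ff02::1", &grp.sin6_addr);
        grp.sin6_scope_id = 1;  /* assumed interface index */

        sendto(fd, "hi", 2, 0, (struct sockaddr *)&grp, sizeof(grp));
        close(fd);
        return 0;
}
#endif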

static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
                                    struct sk_buff *skb, unsigned int mtu)
{
        struct sk_buff *segs, *nskb;
        netdev_features_t features;
        int ret = 0;

        /* Please see corresponding comment in ip_finish_output_gso
         * describing the cases where GSO segment length exceeds the
         * egress MTU.
         */
        features = netif_skb_features(skb);
        segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
        if (IS_ERR_OR_NULL(segs)) {
                kfree_skb(skb);
                return -ENOMEM;
        }

        consume_skb(skb);

        skb_list_walk_safe(segs, segs, nskb) {
                int err;

                skb_mark_not_on_list(segs);
                err = ip6_fragment(net, sk, segs, ip6_finish_output2);
                if (err && ret == 0)
                        ret = err;
        }

        return ret;
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        unsigned int mtu;
        int ret;

        ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        if (ret) {
                kfree_skb(skb);
                return ret;
        }

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm) {
                IP6CB(skb)->flags |= IP6SKB_REROUTED;
                return dst_output(net, sk, skb);
        }
#endif

        mtu = ip6_skb_dst_mtu(skb);
        if (skb_is_gso(skb) && !skb_gso_validate_mtu(skb, mtu))
                return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

        if ((skb->len > mtu && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);
        else
                return ip6_finish_output2(net, sk, skb);
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
        if (!np->autoflowlabel_set)
                return ip6_default_np_autolabel(net);
        else
                return np->autoflowlabel;
}
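
/*
 * Hedged illustration (added commentary, not part of the original file):
 * np->autoflowlabel_set is flipped when userspace sets IPV6_AUTOFLOWLABEL
 * explicitly; until then the per-namespace default (the
 * net.ipv6.auto_flowlabels sysctl) applies. A minimal userspace sketch:
 */
#if 0   /* userspace example, kept out of the kernel build */
#include <netinet/in.h>
#include <sys/socket.h>

#ifndef IPV6_AUTOFLOWLABEL
#define IPV6_AUTOFLOWLABEL 70   /* from linux/in6.h */
#endif

int force_autoflowlabel(int fd, int on)
{
        /* after this call, ip6_autoflowlabel() returns np->autoflowlabel
         * (the value set here) instead of the namespace default
         */
        return setsockopt(fd, IPPROTO_IPV6, IPV6_AUTOFLOWLABEL,
                          &on, sizeof(on));
}
#endif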

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : the socket lock is not held for SYNACK packets, but the skb might
 * still be modified by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        unsigned int head_room;
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
        if (opt)
                head_room += opt->opt_nflen + opt->opt_flen;

        if (unlikely(skb_headroom(skb) < head_room)) {
                struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                if (!skb2) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_OUTDISCARDS);
                        kfree_skb(skb);
                        return -ENOBUFS;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                consume_skb(skb);
                skb = skb2;
        }

        if (opt) {
                seg_len += opt->opt_nflen + opt->opt_flen;

                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);

                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
                                             &fl6->saddr);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                ip6_autoflowlabel(net, np), fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = sk->sk_priority;
        skb->mark = mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_OUT, skb->len);

                /* if the egress device is enslaved to an L3 master device,
                 * pass the skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* Hooks should never assume the socket lock is held;
                 * we promote our socket to non-const.
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dst->dev,
                               dst_output);
        }

        skb->dev = dst->dev;
        /* ipv6_local_error() does not require the socket lock;
         * we promote our socket to non-const.
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
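
/*
 * Hedged illustration (added commentary, not part of the original file):
 * the fixed header that ip6_xmit() builds packs version, traffic class and
 * flow label into the first 32 bits, and payload_len counts everything
 * after the 40-byte fixed header. A minimal userspace sketch of the same
 * packing (fill_ip6_fixed_header is a hypothetical helper name):
 */
#if 0   /* userspace example, kept out of the kernel build */
#include <arpa/inet.h>
#include <netinet/ip6.h>

static void fill_ip6_fixed_header(struct ip6_hdr *hdr, unsigned int tclass,
                                  unsigned int flowlabel, unsigned int paylen)
{
        /* 4-bit version (6), 8-bit traffic class, 20-bit flow label */
        hdr->ip6_flow = htonl((6u << 28) | ((tclass & 0xff) << 20) |
                              (flowlabel & 0xfffff));
        hdr->ip6_plen = htons(paylen);  /* excludes the 40-byte fixed header */
}
#endif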

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}
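
/*
 * Hedged illustration (added commentary, not part of the original file):
 * ip6_call_ra_chain() uses the "clone for every listener except the last,
 * hand the original to the last" pattern, which avoids any copy on the
 * common single-listener path. The same idea in plain C over a
 * hypothetical listener array (buf, buf_clone, fan_out are invented names):
 */
#if 0   /* userspace example, kept out of the kernel build */
#include <stdlib.h>
#include <string.h>

struct buf { char *data; size_t len; };

static struct buf *buf_clone(const struct buf *b)
{
        struct buf *c = malloc(sizeof(*c));

        if (!c)
                return NULL;
        c->data = malloc(b->len);
        if (!c->data) {
                free(c);
                return NULL;
        }
        memcpy(c->data, b->data, b->len);
        c->len = b->len;
        return c;
}

/* deliver() takes ownership of the buffer it is handed */
static void fan_out(struct buf *b, void (*deliver)(int id, struct buf *),
                    const int *ids, int n)
{
        int last = -1, i;

        for (i = 0; i < n; i++) {
                if (last >= 0) {
                        struct buf *copy = buf_clone(b);

                        if (copy)
                                deliver(last, copy);    /* earlier listener */
                }
                last = ids[i];
        }
        if (last >= 0)
                deliver(last, b);       /* original goes to the last one */
        /* note: if n == 0 the caller still owns b */
}
#endif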

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* A unicast neighbour discovery message destined
                         * to the proxied address is passed to the input
                         * function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

        return dst_output(net, sk, skb);
}

static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
        unsigned int mtu;
        struct inet6_dev *idev;

        if (dst_metric_locked(dst, RTAX_MTU)) {
                mtu = dst_metric_raw(dst, RTAX_MTU);
                if (mtu)
                        return mtu;
        }

        mtu = IPV6_MIN_MTU;
        rcu_read_lock();
        idev = __in6_dev_get(dst->dev);
        if (idev)
                mtu = idev->cnf.mtu6;
        rcu_read_unlock();

        return mtu;
}
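
/*
 * Hedged note (added commentary, not in the original file): the
 * dst_metric_locked(dst, RTAX_MTU) branch above honours a per-route MTU
 * pinned by the administrator, e.g. with something like
 * "ip -6 route add <prefix> via <gw> mtu lock 1280"; only when no locked
 * metric exists does the forwarding path fall back to the device's mtu6.
 */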

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets frag_max_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
                return false;

        return true;
}
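
/*
 * Hedged note (added commentary, not in the original file): read top to
 * bottom, the checks above declare a packet "too big" only if it exceeds
 * the MTU and none of the escape hatches apply: a conntrack-defragmented
 * packet whose original fragments already fit (frag_max_size <= mtu and
 * ignore_df set by defrag), a socket that opted out of path MTU discovery
 * (ignore_df), or a GSO skb whose individual segments will fit once the
 * device splits it.
 */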

int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *      We do not process RA packets; we push them to user level
         *      AS IS, without any guarantee that the application will be
         *      able to interpret them. The reason is that we
         *      cannot make anything clever here.
         *
         *      We are not an end node, so if the packet contains
         *      AH/ESP we cannot do anything with it.
         *      Defragmentation would also be a mistake; RA packets
         *      cannot be fragmented, because there is no guarantee
         *      that different fragments will go along one path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *      check and decrement the hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Make sure the outgoing device is used for the source
                 * address of the ICMP error.
                 */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                        IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* The IPv6 specs say nothing about it, but it is clear that we
           cannot send redirects for source-routed frames.
           We also don't send redirects for frames decapsulated from IPsec.
         */
        if (IP6CB(skb)->iif == dst->dev->ifindex &&
            opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *      incoming and outgoing devices are the same;
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_forward(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, make sure the outgoing device is used for the
                 * source address of the ICMP error.
                 */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Decrementing the hop limit is delayed until after the skb COW */

        hdr->hop_limit--;

        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}
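
/*
 * Hedged illustration (added commentary, not part of the original file):
 * the ICMPV6_PKT_TOOBIG that ip6_forward() emits is what a sending host
 * consumes for path MTU discovery. A userspace sketch of reading the
 * reported MTU from the error queue of a connected socket, assuming the
 * error has already arrived (print_path_mtu is an invented helper name):
 */
#if 0   /* userspace example, kept out of the kernel build */
#include <linux/errqueue.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

static void print_path_mtu(int fd)
{
        char cbuf[512], dummy[1];
        struct iovec iov = { dummy, sizeof(dummy) };
        struct msghdr msg;
        struct cmsghdr *cm;
        int on = 1;

        setsockopt(fd, IPPROTO_IPV6, IPV6_RECVERR, &on, sizeof(on));

        memset(&msg, 0, sizeof(msg));
        msg.msg_iov = &iov;
        msg.msg_iovlen = 1;
        msg.msg_control = cbuf;
        msg.msg_controllen = sizeof(cbuf);

        if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
                return;

        for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
                if (cm->cmsg_level == IPPROTO_IPV6 &&
                    cm->cmsg_type == IPV6_RECVERR) {
                        struct sock_extended_err *ee =
                                (struct sock_extended_err *)CMSG_DATA(cm);

                        if (ee->ee_origin == SO_EE_ORIGIN_ICMP6 &&
                            ee->ee_type == 2 /* Packet Too Big */)
                                printf("path mtu: %u\n", ee->ee_info);
                }
        }
}
#endif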

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

        skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_copy_secmark(to, from);
}

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len, nexthdr_offset;
        int hroom, troom;
        __be32 frag_id;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;
        nexthdr_offset = prevhdr - skb_network_header(skb);

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        prevhdr = skb_network_header(skb) + nexthdr_offset;
        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        err = -ENOMEM;
                        goto fail;
                }
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);

                __skb_pull(skb, hlen);
                fh = __skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                fh->identification = frag_id;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                for (;;) {
                        /* Prepare the header of the next frame,
                         * before the previous one is sent. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = __skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                kfree_skb_list(frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        troom = rt->dst.dev->needed_tailroom;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                u8 *fragnexthdr_offset;

                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }

                /* Allocate buffer */
                frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                 hroom + troom, GFP_ATOMIC);
                if (!frag) {
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                fragnexthdr_offset = skb_network_header(frag);
                fragnexthdr_offset += prevhdr - skb_network_header(skb);
                *fragnexthdr_offset = NEXTHDR_FRAGMENT;

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
                                     len));
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}
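
/*
 * Hedged illustration (added commentary, not part of the original file):
 * in both paths above, fh->frag_off carries the payload offset in octets
 * (always a multiple of 8, so the low three bits are free for flags) with
 * IP6_MF marking "more fragments", per RFC 8200. A minimal standalone
 * sketch of the same encoding and its inverse (EX_IP6_MF and the helper
 * names are invented for the example):
 */
#if 0   /* userspace example, kept out of the kernel build */
#include <arpa/inet.h>
#include <stdint.h>

#define EX_IP6_MF 0x0001        /* more-fragments flag, host order */

/* offset must be a multiple of 8; returns the field in network order */
static uint16_t encode_frag_off(unsigned int offset, int more)
{
        return htons((offset & ~7u) | (more ? EX_IP6_MF : 0));
}

static unsigned int decode_frag_offset(uint16_t frag_off_be)
{
        return ntohs(frag_off_be) & ~7u;        /* strip MF + reserved bits */
}
#endif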

static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the not-connected
         * case is not very simple. Take into account
         * that we do not support routing by source, TOS,
         * and MSG_DONTROUTE            --ANK (980726)
         *
         * 1. ip6_rt_check(): If the route was a host route,
         *    check that the cached destination is current.
         *    If it is a network route, we still may
         *    check its validity using a saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save the whole address now,
         *    (because the main consumer of this service
         *    is tcp, which does not have this problem),
         *    so the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
           (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
              (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                struct rt6_info *rt;
                bool had_dst = *dst != NULL;

                if (!had_dst)
                        *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if (!had_dst && (*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * If the dst entry we've looked up has a neighbour entry that
         * is in the INCOMPLETE state and the src address from the flow
         * is marked as OPTIMISTIC, we release the found dst entry and
         * replace it instead with the dst entry of the nexthop router.
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: network namespace to perform the lookup in
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @net: network namespace to perform the lookup in
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer-encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
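
/*
 * Hedged illustration (added commentary, not part of the original file):
 * because ip6_dst_lookup_flow() returns a pointer-encoded error rather
 * than NULL, a caller must unwrap it with IS_ERR()/PTR_ERR(). A minimal
 * sketch of a hypothetical caller (example_route_flow is an invented
 * name):
 */
#if 0   /* illustrative kernel-style fragment, not built */
static int example_route_flow(struct net *net, struct sock *sk,
                              struct flowi6 *fl6)
{
        struct dst_entry *dst;

        dst = ip6_dst_lookup_flow(net, sk, fl6, NULL);
        if (IS_ERR(dst))
                return PTR_ERR(dst);    /* no dst to release on error */

        /* ... transmit using dst ... */

        dst_release(dst);               /* drop the reference when done */
        return 0;
}
#endif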

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      It returns a valid dst pointer on success, or a pointer-encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (!dst)
                dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not the first; the header
                         * space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}
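
/*
 * Hedged worked example (added commentary, not in the original file):
 * with mtu = 1500 and fragheaderlen = 40 (bare IPv6 header, no extension
 * headers), the formula above gives
 *      ((1500 - 40) & ~7) + 40 - 8 = 1456 + 32 = 1488,
 * i.e. each non-final fragment carries 1488 - 40 = 1448 payload octets
 * after the 8-byte fragment header (40 + 8 + 1448 = 1496 on the wire),
 * keeping the fragment data 8-byte aligned.
 */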

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt, struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *opt = ipc6->opt;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
                if (unlikely(!v6_cork->opt))
                        return -ENOBUFS;

                v6_cork->opt->tot_len = sizeof(*opt);
                v6_cork->opt->opt_flen = opt->opt_flen;
                v6_cork->opt->opt_nflen = opt->opt_nflen;

                v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                    sk->sk_allocation);
                if (opt->dst0opt && !v6_cork->opt->dst0opt)
                        return -ENOBUFS;

                v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                    sk->sk_allocation);
                if (opt->dst1opt && !v6_cork->opt->dst1opt)
                        return -ENOBUFS;

                v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                   sk->sk_allocation);
                if (opt->hopopt && !v6_cork->opt->hopopt)
                        return -ENOBUFS;

                v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                    sk->sk_allocation);
                if (opt->srcrt && !v6_cork->opt->srcrt)
                        return -ENOBUFS;

                /* need source address above miyazawa */
        }
        dst_hold(&rt->dst);
        cork->base.dst = &rt->dst;
        cork->fl.u.ip6 = *fl6;
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        cork->base.fragsize = mtu;
        if (dst_allfrag(rt->dst.path))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        return 0;
}
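
/*
 * Hedged illustration (added commentary, not part of the original file):
 * the cork state set up above is what userspace engages on a UDP socket
 * with the UDP_CORK option: several send() calls are appended into one
 * datagram and flushed when the cork is removed. A minimal sketch,
 * assuming fd is a connected UDP socket (send_corked_pair is an invented
 * name):
 */
#if 0   /* userspace example, kept out of the kernel build */
#include <netinet/in.h>
#include <netinet/udp.h>
#include <sys/socket.h>

static int send_corked_pair(int fd)
{
        int on = 1, off = 0;

        if (setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on)) < 0)
                return -1;
        send(fd, "part1-", 6, 0);       /* appended to the pending datagram */
        send(fd, "part2", 5, 0);        /* still the same datagram */
        /* uncorking flushes one 11-byte datagram to the wire */
        return setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off));
}
#endif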
1276
1277 static int __ip6_append_data(struct sock *sk,
1278                              struct flowi6 *fl6,
1279                              struct sk_buff_head *queue,
1280                              struct inet_cork *cork,
1281                              struct inet6_cork *v6_cork,
1282                              struct page_frag *pfrag,
1283                              int getfrag(void *from, char *to, int offset,
1284                                          int len, int odd, struct sk_buff *skb),
1285                              void *from, int length, int transhdrlen,
1286                              unsigned int flags, struct ipcm6_cookie *ipc6,
1287                              const struct sockcm_cookie *sockc)
1288 {
1289         struct sk_buff *skb, *skb_prev = NULL;
1290         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1291         int exthdrlen = 0;
1292         int dst_exthdrlen = 0;
1293         int hh_len;
1294         int copy;
1295         int err;
1296         int offset = 0;
1297         __u8 tx_flags = 0;
1298         u32 tskey = 0;
1299         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1300         struct ipv6_txoptions *opt = v6_cork->opt;
1301         int csummode = CHECKSUM_NONE;
1302         unsigned int maxnonfragsize, headersize;
1303
1304         skb = skb_peek_tail(queue);
1305         if (!skb) {
1306                 exthdrlen = opt ? opt->opt_flen : 0;
1307                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1308         }
1309
1310         mtu = cork->fragsize;
1311         orig_mtu = mtu;
1312
1313         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1314
1315         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1316                         (opt ? opt->opt_nflen : 0);
1317
1318         headersize = sizeof(struct ipv6hdr) +
1319                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1320                      (dst_allfrag(&rt->dst) ?
1321                       sizeof(struct frag_hdr) : 0) +
1322                      rt->rt6i_nfheader_len;
1323
1324         if (mtu <= fragheaderlen ||
1325             ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1326                 goto emsgsize;
1327
1328         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1329                      sizeof(struct frag_hdr);
1330
1331         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1332          * the first fragment
1333          */
1334         if (headersize + transhdrlen > mtu)
1335                 goto emsgsize;
1336
1337         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1338             (sk->sk_protocol == IPPROTO_UDP ||
1339              sk->sk_protocol == IPPROTO_RAW)) {
1340                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1341                                 sizeof(struct ipv6hdr));
1342                 goto emsgsize;
1343         }
1344
1345         if (ip6_sk_ignore_df(sk))
1346                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1347         else
1348                 maxnonfragsize = mtu;
1349
1350         if (cork->length + length > maxnonfragsize - headersize) {
1351 emsgsize:
1352                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1353                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1354                 return -EMSGSIZE;
1355         }
1356
1357         /* CHECKSUM_PARTIAL only with no extension headers and when
1358          * we are not going to fragment
1359          */
1360         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1361             headersize == sizeof(struct ipv6hdr) &&
1362             length <= mtu - headersize &&
1363             !(flags & MSG_MORE) &&
1364             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1365                 csummode = CHECKSUM_PARTIAL;
1366
1367         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1368                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1369                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1370                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1371                         tskey = sk->sk_tskey++;
1372         }
1373
1374         /*
1375          * Let's try using as much space as possible.
1376          * Use MTU if total length of the message fits into the MTU.
1377          * Otherwise, we need to reserve fragment header and
1378          * fragment alignment (= 8-15 octects, in total).
1379          *
1380          * Note that we may need to "move" the data from the tail of
1381          * of the buffer to the new fragment when we split
1382          * the message.
1383          *
1384          * FIXME: It may be fragmented into multiple chunks
1385          *        at once if non-fragmentable extension headers
1386          *        are too large.
1387          * --yoshfuji
1388          */
1389
1390         cork->length += length;
1391         if (!skb)
1392                 goto alloc_new_skb;
1393
1394         while (length > 0) {
1395                 /* Check if the remaining data fits into current packet. */
1396                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1397                 if (copy < length)
1398                         copy = maxfraglen - skb->len;
1399
1400                 if (copy <= 0) {
1401                         char *data;
1402                         unsigned int datalen;
1403                         unsigned int fraglen;
1404                         unsigned int fraggap;
1405                         unsigned int alloclen;
1406 alloc_new_skb:
1407                         /* There's no room in the current skb */
1408                         if (skb)
1409                                 fraggap = skb->len - maxfraglen;
1410                         else
1411                                 fraggap = 0;
1412                         /* update mtu and maxfraglen if necessary */
1413                         if (!skb || !skb_prev)
1414                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1415                                                     fragheaderlen, skb, rt,
1416                                                     orig_mtu);
1417
1418                         skb_prev = skb;
1419
1420                         /*
1421                          * If remaining data exceeds the mtu,
1422                          * we know we need more fragment(s).
1423                          */
1424                         datalen = length + fraggap;
1425
1426                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1427                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1428                         if ((flags & MSG_MORE) &&
1429                             !(rt->dst.dev->features&NETIF_F_SG))
1430                                 alloclen = mtu;
1431                         else
1432                                 alloclen = datalen + fragheaderlen;
1433
1434                         alloclen += dst_exthdrlen;
1435
1436                         if (datalen != length + fraggap) {
1437                                 /*
1438                                  * If this is not the last fragment, the trailer
1439                                  * space is regarded as data space.
1440                                  */
1441                                 datalen += rt->dst.trailer_len;
1442                         }
1443
1444                         alloclen += rt->dst.trailer_len;
1445                         fraglen = datalen + fragheaderlen;
1446
1447                         /*
1448                          * We just reserve space for the fragment header.
1449                          * Note: this may be an overallocation if the message
1450                          * (without MSG_MORE) fits into the MTU.
1451                          */
1452                         alloclen += sizeof(struct frag_hdr);
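                        /* Accounting note (added commentary): alloclen now
                         * covers the payload plus per-fragment headers
                         * (either datalen + fragheaderlen, or a full mtu
                         * under MSG_MORE without scatter/gather), plus
                         * dst_exthdrlen for IPsec, the route's trailer_len,
                         * and space for a fragment header -- the last
                         * deliberately pessimistic, as noted above.
                         */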
1453
1454                         copy = datalen - transhdrlen - fraggap;
1455                         if (copy < 0) {
1456                                 err = -EINVAL;
1457                                 goto error;
1458                         }
1459                         if (transhdrlen) {
1460                                 skb = sock_alloc_send_skb(sk,
1461                                                 alloclen + hh_len,
1462                                                 (flags & MSG_DONTWAIT), &err);
1463                         } else {
1464                                 skb = NULL;
1465                                 if (refcount_read(&sk->sk_wmem_alloc) <=
1466                                     2 * sk->sk_sndbuf)
1467                                         skb = sock_wmalloc(sk,
1468                                                            alloclen + hh_len, 1,
1469                                                            sk->sk_allocation);
1470                                 if (unlikely(!skb))
1471                                         err = -ENOBUFS;
1472                         }
1473                         if (!skb)
1474                                 goto error;
1475                         /*
1476                          *      Fill in the control structures
1477                          */
1478                         skb->protocol = htons(ETH_P_IPV6);
1479                         skb->ip_summed = csummode;
1480                         skb->csum = 0;
1481                         /* reserve room for the fragment and IPsec headers */
1482                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1483                                     dst_exthdrlen);
1484
1485                         /* Only the initial fragment is time stamped */
1486                         skb_shinfo(skb)->tx_flags = tx_flags;
1487                         tx_flags = 0;
1488                         skb_shinfo(skb)->tskey = tskey;
1489                         tskey = 0;
1490
1491                         /*
1492                          *      Find where to start putting bytes
1493                          */
1494                         data = skb_put(skb, fraglen);
1495                         skb_set_network_header(skb, exthdrlen);
1496                         data += fragheaderlen;
1497                         skb->transport_header = (skb->network_header +
1498                                                  fragheaderlen);
1499                         if (fraggap) {
1500                                 skb->csum = skb_copy_and_csum_bits(
1501                                         skb_prev, maxfraglen,
1502                                         data + transhdrlen, fraggap, 0);
1503                                 skb_prev->csum = csum_sub(skb_prev->csum,
1504                                                           skb->csum);
1505                                 data += fraggap;
1506                                 pskb_trim_unique(skb_prev, maxfraglen);
1507                         }
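                        /* Descriptive note (added commentary): the fraggap
                         * bytes beyond maxfraglen in the previous skb were
                         * just copied to the head of this fragment, their
                         * checksum moved from skb_prev to skb via csum_sub,
                         * and skb_prev trimmed back to maxfraglen -- the
                         * "move the data from the tail" case described in
                         * the comment further up.
                         */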
1508                         if (copy > 0 &&
1509                             getfrag(from, data + transhdrlen, offset,
1510                                     copy, fraggap, skb) < 0) {
1511                                 err = -EFAULT;
1512                                 kfree_skb(skb);
1513                                 goto error;
1514                         }
1515
1516                         offset += copy;
1517                         length -= datalen - fraggap;
1518                         transhdrlen = 0;
1519                         exthdrlen = 0;
1520                         dst_exthdrlen = 0;
1521
1522                         if ((flags & MSG_CONFIRM) && !skb_prev)
1523                                 skb_set_dst_pending_confirm(skb, 1);
1524
1525                         /*
1526                          * Put the packet on the pending queue
1527                          */
1528                         __skb_queue_tail(queue, skb);
1529                         continue;
1530                 }
1531
1532                 if (copy > length)
1533                         copy = length;
1534
1535                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1536                     skb_tailroom(skb) >= copy) {
1537                         unsigned int off;
1538
1539                         off = skb->len;
1540                         if (getfrag(from, skb_put(skb, copy),
1541                                                 offset, copy, off, skb) < 0) {
1542                                 __skb_trim(skb, off);
1543                                 err = -EFAULT;
1544                                 goto error;
1545                         }
1546                 } else {
1547                         int i = skb_shinfo(skb)->nr_frags;
1548
1549                         err = -ENOMEM;
1550                         if (!sk_page_frag_refill(sk, pfrag))
1551                                 goto error;
1552
1553                         if (!skb_can_coalesce(skb, i, pfrag->page,
1554                                               pfrag->offset)) {
1555                                 err = -EMSGSIZE;
1556                                 if (i == MAX_SKB_FRAGS)
1557                                         goto error;
1558
1559                                 __skb_fill_page_desc(skb, i, pfrag->page,
1560                                                      pfrag->offset, 0);
1561                                 skb_shinfo(skb)->nr_frags = ++i;
1562                                 get_page(pfrag->page);
1563                         }
1564                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1565                         if (getfrag(from,
1566                                     page_address(pfrag->page) + pfrag->offset,
1567                                     offset, copy, skb->len, skb) < 0)
1568                                 goto error_efault;
1569
1570                         pfrag->offset += copy;
1571                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1572                         skb->len += copy;
1573                         skb->data_len += copy;
1574                         skb->truesize += copy;
1575                         refcount_add(copy, &sk->sk_wmem_alloc);
1576                 }
1577                 offset += copy;
1578                 length -= copy;
1579         }
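        /* Descriptive note (added commentary): the loop above appends user
         * data in one of two ways -- directly into the skb's linear
         * tailroom when the device lacks NETIF_F_SG and room remains, or
         * into shared page fragments (sk_page_frag_refill() +
         * skb_can_coalesce()) otherwise. If a new page fragment is needed
         * but the skb already holds MAX_SKB_FRAGS of them, the append
         * fails with -EMSGSIZE.
         */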
1580
1581         return 0;
1582
1583 error_efault:
1584         err = -EFAULT;
1585 error:
1586         cork->length -= length;
1587         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1588         return err;
1589 }
1590
1591 int ip6_append_data(struct sock *sk,
1592                     int getfrag(void *from, char *to, int offset, int len,
1593                                 int odd, struct sk_buff *skb),
1594                     void *from, int length, int transhdrlen,
1595                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1596                     struct rt6_info *rt, unsigned int flags,
1597                     const struct sockcm_cookie *sockc)
1598 {
1599         struct inet_sock *inet = inet_sk(sk);
1600         struct ipv6_pinfo *np = inet6_sk(sk);
1601         int exthdrlen;
1602         int err;
1603
1604         if (flags&MSG_PROBE)
1605                 return 0;
1606         if (skb_queue_empty(&sk->sk_write_queue)) {
1607                 /*
1608                  * Set up for corking.
1609                  */
1610                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1611                                      ipc6, rt, fl6);
1612                 if (err)
1613                         return err;
1614
1615                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1616                 length += exthdrlen;
1617                 transhdrlen += exthdrlen;
1618         } else {
1619                 fl6 = &inet->cork.fl.u.ip6;
1620                 transhdrlen = 0;
1621         }
1622
1623         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1624                                  &np->cork, sk_page_frag(sk), getfrag,
1625                                  from, length, transhdrlen, flags, ipc6, sockc);
1626 }
1627 EXPORT_SYMBOL_GPL(ip6_append_data);
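/* Usage sketch (added commentary; illustrative only, not code from this
 * file): a datagram protocol typically drives the corking machinery along
 * these lines, with getfrag() being its own copy callback (for example the
 * one udpv6_sendmsg() supplies) and "corked" standing in for whatever
 * per-socket cork tracking the caller keeps:
 *
 *      lock_sock(sk);
 *      err = ip6_append_data(sk, getfrag, msg, len, transhdrlen,
 *                            &ipc6, &fl6, rt, msg->msg_flags, &sockc);
 *      if (err)
 *              ip6_flush_pending_frames(sk);
 *      else if (!corked)
 *              err = ip6_push_pending_frames(sk);
 *      release_sock(sk);
 *
 * ip6_append_data() accumulates data on sk->sk_write_queue across calls;
 * ip6_push_pending_frames() turns the queue into one packet and sends it.
 */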
1628
1629 static void ip6_cork_release(struct inet_cork_full *cork,
1630                              struct inet6_cork *v6_cork)
1631 {
1632         if (v6_cork->opt) {
1633                 kfree(v6_cork->opt->dst0opt);
1634                 kfree(v6_cork->opt->dst1opt);
1635                 kfree(v6_cork->opt->hopopt);
1636                 kfree(v6_cork->opt->srcrt);
1637                 kfree(v6_cork->opt);
1638                 v6_cork->opt = NULL;
1639         }
1640
1641         if (cork->base.dst) {
1642                 dst_release(cork->base.dst);
1643                 cork->base.dst = NULL;
1644                 cork->base.flags &= ~IPCORK_ALLFRAG;
1645         }
1646         memset(&cork->fl, 0, sizeof(cork->fl));
1647 }
1648
1649 struct sk_buff *__ip6_make_skb(struct sock *sk,
1650                                struct sk_buff_head *queue,
1651                                struct inet_cork_full *cork,
1652                                struct inet6_cork *v6_cork)
1653 {
1654         struct sk_buff *skb, *tmp_skb;
1655         struct sk_buff **tail_skb;
1656         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1657         struct ipv6_pinfo *np = inet6_sk(sk);
1658         struct net *net = sock_net(sk);
1659         struct ipv6hdr *hdr;
1660         struct ipv6_txoptions *opt = v6_cork->opt;
1661         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1662         struct flowi6 *fl6 = &cork->fl.u.ip6;
1663         unsigned char proto = fl6->flowi6_proto;
1664
1665         skb = __skb_dequeue(queue);
1666         if (!skb)
1667                 goto out;
1668         tail_skb = &(skb_shinfo(skb)->frag_list);
1669
1670         /* move skb->data from the extension headers back to the IP header */
1671         if (skb->data < skb_network_header(skb))
1672                 __skb_pull(skb, skb_network_offset(skb));
1673         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1674                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1675                 *tail_skb = tmp_skb;
1676                 tail_skb = &(tmp_skb->next);
1677                 skb->len += tmp_skb->len;
1678                 skb->data_len += tmp_skb->len;
1679                 skb->truesize += tmp_skb->truesize;
1680                 tmp_skb->destructor = NULL;
1681                 tmp_skb->sk = NULL;
1682         }
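        /* Descriptive note (added commentary): any further queued skbs are
         * chained onto the head skb's frag_list rather than copied; their
         * byte counts are folded into skb->len/data_len/truesize, and
         * clearing destructor/sk hands their write-memory accounting over
         * to the head skb.
         */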
1683
1684         /* Allow local fragmentation. */
1685         skb->ignore_df = ip6_sk_ignore_df(sk);
1686
1687         *final_dst = fl6->daddr;
1688         __skb_pull(skb, skb_network_header_len(skb));
1689         if (opt && opt->opt_flen)
1690                 ipv6_push_frag_opts(skb, opt, &proto);
1691         if (opt && opt->opt_nflen)
1692                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1693
1694         skb_push(skb, sizeof(struct ipv6hdr));
1695         skb_reset_network_header(skb);
1696         hdr = ipv6_hdr(skb);
1697
1698         ip6_flow_hdr(hdr, v6_cork->tclass,
1699                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1700                                         ip6_autoflowlabel(net, np), fl6));
1701         hdr->hop_limit = v6_cork->hop_limit;
1702         hdr->nexthdr = proto;
1703         hdr->saddr = fl6->saddr;
1704         hdr->daddr = *final_dst;
1705
1706         skb->priority = sk->sk_priority;
1707         skb->mark = sk->sk_mark;
1708
1709         skb_dst_set(skb, dst_clone(&rt->dst));
1710         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1711         if (proto == IPPROTO_ICMPV6) {
1712                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1713
1714                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1715                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1716         }
1717
1718         ip6_cork_release(cork, v6_cork);
1719 out:
1720         return skb;
1721 }
1722
1723 int ip6_send_skb(struct sk_buff *skb)
1724 {
1725         struct net *net = sock_net(skb->sk);
1726         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1727         int err;
1728
1729         err = ip6_local_out(net, skb->sk, skb);
1730         if (err) {
1731                 if (err > 0)
1732                         err = net_xmit_errno(err);
1733                 if (err)
1734                         IP6_INC_STATS(net, rt->rt6i_idev,
1735                                       IPSTATS_MIB_OUTDISCARDS);
1736         }
1737
1738         return err;
1739 }
1740
1741 int ip6_push_pending_frames(struct sock *sk)
1742 {
1743         struct sk_buff *skb;
1744
1745         skb = ip6_finish_skb(sk);
1746         if (!skb)
1747                 return 0;
1748
1749         return ip6_send_skb(skb);
1750 }
1751 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1752
1753 static void __ip6_flush_pending_frames(struct sock *sk,
1754                                        struct sk_buff_head *queue,
1755                                        struct inet_cork_full *cork,
1756                                        struct inet6_cork *v6_cork)
1757 {
1758         struct sk_buff *skb;
1759
1760         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1761                 if (skb_dst(skb))
1762                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1763                                       IPSTATS_MIB_OUTDISCARDS);
1764                 kfree_skb(skb);
1765         }
1766
1767         ip6_cork_release(cork, v6_cork);
1768 }
1769
1770 void ip6_flush_pending_frames(struct sock *sk)
1771 {
1772         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1773                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1774 }
1775 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1776
1777 struct sk_buff *ip6_make_skb(struct sock *sk,
1778                              int getfrag(void *from, char *to, int offset,
1779                                          int len, int odd, struct sk_buff *skb),
1780                              void *from, int length, int transhdrlen,
1781                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1782                              struct rt6_info *rt, unsigned int flags,
1783                              const struct sockcm_cookie *sockc)
1784 {
1785         struct inet_cork_full cork;
1786         struct inet6_cork v6_cork;
1787         struct sk_buff_head queue;
1788         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1789         int err;
1790
1791         if (flags & MSG_PROBE)
1792                 return NULL;
1793
1794         __skb_queue_head_init(&queue);
1795
1796         cork.base.flags = 0;
1797         cork.base.addr = 0;
1798         cork.base.opt = NULL;
1799         cork.base.dst = NULL;
1800         v6_cork.opt = NULL;
1801         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1802         if (err) {
1803                 ip6_cork_release(&cork, &v6_cork);
1804                 return ERR_PTR(err);
1805         }
1806         if (ipc6->dontfrag < 0)
1807                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1808
1809         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1810                                 &current->task_frag, getfrag, from,
1811                                 length + exthdrlen, transhdrlen + exthdrlen,
1812                                 flags, ipc6, sockc);
1813         if (err) {
1814                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1815                 return ERR_PTR(err);
1816         }
1817
1818         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1819 }
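
/* Usage sketch (added commentary; illustrative only, not code from this
 * file): callers that avoid socket corking -- e.g. a lockless UDP send
 * path -- can build and transmit a single packet in one shot, since the
 * queue and cork state above live on the stack:
 *
 *      struct sk_buff *skb;
 *
 *      skb = ip6_make_skb(sk, getfrag, msg, len, transhdrlen,
 *                         &ipc6, &fl6, rt, msg->msg_flags, &sockc);
 *      if (IS_ERR(skb))
 *              err = PTR_ERR(skb);
 *      else if (skb)
 *              err = ip6_send_skb(skb);
 *
 * A NULL return means MSG_PROBE was set and nothing needed to be sent.
 */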