net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53 #define pr_fmt(fmt) "TCP: " fmt
  54
  55 #include <linux/bottom_half.h>
  56 #include <linux/types.h>
  57 #include <linux/fcntl.h>
  58 #include <linux/module.h>
  59 #include <linux/random.h>
  60 #include <linux/cache.h>
  61 #include <linux/jhash.h>
  62 #include <linux/init.h>
  63 #include <linux/times.h>
  64 #include <linux/slab.h>
  65
  66 #include <net/net_namespace.h>
  67 #include <net/icmp.h>
  68 #include <net/inet_hashtables.h>
  69 #include <net/tcp.h>
  70 #include <net/transp_v6.h>
  71 #include <net/ipv6.h>
  72 #include <net/inet_common.h>
  73 #include <net/timewait_sock.h>
  74 #include <net/xfrm.h>
  75 #include <net/secure_seq.h>
  76 #include <net/tcp_memcontrol.h>
  77 #include <net/busy_poll.h>
  78
  79 #include <linux/inet.h>
  80 #include <linux/ipv6.h>
  81 #include <linux/stddef.h>
  82 #include <linux/proc_fs.h>
  83 #include <linux/seq_file.h>
  84
  85 #include <linux/crypto.h>
  86 #include <linux/scatterlist.h>
  87
  88 int sysctl_tcp_tw_reuse __read_mostly;
  89 int sysctl_tcp_low_latency __read_mostly;
  90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
  91
  92 #ifdef CONFIG_TCP_MD5SIG
  93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  94                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
  95 #endif
  96
  97 struct inet_hashinfo tcp_hashinfo;
  98 EXPORT_SYMBOL(tcp_hashinfo);
  99
 100 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
 101 {
 102         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 103                                           ip_hdr(skb)->saddr,
 104                                           tcp_hdr(skb)->dest,
 105                                           tcp_hdr(skb)->source);
 106 }
 107
 108 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 109 {
 110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 111         struct tcp_sock *tp = tcp_sk(sk);
 112
 113         /* With PAWS, it is safe from the viewpoint
 114            of data integrity. Even without PAWS it is safe provided sequence
 115            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 116
 117            Actually, the idea is close to VJ's one, only timestamp cache is
 118            held not per host, but per port pair and TW bucket is used as state
 119            holder.
 120
 121            If TW bucket has been already destroyed we fall back to VJ's scheme
 122            and use initial timestamp retrieved from peer table.
 123          */
 124         if (tcptw->tw_ts_recent_stamp &&
 125             (!twp || (sysctl_tcp_tw_reuse &&
 126                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 127                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 128                 if (tp->write_seq == 0)
 129                         tp->write_seq = 1;
 130                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 131                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 132                 sock_hold(sktw);
 133                 return 1;
 134         }
 135
 136         return 0;
 137 }
 138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 139
 140 /* This will initiate an outgoing connection. */
 141 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 142 {
 143         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 144         struct inet_sock *inet = inet_sk(sk);
 145         struct tcp_sock *tp = tcp_sk(sk);
 146         __be16 orig_sport, orig_dport;
 147         __be32 daddr, nexthop;
 148         struct flowi4 *fl4;
 149         struct rtable *rt;
 150         int err;
 151         struct ip_options_rcu *inet_opt;
 152
 153         if (addr_len < sizeof(struct sockaddr_in))
 154                 return -EINVAL;
 155
 156         if (usin->sin_family != AF_INET)
 157                 return -EAFNOSUPPORT;
 158
 159         nexthop = daddr = usin->sin_addr.s_addr;
 160         inet_opt = rcu_dereference_protected(inet->inet_opt,
 161                                              sock_owned_by_user(sk));
 162         if (inet_opt && inet_opt->opt.srr) {
 163                 if (!daddr)
 164                         return -EINVAL;
 165                 nexthop = inet_opt->opt.faddr;
 166         }
 167
 168         orig_sport = inet->inet_sport;
 169         orig_dport = usin->sin_port;
 170         fl4 = &inet->cork.fl.u.ip4;
 171         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 172                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 173                               IPPROTO_TCP,
 174                               orig_sport, orig_dport, sk);
 175         if (IS_ERR(rt)) {
 176                 err = PTR_ERR(rt);
 177                 if (err == -ENETUNREACH)
 178                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 179                 return err;
 180         }
 181
 182         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 183                 ip_rt_put(rt);
 184                 return -ENETUNREACH;
 185         }
 186
 187         if (!inet_opt || !inet_opt->opt.srr)
 188                 daddr = fl4->daddr;
 189
 190         if (!inet->inet_saddr)
 191                 inet->inet_saddr = fl4->saddr;
 192         sk_rcv_saddr_set(sk, inet->inet_saddr);
 193
 194         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 195                 /* Reset inherited state */
 196                 tp->rx_opt.ts_recent       = 0;
 197                 tp->rx_opt.ts_recent_stamp = 0;
 198                 if (likely(!tp->repair))
 199                         tp->write_seq      = 0;
 200         }
 201
 202         if (tcp_death_row.sysctl_tw_recycle &&
 203             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
 204                 tcp_fetch_timewait_stamp(sk, &rt->dst);
 205
 206         inet->inet_dport = usin->sin_port;
 207         sk_daddr_set(sk, daddr);
 208
 209         inet_csk(sk)->icsk_ext_hdr_len = 0;
 210         if (inet_opt)
 211                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 212
 213         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 214
 215         /* Socket identity is still unknown (sport may be zero).
 216          * However we set state to SYN-SENT and not releasing socket
 217          * lock select source port, enter ourselves into the hash tables and
 218          * complete initialization after this.
 219          */
 220         tcp_set_state(sk, TCP_SYN_SENT);
 221         err = inet_hash_connect(&tcp_death_row, sk);
 222         if (err)
 223                 goto failure;
 224
 225         sk_set_txhash(sk);
 226
 227         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 228                                inet->inet_sport, inet->inet_dport, sk);
 229         if (IS_ERR(rt)) {
 230                 err = PTR_ERR(rt);
 231                 rt = NULL;
 232                 goto failure;
 233         }
 234         /* OK, now commit destination to socket.  */
 235         sk->sk_gso_type = SKB_GSO_TCPV4;
 236         sk_setup_caps(sk, &rt->dst);
 237
 238         if (!tp->write_seq && likely(!tp->repair))
 239                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 240                                                            inet->inet_daddr,
 241                                                            inet->inet_sport,
 242                                                            usin->sin_port);
 243
 244         inet->inet_id = prandom_u32();
 245
 246         err = tcp_connect(sk);
 247
 248         rt = NULL;
 249         if (err)
 250                 goto failure;
 251
 252         return 0;
 253
 254 failure:
 255         /*
 256          * This unhashes the socket and releases the local port,
 257          * if necessary.
 258          */
 259         tcp_set_state(sk, TCP_CLOSE);
 260         ip_rt_put(rt);
 261         sk->sk_route_caps = 0;
 262         inet->inet_dport = 0;
 263         return err;
 264 }
 265 EXPORT_SYMBOL(tcp_v4_connect);
 266
 267 /*
 268  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 269  * It can be called through tcp_release_cb() if socket was owned by user
 270  * at the time tcp_v4_err() was called to handle ICMP message.
 271  */
 272 void tcp_v4_mtu_reduced(struct sock *sk)
 273 {
 274         struct inet_sock *inet = inet_sk(sk);
 275         struct dst_entry *dst;
 276         u32 mtu;
 277
 278         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 279                 return;
 280         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
 281         dst = inet_csk_update_pmtu(sk, mtu);
 282         if (!dst)
 283                 return;
 284
 285         /* Something is about to be wrong... Remember soft error
 286          * for the case, if this connection will not able to recover.
 287          */
 288         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 289                 sk->sk_err_soft = EMSGSIZE;
 290
 291         mtu = dst_mtu(dst);
 292
 293         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 294             ip_sk_accept_pmtu(sk) &&
 295             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 296                 tcp_sync_mss(sk, mtu);
 297
 298                 /* Resend the TCP packet because it's
 299                  * clear that the old packet has been
 300                  * dropped. This is the new "fast" path mtu
 301                  * discovery.
 302                  */
 303                 tcp_simple_retransmit(sk);
 304         } /* else let the usual retransmit timer handle it */
 305 }
 306 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 307
 308 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 309 {
 310         struct dst_entry *dst = __sk_dst_check(sk, 0);
 311
 312         if (dst)
 313                 dst->ops->redirect(dst, sk, skb);
 314 }
 315
 316
 317 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 318 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 319 {
 320         struct request_sock *req = inet_reqsk(sk);
 321         struct net *net = sock_net(sk);
 322
 323         /* ICMPs are not backlogged, hence we cannot get
 324          * an established socket here.
 325          */
 326         if (seq != tcp_rsk(req)->snt_isn) {
 327                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 328         } else if (abort) {
 329                 /*
 330                  * Still in SYN_RECV, just remove it silently.
 331                  * There is no good way to pass the error to the newly
 332                  * created socket, and POSIX does not want network
 333                  * errors returned from accept().
 334                  */
 335                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 336                 NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
 337         }
 338         reqsk_put(req);
 339 }
 340 EXPORT_SYMBOL(tcp_req_err);
 341
 342 /*
 343  * This routine is called by the ICMP module when it gets some
 344  * sort of error condition.  If err < 0 then the socket should
 345  * be closed and the error returned to the user.  If err > 0
 346  * it's just the icmp type << 8 | icmp code.  After adjustment
 347  * header points to the first 8 bytes of the tcp header.  We need
 348  * to find the appropriate port.
 349  *
 350  * The locking strategy used here is very "optimistic". When
 351  * someone else accesses the socket the ICMP is just dropped
 352  * and for some paths there is no check at all.
 353  * A more general error queue to queue errors for later handling
 354  * is probably better.
 355  *
 356  */
 357
 358 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 359 {
 360         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 361         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 362         struct inet_connection_sock *icsk;
 363         struct tcp_sock *tp;
 364         struct inet_sock *inet;
 365         const int type = icmp_hdr(icmp_skb)->type;
 366         const int code = icmp_hdr(icmp_skb)->code;
 367         struct sock *sk;
 368         struct sk_buff *skb;
 369         struct request_sock *fastopen;
 370         __u32 seq, snd_una;
 371         __u32 remaining;
 372         int err;
 373         struct net *net = dev_net(icmp_skb->dev);
 374
 375         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 376                                        th->dest, iph->saddr, ntohs(th->source),
 377                                        inet_iif(icmp_skb));
 378         if (!sk) {
 379                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 380                 return;
 381         }
 382         if (sk->sk_state == TCP_TIME_WAIT) {
 383                 inet_twsk_put(inet_twsk(sk));
 384                 return;
 385         }
 386         seq = ntohl(th->seq);
 387         if (sk->sk_state == TCP_NEW_SYN_RECV)
 388                 return tcp_req_err(sk, seq,
 389                                   type == ICMP_PARAMETERPROB ||
 390                                   type == ICMP_TIME_EXCEEDED ||
 391                                   (type == ICMP_DEST_UNREACH &&
 392                                    (code == ICMP_NET_UNREACH ||
 393                                     code == ICMP_HOST_UNREACH)));
 394
 395         bh_lock_sock(sk);
 396         /* If too many ICMPs get dropped on busy
 397          * servers this needs to be solved differently.
 398          * We do take care of PMTU discovery (RFC1191) special case :
 399          * we can receive locally generated ICMP messages while socket is held.
 400          */
 401         if (sock_owned_by_user(sk)) {
 402                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 403                         NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 404         }
 405         if (sk->sk_state == TCP_CLOSE)
 406                 goto out;
 407
 408         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 409                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 410                 goto out;
 411         }
 412
 413         icsk = inet_csk(sk);
 414         tp = tcp_sk(sk);
 415         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 416         fastopen = tp->fastopen_rsk;
 417         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 418         if (sk->sk_state != TCP_LISTEN &&
 419             !between(seq, snd_una, tp->snd_nxt)) {
 420                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 421                 goto out;
 422         }
 423
 424         switch (type) {
 425         case ICMP_REDIRECT:
 426                 if (!sock_owned_by_user(sk))
 427                         do_redirect(icmp_skb, sk);
 428                 goto out;
 429         case ICMP_SOURCE_QUENCH:
 430                 /* Just silently ignore these. */
 431                 goto out;
 432         case ICMP_PARAMETERPROB:
 433                 err = EPROTO;
 434                 break;
 435         case ICMP_DEST_UNREACH:
 436                 if (code > NR_ICMP_UNREACH)
 437                         goto out;
 438
 439                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 440                         /* We are not interested in TCP_LISTEN and open_requests
 441                          * (SYN-ACKs send out by Linux are always <576bytes so
 442                          * they should go through unfragmented).
 443                          */
 444                         if (sk->sk_state == TCP_LISTEN)
 445                                 goto out;
 446
 447                         WRITE_ONCE(tp->mtu_info, info);
 448                         if (!sock_owned_by_user(sk)) {
 449                                 tcp_v4_mtu_reduced(sk);
 450                         } else {
 451                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
 452                                         sock_hold(sk);
 453                         }
 454                         goto out;
 455                 }
 456
 457                 err = icmp_err_convert[code].errno;
 458                 /* check if icmp_skb allows revert of backoff
 459                  * (see draft-zimmermann-tcp-lcd) */
 460                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 461                         break;
 462                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 463                     !icsk->icsk_backoff || fastopen)
 464                         break;
 465
 466                 if (sock_owned_by_user(sk))
 467                         break;
 468
 469                 skb = tcp_write_queue_head(sk);
 470                 if (WARN_ON_ONCE(!skb))
 471                         break;
 472
 473                 icsk->icsk_backoff--;
 474                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 475                                                TCP_TIMEOUT_INIT;
 476                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 477
 478                 remaining = icsk->icsk_rto -
 479                             min(icsk->icsk_rto,
 480                                 tcp_time_stamp - tcp_skb_timestamp(skb));
 481
 482                 if (remaining) {
 483                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 484                                                   remaining, TCP_RTO_MAX);
 485                 } else {
 486                         /* RTO revert clocked out retransmission.
 487                          * Will retransmit now */
 488                         tcp_retransmit_timer(sk);
 489                 }
 490
 491                 break;
 492         case ICMP_TIME_EXCEEDED:
 493                 err = EHOSTUNREACH;
 494                 break;
 495         default:
 496                 goto out;
 497         }
 498
 499         switch (sk->sk_state) {
 500         case TCP_SYN_SENT:
 501         case TCP_SYN_RECV:
 502                 /* Only in fast or simultaneous open. If a fast open socket is
 503                  * is already accepted it is treated as a connected one below.
 504                  */
 505                 if (fastopen && !fastopen->sk)
 506                         break;
 507
 508                 if (!sock_owned_by_user(sk)) {
 509                         sk->sk_err = err;
 510
 511                         sk->sk_error_report(sk);
 512
 513                         tcp_done(sk);
 514                 } else {
 515                         sk->sk_err_soft = err;
 516                 }
 517                 goto out;
 518         }
 519
 520         /* If we've already connected we will keep trying
 521          * until we time out, or the user gives up.
 522          *
 523          * rfc1122 4.2.3.9 allows to consider as hard errors
 524          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 525          * but it is obsoleted by pmtu discovery).
 526          *
 527          * Note, that in modern internet, where routing is unreliable
 528          * and in each dark corner broken firewalls sit, sending random
 529          * errors ordered by their masters even this two messages finally lose
 530          * their original sense (even Linux sends invalid PORT_UNREACHs)
 531          *
 532          * Now we are in compliance with RFCs.
 533          *                                                      --ANK (980905)
 534          */
 535
 536         inet = inet_sk(sk);
 537         if (!sock_owned_by_user(sk) && inet->recverr) {
 538                 sk->sk_err = err;
 539                 sk->sk_error_report(sk);
 540         } else  { /* Only an error on timeout */
 541                 sk->sk_err_soft = err;
 542         }
 543
 544 out:
 545         bh_unlock_sock(sk);
 546         sock_put(sk);
 547 }
 548
 549 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 550 {
 551         struct tcphdr *th = tcp_hdr(skb);
 552
 553         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 554                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 555                 skb->csum_start = skb_transport_header(skb) - skb->head;
 556                 skb->csum_offset = offsetof(struct tcphdr, check);
 557         } else {
 558                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 559                                          csum_partial(th,
 560                                                       th->doff << 2,
 561                                                       skb->csum));
 562         }
 563 }
 564
 565 /* This routine computes an IPv4 TCP checksum. */
 566 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 567 {
 568         const struct inet_sock *inet = inet_sk(sk);
 569
 570         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 571 }
 572 EXPORT_SYMBOL(tcp_v4_send_check);
 573
 574 /*
 575  *      This routine will send an RST to the other tcp.
 576  *
 577  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 578  *                    for reset.
 579  *      Answer: if a packet caused RST, it is not for a socket
 580  *              existing in our system, if it is matched to a socket,
 581  *              it is just duplicate segment or bug in other side's TCP.
 582  *              So that we build reply only basing on parameters
 583  *              arrived with segment.
 584  *      Exception: precedence violation. We do not implement it in any case.
 585  */
 586
 587 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 588 {
 589         const struct tcphdr *th = tcp_hdr(skb);
 590         struct {
 591                 struct tcphdr th;
 592 #ifdef CONFIG_TCP_MD5SIG
 593                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 594 #endif
 595         } rep;
 596         struct ip_reply_arg arg;
 597 #ifdef CONFIG_TCP_MD5SIG
 598         struct tcp_md5sig_key *key;
 599         const __u8 *hash_location = NULL;
 600         unsigned char newhash[16];
 601         int genhash;
 602         struct sock *sk1 = NULL;
 603 #endif
 604         struct net *net;
 605
 606         /* Never send a reset in response to a reset. */
 607         if (th->rst)
 608                 return;
 609
 610         /* If sk not NULL, it means we did a successful lookup and incoming
 611          * route had to be correct. prequeue might have dropped our dst.
 612          */
 613         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 614                 return;
 615
 616         /* Swap the send and the receive. */
 617         memset(&rep, 0, sizeof(rep));
 618         rep.th.dest   = th->source;
 619         rep.th.source = th->dest;
 620         rep.th.doff   = sizeof(struct tcphdr) / 4;
 621         rep.th.rst    = 1;
 622
 623         if (th->ack) {
 624                 rep.th.seq = th->ack_seq;
 625         } else {
 626                 rep.th.ack = 1;
 627                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 628                                        skb->len - (th->doff << 2));
 629         }
 630
 631         memset(&arg, 0, sizeof(arg));
 632         arg.iov[0].iov_base = (unsigned char *)&rep;
 633         arg.iov[0].iov_len  = sizeof(rep.th);
 634
 635         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 636 #ifdef CONFIG_TCP_MD5SIG
 637         hash_location = tcp_parse_md5sig_option(th);
 638         if (!sk && hash_location) {
 639                 /*
 640                  * active side is lost. Try to find listening socket through
 641                  * source port, and then find md5 key through listening socket.
 642                  * we are not loose security here:
 643                  * Incoming packet is checked with md5 hash with finding key,
 644                  * no RST generated if md5 hash doesn't match.
 645                  */
 646                 sk1 = __inet_lookup_listener(net,
 647                                              &tcp_hashinfo, ip_hdr(skb)->saddr,
 648                                              th->source, ip_hdr(skb)->daddr,
 649                                              ntohs(th->source), inet_iif(skb));
 650                 /* don't send rst if it can't find key */
 651                 if (!sk1)
 652                         return;
 653                 rcu_read_lock();
 654                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 655                                         &ip_hdr(skb)->saddr, AF_INET);
 656                 if (!key)
 657                         goto release_sk1;
 658
 659                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 660                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
 661                         goto release_sk1;
 662         } else {
 663                 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 664                                              &ip_hdr(skb)->saddr,
 665                                              AF_INET) : NULL;
 666         }
 667
 668         if (key) {
 669                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 670                                    (TCPOPT_NOP << 16) |
 671                                    (TCPOPT_MD5SIG << 8) |
 672                                    TCPOLEN_MD5SIG);
 673                 /* Update length and the length the header thinks exists */
 674                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 675                 rep.th.doff = arg.iov[0].iov_len / 4;
 676
 677                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 678                                      key, ip_hdr(skb)->saddr,
 679                                      ip_hdr(skb)->daddr, &rep.th);
 680         }
 681 #endif
 682         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 683                                       ip_hdr(skb)->saddr, /* XXX */
 684                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 685         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 686         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 687         /* When socket is gone, all binding information is lost.
 688          * routing might fail in this case. No choice here, if we choose to force
 689          * input interface, we will misroute in case of asymmetric route.
 690          */
 691         if (sk)
 692                 arg.bound_dev_if = sk->sk_bound_dev_if;
 693
 694         arg.tos = ip_hdr(skb)->tos;
 695         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
 696                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 697                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 698                               &arg, arg.iov[0].iov_len);
 699
 700         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 701         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 702
 703 #ifdef CONFIG_TCP_MD5SIG
 704 release_sk1:
 705         if (sk1) {
 706                 rcu_read_unlock();
 707                 sock_put(sk1);
 708         }
 709 #endif
 710 }
 711
 712 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 713    outside socket context is ugly, certainly. What can I do?
 714  */
 715
 716 static void tcp_v4_send_ack(struct net *net,
 717                             struct sk_buff *skb, u32 seq, u32 ack,
 718                             u32 win, u32 tsval, u32 tsecr, int oif,
 719                             struct tcp_md5sig_key *key,
 720                             int reply_flags, u8 tos)
 721 {
 722         const struct tcphdr *th = tcp_hdr(skb);
 723         struct {
 724                 struct tcphdr th;
 725                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 726 #ifdef CONFIG_TCP_MD5SIG
 727                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 728 #endif
 729                         ];
 730         } rep;
 731         struct ip_reply_arg arg;
 732
 733         memset(&rep.th, 0, sizeof(struct tcphdr));
 734         memset(&arg, 0, sizeof(arg));
 735
 736         arg.iov[0].iov_base = (unsigned char *)&rep;
 737         arg.iov[0].iov_len  = sizeof(rep.th);
 738         if (tsecr) {
 739                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 740                                    (TCPOPT_TIMESTAMP << 8) |
 741                                    TCPOLEN_TIMESTAMP);
 742                 rep.opt[1] = htonl(tsval);
 743                 rep.opt[2] = htonl(tsecr);
 744                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 745         }
 746
 747         /* Swap the send and the receive. */
 748         rep.th.dest    = th->source;
 749         rep.th.source  = th->dest;
 750         rep.th.doff    = arg.iov[0].iov_len / 4;
 751         rep.th.seq     = htonl(seq);
 752         rep.th.ack_seq = htonl(ack);
 753         rep.th.ack     = 1;
 754         rep.th.window  = htons(win);
 755
 756 #ifdef CONFIG_TCP_MD5SIG
 757         if (key) {
 758                 int offset = (tsecr) ? 3 : 0;
 759
 760                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 761                                           (TCPOPT_NOP << 16) |
 762                                           (TCPOPT_MD5SIG << 8) |
 763                                           TCPOLEN_MD5SIG);
 764                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 765                 rep.th.doff = arg.iov[0].iov_len/4;
 766
 767                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 768                                     key, ip_hdr(skb)->saddr,
 769                                     ip_hdr(skb)->daddr, &rep.th);
 770         }
 771 #endif
 772         arg.flags = reply_flags;
 773         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 774                                       ip_hdr(skb)->saddr, /* XXX */
 775                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 776         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 777         if (oif)
 778                 arg.bound_dev_if = oif;
 779         arg.tos = tos;
 780         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
 781                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 782                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 783                               &arg, arg.iov[0].iov_len);
 784
 785         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 786 }
 787
 788 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 789 {
 790         struct inet_timewait_sock *tw = inet_twsk(sk);
 791         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 792
 793         tcp_v4_send_ack(sock_net(sk), skb,
 794                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 795                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 796                         tcp_time_stamp + tcptw->tw_ts_offset,
 797                         tcptw->tw_ts_recent,
 798                         tw->tw_bound_dev_if,
 799                         tcp_twsk_md5_key(tcptw),
 800                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 801                         tw->tw_tos
 802                         );
 803
 804         inet_twsk_put(tw);
 805 }
 806
 807 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 808                                   struct request_sock *req)
 809 {
 810         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 811          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 812          */
 813         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 814                                              tcp_sk(sk)->snd_nxt;
 815
 816         /* RFC 7323 2.3
 817          * The window field (SEG.WND) of every outgoing segment, with the
 818          * exception of <SYN> segments, MUST be right-shifted by
 819          * Rcv.Wind.Shift bits:
 820          */
 821         tcp_v4_send_ack(sock_net(sk), skb, seq,
 822                         tcp_rsk(req)->rcv_nxt,
 823                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 824                         tcp_time_stamp,
 825                         req->ts_recent,
 826                         0,
 827                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
 828                                           AF_INET),
 829                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 830                         ip_hdr(skb)->tos);
 831 }
 832
 833 /*
 834  *      Send a SYN-ACK after having received a SYN.
 835  *      This still operates on a request_sock only, not on a big
 836  *      socket.
 837  */
 838 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 839                               struct flowi *fl,
 840                               struct request_sock *req,
 841                               struct tcp_fastopen_cookie *foc,
 842                                   bool attach_req)
 843 {
 844         const struct inet_request_sock *ireq = inet_rsk(req);
 845         struct flowi4 fl4;
 846         int err = -1;
 847         struct sk_buff *skb;
 848
 849         /* First, grab a route. */
 850         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 851                 return -1;
 852
 853         skb = tcp_make_synack(sk, dst, req, foc, attach_req);
 854
 855         if (skb) {
 856                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 857
 858                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 859                                             ireq->ir_rmt_addr,
 860                                             ireq_opt_deref(ireq));
 861                 err = net_xmit_eval(err);
 862         }
 863
 864         return err;
 865 }
 866
 867 /*
 868  *      IPv4 request_sock destructor.
 869  */
 870 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 871 {
 872         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
 873 }
 874
 875
 876 #ifdef CONFIG_TCP_MD5SIG
 877 /*
 878  * RFC2385 MD5 checksumming requires a mapping of
 879  * IP address->MD5 Key.
 880  * We need to maintain these in the sk structure.
 881  */
 882
 883 /* Find the Key structure for an address.  */
 884 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
 885                                          const union tcp_md5_addr *addr,
 886                                          int family)
 887 {
 888         const struct tcp_sock *tp = tcp_sk(sk);
 889         struct tcp_md5sig_key *key;
 890         unsigned int size = sizeof(struct in_addr);
 891         const struct tcp_md5sig_info *md5sig;
 892
 893         /* caller either holds rcu_read_lock() or socket lock */
 894         md5sig = rcu_dereference_check(tp->md5sig_info,
 895                                        sock_owned_by_user(sk) ||
 896                                        lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
 897         if (!md5sig)
 898                 return NULL;
 899 #if IS_ENABLED(CONFIG_IPV6)
 900         if (family == AF_INET6)
 901                 size = sizeof(struct in6_addr);
 902 #endif
 903         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 904                 if (key->family != family)
 905                         continue;
 906                 if (!memcmp(&key->addr, addr, size))
 907                         return key;
 908         }
 909         return NULL;
 910 }
 911 EXPORT_SYMBOL(tcp_md5_do_lookup);
 912
 913 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
 914                                          const struct sock *addr_sk)
 915 {
 916         const union tcp_md5_addr *addr;
 917
 918         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
 919         return tcp_md5_do_lookup(sk, addr, AF_INET);
 920 }
 921 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 922
 923 /* This can be called on a newly created socket, from other files */
 924 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 925                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
 926 {
 927         /* Add Key to the list */
 928         struct tcp_md5sig_key *key;
 929         struct tcp_sock *tp = tcp_sk(sk);
 930         struct tcp_md5sig_info *md5sig;
 931
 932         key = tcp_md5_do_lookup(sk, addr, family);
 933         if (key) {
 934                 /* Pre-existing entry - just update that one.
 935                  * Note that the key might be used concurrently.
 936                  */
 937                 memcpy(key->key, newkey, newkeylen);
 938
 939                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
 940                  * Also note that a reader could catch new key->keylen value
 941                  * but old key->key[], this is the reason we use __GFP_ZERO
 942                  * at sock_kmalloc() time below these lines.
 943                  */
 944                 WRITE_ONCE(key->keylen, newkeylen);
 945
 946                 return 0;
 947         }
 948
 949         md5sig = rcu_dereference_protected(tp->md5sig_info,
 950                                            sock_owned_by_user(sk) ||
 951                                            lockdep_is_held(&sk->sk_lock.slock));
 952         if (!md5sig) {
 953                 md5sig = kmalloc(sizeof(*md5sig), gfp);
 954                 if (!md5sig)
 955                         return -ENOMEM;
 956
 957                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 958                 INIT_HLIST_HEAD(&md5sig->head);
 959                 rcu_assign_pointer(tp->md5sig_info, md5sig);
 960         }
 961
 962         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
 963         if (!key)
 964                 return -ENOMEM;
 965         if (!tcp_alloc_md5sig_pool()) {
 966                 sock_kfree_s(sk, key, sizeof(*key));
 967                 return -ENOMEM;
 968         }
 969
 970         memcpy(key->key, newkey, newkeylen);
 971         key->keylen = newkeylen;
 972         key->family = family;
 973         memcpy(&key->addr, addr,
 974                (family == AF_INET6) ? sizeof(struct in6_addr) :
 975                                       sizeof(struct in_addr));
 976         hlist_add_head_rcu(&key->node, &md5sig->head);
 977         return 0;
 978 }
 979 EXPORT_SYMBOL(tcp_md5_do_add);
 980
 981 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
 982 {
 983         struct tcp_md5sig_key *key;
 984
 985         key = tcp_md5_do_lookup(sk, addr, family);
 986         if (!key)
 987                 return -ENOENT;
 988         hlist_del_rcu(&key->node);
 989         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
 990         kfree_rcu(key, rcu);
 991         return 0;
 992 }
 993 EXPORT_SYMBOL(tcp_md5_do_del);
 994
 995 static void tcp_clear_md5_list(struct sock *sk)
 996 {
 997         struct tcp_sock *tp = tcp_sk(sk);
 998         struct tcp_md5sig_key *key;
 999         struct hlist_node *n;
1000         struct tcp_md5sig_info *md5sig;
1001
1002         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1003
1004         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1005                 hlist_del_rcu(&key->node);
1006                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1007                 kfree_rcu(key, rcu);
1008         }
1009 }
1010
1011 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1012                                  int optlen)
1013 {
1014         struct tcp_md5sig cmd;
1015         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1016
1017         if (optlen < sizeof(cmd))
1018                 return -EINVAL;
1019
1020         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1021                 return -EFAULT;
1022
1023         if (sin->sin_family != AF_INET)
1024                 return -EINVAL;
1025
1026         if (!cmd.tcpm_keylen)
1027                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1028                                       AF_INET);
1029
1030         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1031                 return -EINVAL;
1032
1033         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1034                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1035                               GFP_KERNEL);
1036 }
1037
1038 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1039                                         __be32 daddr, __be32 saddr, int nbytes)
1040 {
1041         struct tcp4_pseudohdr *bp;
1042         struct scatterlist sg;
1043
1044         bp = &hp->md5_blk.ip4;
1045
1046         /*
1047          * 1. the TCP pseudo-header (in the order: source IP address,
1048          * destination IP address, zero-padded protocol number, and
1049          * segment length)
1050          */
1051         bp->saddr = saddr;
1052         bp->daddr = daddr;
1053         bp->pad = 0;
1054         bp->protocol = IPPROTO_TCP;
1055         bp->len = cpu_to_be16(nbytes);
1056
1057         sg_init_one(&sg, bp, sizeof(*bp));
1058         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1059 }
1060
1061 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1062                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1063 {
1064         struct tcp_md5sig_pool *hp;
1065         struct hash_desc *desc;
1066
1067         hp = tcp_get_md5sig_pool();
1068         if (!hp)
1069                 goto clear_hash_noput;
1070         desc = &hp->md5_desc;
1071
1072         if (crypto_hash_init(desc))
1073                 goto clear_hash;
1074         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1075                 goto clear_hash;
1076         if (tcp_md5_hash_header(hp, th))
1077                 goto clear_hash;
1078         if (tcp_md5_hash_key(hp, key))
1079                 goto clear_hash;
1080         if (crypto_hash_final(desc, md5_hash))
1081                 goto clear_hash;
1082
1083         tcp_put_md5sig_pool();
1084         return 0;
1085
1086 clear_hash:
1087         tcp_put_md5sig_pool();
1088 clear_hash_noput:
1089         memset(md5_hash, 0, 16);
1090         return 1;
1091 }
1092
1093 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1094                         const struct sock *sk,
1095                         const struct sk_buff *skb)
1096 {
1097         struct tcp_md5sig_pool *hp;
1098         struct hash_desc *desc;
1099         const struct tcphdr *th = tcp_hdr(skb);
1100         __be32 saddr, daddr;
1101
1102         if (sk) { /* valid for establish/request sockets */
1103                 saddr = sk->sk_rcv_saddr;
1104                 daddr = sk->sk_daddr;
1105         } else {
1106                 const struct iphdr *iph = ip_hdr(skb);
1107                 saddr = iph->saddr;
1108                 daddr = iph->daddr;
1109         }
1110
1111         hp = tcp_get_md5sig_pool();
1112         if (!hp)
1113                 goto clear_hash_noput;
1114         desc = &hp->md5_desc;
1115
1116         if (crypto_hash_init(desc))
1117                 goto clear_hash;
1118
1119         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1120                 goto clear_hash;
1121         if (tcp_md5_hash_header(hp, th))
1122                 goto clear_hash;
1123         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1124                 goto clear_hash;
1125         if (tcp_md5_hash_key(hp, key))
1126                 goto clear_hash;
1127         if (crypto_hash_final(desc, md5_hash))
1128                 goto clear_hash;
1129
1130         tcp_put_md5sig_pool();
1131         return 0;
1132
1133 clear_hash:
1134         tcp_put_md5sig_pool();
1135 clear_hash_noput:
1136         memset(md5_hash, 0, 16);
1137         return 1;
1138 }
1139 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1140
1141 #endif
1142
1143 /* Called with rcu_read_lock() */
1144 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1145                                     const struct sk_buff *skb)
1146 {
1147 #ifdef CONFIG_TCP_MD5SIG
1148         /*
1149          * This gets called for each TCP segment that arrives
1150          * so we want to be efficient.
1151          * We have 3 drop cases:
1152          * o No MD5 hash and one expected.
1153          * o MD5 hash and we're not expecting one.
1154          * o MD5 hash and its wrong.
1155          */
1156         const __u8 *hash_location = NULL;
1157         struct tcp_md5sig_key *hash_expected;
1158         const struct iphdr *iph = ip_hdr(skb);
1159         const struct tcphdr *th = tcp_hdr(skb);
1160         int genhash;
1161         unsigned char newhash[16];
1162
1163         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1164                                           AF_INET);
1165         hash_location = tcp_parse_md5sig_option(th);
1166
1167         /* We've parsed the options - do we have a hash? */
1168         if (!hash_expected && !hash_location)
1169                 return false;
1170
1171         if (hash_expected && !hash_location) {
1172                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1173                 return true;
1174         }
1175
1176         if (!hash_expected && hash_location) {
1177                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1178                 return true;
1179         }
1180
1181         /* Okay, so this is hash_expected and hash_location -
1182          * so we need to calculate the checksum.
1183          */
1184         genhash = tcp_v4_md5_hash_skb(newhash,
1185                                       hash_expected,
1186                                       NULL, skb);
1187
1188         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1189                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1190                                      &iph->saddr, ntohs(th->source),
1191                                      &iph->daddr, ntohs(th->dest),
1192                                      genhash ? " tcp_v4_calc_md5_hash failed"
1193                                      : "");
1194                 return true;
1195         }
1196         return false;
1197 #endif
1198         return false;
1199 }
1200
1201 static void tcp_v4_init_req(struct request_sock *req,
1202                             const struct sock *sk_listener,
1203                             struct sk_buff *skb)
1204 {
1205         struct inet_request_sock *ireq = inet_rsk(req);
1206
1207         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1208         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1209         ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1210         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(skb));
1211 }
1212
1213 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1214                                           struct flowi *fl,
1215                                           const struct request_sock *req,
1216                                           bool *strict)
1217 {
1218         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1219
1220         if (strict) {
1221                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1222                         *strict = true;
1223                 else
1224                         *strict = false;
1225         }
1226
1227         return dst;
1228 }
1229
1230 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1231         .family         =       PF_INET,
1232         .obj_size       =       sizeof(struct tcp_request_sock),
1233         .rtx_syn_ack    =       tcp_rtx_synack,
1234         .send_ack       =       tcp_v4_reqsk_send_ack,
1235         .destructor     =       tcp_v4_reqsk_destructor,
1236         .send_reset     =       tcp_v4_send_reset,
1237         .syn_ack_timeout =      tcp_syn_ack_timeout,
1238 };
1239
1240 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1241         .mss_clamp      =       TCP_MSS_DEFAULT,
1242 #ifdef CONFIG_TCP_MD5SIG
1243         .req_md5_lookup =       tcp_v4_md5_lookup,
1244         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1245 #endif
1246         .init_req       =       tcp_v4_init_req,
1247 #ifdef CONFIG_SYN_COOKIES
1248         .cookie_init_seq =      cookie_v4_init_sequence,
1249 #endif
1250         .route_req      =       tcp_v4_route_req,
1251         .init_seq       =       tcp_v4_init_sequence,
1252         .send_synack    =       tcp_v4_send_synack,
1253 };
1254
1255 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1256 {
1257         /* Never answer to SYNs send to broadcast or multicast */
1258         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1259                 goto drop;
1260
1261         return tcp_conn_request(&tcp_request_sock_ops,
1262                                 &tcp_request_sock_ipv4_ops, sk, skb);
1263
1264 drop:
1265         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1266         return 0;
1267 }
1268 EXPORT_SYMBOL(tcp_v4_conn_request);
1269
1270
1271 /*
1272  * The three way handshake has completed - we got a valid synack -
1273  * now create the new socket.
1274  */
1275 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1276                                   struct request_sock *req,
1277                                   struct dst_entry *dst,
1278                                   struct request_sock *req_unhash,
1279                                   bool *own_req)
1280 {
1281         struct inet_request_sock *ireq;
1282         struct inet_sock *newinet;
1283         struct tcp_sock *newtp;
1284         struct sock *newsk;
1285 #ifdef CONFIG_TCP_MD5SIG
1286         struct tcp_md5sig_key *key;
1287 #endif
1288         struct ip_options_rcu *inet_opt;
1289
1290         if (sk_acceptq_is_full(sk))
1291                 goto exit_overflow;
1292
1293         newsk = tcp_create_openreq_child(sk, req, skb);
1294         if (!newsk)
1295                 goto exit_nonewsk;
1296
1297         newsk->sk_gso_type = SKB_GSO_TCPV4;
1298         inet_sk_rx_dst_set(newsk, skb);
1299
1300         newtp                 = tcp_sk(newsk);
1301         newinet               = inet_sk(newsk);
1302         ireq                  = inet_rsk(req);
1303         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1304         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1305         newinet->inet_saddr   = ireq->ir_loc_addr;
1306         inet_opt              = rcu_dereference(ireq->ireq_opt);
1307         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1308         newinet->mc_index     = inet_iif(skb);
1309         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1310         newinet->rcv_tos      = ip_hdr(skb)->tos;
1311         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1312         if (inet_opt)
1313                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1314         newinet->inet_id = prandom_u32();
1315
1316         if (!dst) {
1317                 dst = inet_csk_route_child_sock(sk, newsk, req);
1318                 if (!dst)
1319                         goto put_and_exit;
1320         } else {
1321                 /* syncookie case : see end of cookie_v4_check() */
1322         }
1323         sk_setup_caps(newsk, dst);
1324
1325         tcp_ca_openreq_child(newsk, dst);
1326
1327         tcp_sync_mss(newsk, dst_mtu(dst));
1328         newtp->advmss = dst_metric_advmss(dst);
1329         if (tcp_sk(sk)->rx_opt.user_mss &&
1330             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1331                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1332
1333         tcp_initialize_rcv_mss(newsk);
1334
1335 #ifdef CONFIG_TCP_MD5SIG
1336         /* Copy over the MD5 key from the original socket */
1337         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1338                                 AF_INET);
1339         if (key) {
1340                 /*
1341                  * We're using one, so create a matching key
1342                  * on the newsk structure. If we fail to get
1343                  * memory, then we end up not copying the key
1344                  * across. Shucks.
1345                  */
1346                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1347                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1348                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1349         }
1350 #endif
1351
1352         if (__inet_inherit_port(sk, newsk) < 0)
1353                 goto put_and_exit;
1354         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1355         if (likely(*own_req)) {
1356                 tcp_move_syn(newtp, req);
1357                 ireq->ireq_opt = NULL;
1358         } else {
1359                 newinet->inet_opt = NULL;
1360         }
1361         return newsk;
1362
1363 exit_overflow:
1364         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1365 exit_nonewsk:
1366         dst_release(dst);
1367 exit:
1368         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1369         return NULL;
1370 put_and_exit:
1371         newinet->inet_opt = NULL;
1372         inet_csk_prepare_forced_close(newsk);
1373         tcp_done(newsk);
1374         goto exit;
1375 }
1376 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1377
1378 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1379 {
1380 #ifdef CONFIG_SYN_COOKIES
1381         const struct tcphdr *th = tcp_hdr(skb);
1382
1383         if (!th->syn)
1384                 sk = cookie_v4_check(sk, skb);
1385 #endif
1386         return sk;
1387 }
1388
1389 /* The socket must have it's spinlock held when we get
1390  * here, unless it is a TCP_LISTEN socket.
1391  *
1392  * We have a potential double-lock case here, so even when
1393  * doing backlog processing we use the BH locking scheme.
1394  * This is because we cannot sleep with the original spinlock
1395  * held.
1396  */
1397 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1398 {
1399         struct sock *rsk;
1400
1401         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1402                 struct dst_entry *dst = sk->sk_rx_dst;
1403
1404                 sock_rps_save_rxhash(sk, skb);
1405                 sk_mark_napi_id(sk, skb);
1406                 if (dst) {
1407                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1408                             !dst->ops->check(dst, 0)) {
1409                                 dst_release(dst);
1410                                 sk->sk_rx_dst = NULL;
1411                         }
1412                 }
1413                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1414                 return 0;
1415         }
1416
1417         if (tcp_checksum_complete(skb))
1418                 goto csum_err;
1419
1420         if (sk->sk_state == TCP_LISTEN) {
1421                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1422
1423                 if (!nsk)
1424                         goto discard;
1425                 if (nsk != sk) {
1426                         sock_rps_save_rxhash(nsk, skb);
1427                         sk_mark_napi_id(nsk, skb);
1428                         if (tcp_child_process(sk, nsk, skb)) {
1429                                 rsk = nsk;
1430                                 goto reset;
1431                         }
1432                         return 0;
1433                 }
1434         } else
1435                 sock_rps_save_rxhash(sk, skb);
1436
1437         if (tcp_rcv_state_process(sk, skb)) {
1438                 rsk = sk;
1439                 goto reset;
1440         }
1441         return 0;
1442
1443 reset:
1444         tcp_v4_send_reset(rsk, skb);
1445 discard:
1446         kfree_skb(skb);
1447         /* Be careful here. If this function gets more complicated and
1448          * gcc suffers from register pressure on the x86, sk (in %ebx)
1449          * might be destroyed here. This current version compiles correctly,
1450          * but you have been warned.
1451          */
1452         return 0;
1453
1454 csum_err:
1455         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1456         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1457         goto discard;
1458 }
1459 EXPORT_SYMBOL(tcp_v4_do_rcv);
1460
1461 void tcp_v4_early_demux(struct sk_buff *skb)
1462 {
1463         const struct iphdr *iph;
1464         const struct tcphdr *th;
1465         struct sock *sk;
1466
1467         if (skb->pkt_type != PACKET_HOST)
1468                 return;
1469
1470         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1471                 return;
1472
1473         iph = ip_hdr(skb);
1474         th = tcp_hdr(skb);
1475
1476         if (th->doff < sizeof(struct tcphdr) / 4)
1477                 return;
1478
1479         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1480                                        iph->saddr, th->source,
1481                                        iph->daddr, ntohs(th->dest),
1482                                        skb->skb_iif);
1483         if (sk) {
1484                 skb->sk = sk;
1485                 skb->destructor = sock_edemux;
1486                 if (sk_fullsock(sk)) {
1487                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1488
1489                         if (dst)
1490                                 dst = dst_check(dst, 0);
1491                         if (dst &&
1492                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1493                                 skb_dst_set_noref(skb, dst);
1494                 }
1495         }
1496 }
1497
1498 /* Packet is added to VJ-style prequeue for processing in process
1499  * context, if a reader task is waiting. Apparently, this exciting
1500  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1501  * failed somewhere. Latency? Burstiness? Well, at least now we will
1502  * see, why it failed. 8)8)                               --ANK
1503  *
1504  */
1505 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1506 {
1507         struct tcp_sock *tp = tcp_sk(sk);
1508
1509         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1510                 return false;
1511
1512         if (skb->len <= tcp_hdrlen(skb) &&
1513             skb_queue_len(&tp->ucopy.prequeue) == 0)
1514                 return false;
1515
1516         /* Before escaping RCU protected region, we need to take care of skb
1517          * dst. Prequeue is only enabled for established sockets.
1518          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1519          * Instead of doing full sk_rx_dst validity here, let's perform
1520          * an optimistic check.
1521          */
1522         if (likely(sk->sk_rx_dst))
1523                 skb_dst_drop(skb);
1524         else
1525                 skb_dst_force_safe(skb);
1526
1527         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1528         tp->ucopy.memory += skb->truesize;
1529         if (tp->ucopy.memory > sk->sk_rcvbuf) {
1530                 struct sk_buff *skb1;
1531
1532                 BUG_ON(sock_owned_by_user(sk));
1533
1534                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1535                         sk_backlog_rcv(sk, skb1);
1536                         NET_INC_STATS_BH(sock_net(sk),
1537                                          LINUX_MIB_TCPPREQUEUEDROPPED);
1538                 }
1539
1540                 tp->ucopy.memory = 0;
1541         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1542                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1543                                            POLLIN | POLLRDNORM | POLLRDBAND);
1544                 if (!inet_csk_ack_scheduled(sk))
1545                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1546                                                   (3 * tcp_rto_min(sk)) / 4,
1547                                                   TCP_RTO_MAX);
1548         }
1549         return true;
1550 }
1551 EXPORT_SYMBOL(tcp_prequeue);
1552
1553 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1554 {
1555         struct tcphdr *th = (struct tcphdr *)skb->data;
1556         unsigned int eaten = skb->len;
1557         int err;
1558
1559         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1560         if (!err) {
1561                 eaten -= skb->len;
1562                 TCP_SKB_CB(skb)->end_seq -= eaten;
1563         }
1564         return err;
1565 }
1566 EXPORT_SYMBOL(tcp_filter);
1567
1568 /*
1569  *      From tcp_input.c
1570  */
1571
1572 int tcp_v4_rcv(struct sk_buff *skb)
1573 {
1574         const struct iphdr *iph;
1575         const struct tcphdr *th;
1576         struct sock *sk;
1577         int ret;
1578         struct net *net = dev_net(skb->dev);
1579
1580         if (skb->pkt_type != PACKET_HOST)
1581                 goto discard_it;
1582
1583         /* Count it even if it's bad */
1584         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1585
1586         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1587                 goto discard_it;
1588
1589         th = tcp_hdr(skb);
1590
1591         if (th->doff < sizeof(struct tcphdr) / 4)
1592                 goto bad_packet;
1593         if (!pskb_may_pull(skb, th->doff * 4))
1594                 goto discard_it;
1595
1596         /* An explanation is required here, I think.
1597          * Packet length and doff are validated by header prediction,
1598          * provided case of th->doff==0 is eliminated.
1599          * So, we defer the checks. */
1600
1601         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1602                 goto csum_error;
1603
1604         th = tcp_hdr(skb);
1605         iph = ip_hdr(skb);
1606         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1607          * barrier() makes sure compiler wont play fool^Waliasing games.
1608          */
1609         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1610                 sizeof(struct inet_skb_parm));
1611         barrier();
1612
1613         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1614         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1615                                     skb->len - th->doff * 4);
1616         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1617         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1618         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1619         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1620         TCP_SKB_CB(skb)->sacked  = 0;
1621
1622 lookup:
1623         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1624         if (!sk)
1625                 goto no_tcp_socket;
1626
1627 process:
1628         if (sk->sk_state == TCP_TIME_WAIT)
1629                 goto do_time_wait;
1630
1631         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1632                 struct request_sock *req = inet_reqsk(sk);
1633                 struct sock *nsk;
1634
1635                 sk = req->rsk_listener;
1636                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1637                         reqsk_put(req);
1638                         goto discard_it;
1639                 }
1640                 if (tcp_checksum_complete(skb)) {
1641                         reqsk_put(req);
1642                         goto csum_error;
1643                 }
1644                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1645                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1646                         goto lookup;
1647                 }
1648                 sock_hold(sk);
1649                 nsk = tcp_check_req(sk, skb, req, false);
1650                 if (!nsk) {
1651                         reqsk_put(req);
1652                         goto discard_and_relse;
1653                 }
1654                 if (nsk == sk) {
1655                         reqsk_put(req);
1656                 } else if (tcp_child_process(sk, nsk, skb)) {
1657                         tcp_v4_send_reset(nsk, skb);
1658                         goto discard_and_relse;
1659                 } else {
1660                         sock_put(sk);
1661                         return 0;
1662                 }
1663         }
1664         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1665                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1666                 goto discard_and_relse;
1667         }
1668
1669         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1670                 goto discard_and_relse;
1671
1672         if (tcp_v4_inbound_md5_hash(sk, skb))
1673                 goto discard_and_relse;
1674
1675         nf_reset(skb);
1676
1677         if (tcp_filter(sk, skb))
1678                 goto discard_and_relse;
1679         th = (const struct tcphdr *)skb->data;
1680         iph = ip_hdr(skb);
1681
1682         skb->dev = NULL;
1683
1684         if (sk->sk_state == TCP_LISTEN) {
1685                 ret = tcp_v4_do_rcv(sk, skb);
1686                 goto put_and_return;
1687         }
1688
1689         sk_incoming_cpu_update(sk);
1690
1691         bh_lock_sock_nested(sk);
1692         tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1693         ret = 0;
1694         if (!sock_owned_by_user(sk)) {
1695                 if (!tcp_prequeue(sk, skb))
1696                         ret = tcp_v4_do_rcv(sk, skb);
1697         } else if (unlikely(sk_add_backlog(sk, skb,
1698                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1699                 bh_unlock_sock(sk);
1700                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1701                 goto discard_and_relse;
1702         }
1703         bh_unlock_sock(sk);
1704
1705 put_and_return:
1706         sock_put(sk);
1707
1708         return ret;
1709
1710 no_tcp_socket:
1711         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1712                 goto discard_it;
1713
1714         if (tcp_checksum_complete(skb)) {
1715 csum_error:
1716                 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1717 bad_packet:
1718                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1719         } else {
1720                 tcp_v4_send_reset(NULL, skb);
1721         }
1722
1723 discard_it:
1724         /* Discard frame. */
1725         kfree_skb(skb);
1726         return 0;
1727
1728 discard_and_relse:
1729         sk_drops_add(sk, skb);
1730         sock_put(sk);
1731         goto discard_it;
1732
1733 do_time_wait:
1734         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1735                 inet_twsk_put(inet_twsk(sk));
1736                 goto discard_it;
1737         }
1738
1739         if (tcp_checksum_complete(skb)) {
1740                 inet_twsk_put(inet_twsk(sk));
1741                 goto csum_error;
1742         }
1743         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1744         case TCP_TW_SYN: {
1745                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1746                                                         &tcp_hashinfo,
1747                                                         iph->saddr, th->source,
1748                                                         iph->daddr, th->dest,
1749                                                         inet_iif(skb));
1750                 if (sk2) {
1751                         inet_twsk_deschedule_put(inet_twsk(sk));
1752                         sk = sk2;
1753                         goto process;
1754                 }
1755                 /* Fall through to ACK */
1756         }
1757         case TCP_TW_ACK:
1758                 tcp_v4_timewait_ack(sk, skb);
1759                 break;
1760         case TCP_TW_RST:
1761                 goto no_tcp_socket;
1762         case TCP_TW_SUCCESS:;
1763         }
1764         goto discard_it;
1765 }
1766
1767 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1768         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1769         .twsk_unique    = tcp_twsk_unique,
1770         .twsk_destructor= tcp_twsk_destructor,
1771 };
1772
1773 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1774 {
1775         struct dst_entry *dst = skb_dst(skb);
1776
1777         if (dst && dst_hold_safe(dst)) {
1778                 sk->sk_rx_dst = dst;
1779                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1780         }
1781 }
1782 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1783
1784 const struct inet_connection_sock_af_ops ipv4_specific = {
1785         .queue_xmit        = ip_queue_xmit,
1786         .send_check        = tcp_v4_send_check,
1787         .rebuild_header    = inet_sk_rebuild_header,
1788         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1789         .conn_request      = tcp_v4_conn_request,
1790         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1791         .net_header_len    = sizeof(struct iphdr),
1792         .setsockopt        = ip_setsockopt,
1793         .getsockopt        = ip_getsockopt,
1794         .addr2sockaddr     = inet_csk_addr2sockaddr,
1795         .sockaddr_len      = sizeof(struct sockaddr_in),
1796         .bind_conflict     = inet_csk_bind_conflict,
1797 #ifdef CONFIG_COMPAT
1798         .compat_setsockopt = compat_ip_setsockopt,
1799         .compat_getsockopt = compat_ip_getsockopt,
1800 #endif
1801         .mtu_reduced       = tcp_v4_mtu_reduced,
1802 };
1803 EXPORT_SYMBOL(ipv4_specific);
1804
1805 #ifdef CONFIG_TCP_MD5SIG
1806 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1807         .md5_lookup             = tcp_v4_md5_lookup,
1808         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1809         .md5_parse              = tcp_v4_parse_md5_keys,
1810 };
1811 #endif
1812
1813 /* NOTE: A lot of things set to zero explicitly by call to
1814  *       sk_alloc() so need not be done here.
1815  */
1816 static int tcp_v4_init_sock(struct sock *sk)
1817 {
1818         struct inet_connection_sock *icsk = inet_csk(sk);
1819
1820         tcp_init_sock(sk);
1821
1822         icsk->icsk_af_ops = &ipv4_specific;
1823
1824 #ifdef CONFIG_TCP_MD5SIG
1825         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1826 #endif
1827
1828         return 0;
1829 }
1830
1831 void tcp_v4_destroy_sock(struct sock *sk)
1832 {
1833         struct tcp_sock *tp = tcp_sk(sk);
1834
1835         tcp_clear_xmit_timers(sk);
1836
1837         tcp_cleanup_congestion_control(sk);
1838
1839         /* Cleanup up the write buffer. */
1840         tcp_write_queue_purge(sk);
1841
1842         /* Cleans up our, hopefully empty, out_of_order_queue. */
1843         skb_rbtree_purge(&tp->out_of_order_queue);
1844
1845 #ifdef CONFIG_TCP_MD5SIG
1846         /* Clean up the MD5 key list, if any */
1847         if (tp->md5sig_info) {
1848                 tcp_clear_md5_list(sk);
1849                 kfree_rcu(tp->md5sig_info, rcu);
1850                 tp->md5sig_info = NULL;
1851         }
1852 #endif
1853
1854         /* Clean prequeue, it must be empty really */
1855         __skb_queue_purge(&tp->ucopy.prequeue);
1856
1857         /* Clean up a referenced TCP bind bucket. */
1858         if (inet_csk(sk)->icsk_bind_hash)
1859                 inet_put_port(sk);
1860
1861         BUG_ON(tp->fastopen_rsk);
1862
1863         /* If socket is aborted during connect operation */
1864         tcp_free_fastopen_req(tp);
1865         tcp_saved_syn_free(tp);
1866
1867         sk_sockets_allocated_dec(sk);
1868         sock_release_memcg(sk);
1869 }
1870 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1871
1872 #ifdef CONFIG_PROC_FS
1873 /* Proc filesystem TCP sock list dumping. */
1874
1875 /*
1876  * Get next listener socket follow cur.  If cur is NULL, get first socket
1877  * starting from bucket given in st->bucket; when st->bucket is zero the
1878  * very first socket in the hash table is returned.
1879  */
1880 static void *listening_get_next(struct seq_file *seq, void *cur)
1881 {
1882         struct inet_connection_sock *icsk;
1883         struct hlist_nulls_node *node;
1884         struct sock *sk = cur;
1885         struct inet_listen_hashbucket *ilb;
1886         struct tcp_iter_state *st = seq->private;
1887         struct net *net = seq_file_net(seq);
1888
1889         if (!sk) {
1890                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1891                 spin_lock_bh(&ilb->lock);
1892                 sk = sk_nulls_head(&ilb->head);
1893                 st->offset = 0;
1894                 goto get_sk;
1895         }
1896         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1897         ++st->num;
1898         ++st->offset;
1899
1900         sk = sk_nulls_next(sk);
1901 get_sk:
1902         sk_nulls_for_each_from(sk, node) {
1903                 if (!net_eq(sock_net(sk), net))
1904                         continue;
1905                 if (sk->sk_family == st->family) {
1906                         cur = sk;
1907                         goto out;
1908                 }
1909                 icsk = inet_csk(sk);
1910         }
1911         spin_unlock_bh(&ilb->lock);
1912         st->offset = 0;
1913         if (++st->bucket < INET_LHTABLE_SIZE) {
1914                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1915                 spin_lock_bh(&ilb->lock);
1916                 sk = sk_nulls_head(&ilb->head);
1917                 goto get_sk;
1918         }
1919         cur = NULL;
1920 out:
1921         return cur;
1922 }
1923
1924 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1925 {
1926         struct tcp_iter_state *st = seq->private;
1927         void *rc;
1928
1929         st->bucket = 0;
1930         st->offset = 0;
1931         rc = listening_get_next(seq, NULL);
1932
1933         while (rc && *pos) {
1934                 rc = listening_get_next(seq, rc);
1935                 --*pos;
1936         }
1937         return rc;
1938 }
1939
1940 static inline bool empty_bucket(const struct tcp_iter_state *st)
1941 {
1942         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1943 }
1944
1945 /*
1946  * Get first established socket starting from bucket given in st->bucket.
1947  * If st->bucket is zero, the very first socket in the hash is returned.
1948  */
1949 static void *established_get_first(struct seq_file *seq)
1950 {
1951         struct tcp_iter_state *st = seq->private;
1952         struct net *net = seq_file_net(seq);
1953         void *rc = NULL;
1954
1955         st->offset = 0;
1956         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1957                 struct sock *sk;
1958                 struct hlist_nulls_node *node;
1959                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1960
1961                 /* Lockless fast path for the common case of empty buckets */
1962                 if (empty_bucket(st))
1963                         continue;
1964
1965                 spin_lock_bh(lock);
1966                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1967                         if (sk->sk_family != st->family ||
1968                             !net_eq(sock_net(sk), net)) {
1969                                 continue;
1970                         }
1971                         rc = sk;
1972                         goto out;
1973                 }
1974                 spin_unlock_bh(lock);
1975         }
1976 out:
1977         return rc;
1978 }
1979
1980 static void *established_get_next(struct seq_file *seq, void *cur)
1981 {
1982         struct sock *sk = cur;
1983         struct hlist_nulls_node *node;
1984         struct tcp_iter_state *st = seq->private;
1985         struct net *net = seq_file_net(seq);
1986
1987         ++st->num;
1988         ++st->offset;
1989
1990         sk = sk_nulls_next(sk);
1991
1992         sk_nulls_for_each_from(sk, node) {
1993                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1994                         return sk;
1995         }
1996
1997         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1998         ++st->bucket;
1999         return established_get_first(seq);
2000 }
2001
2002 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2003 {
2004         struct tcp_iter_state *st = seq->private;
2005         void *rc;
2006
2007         st->bucket = 0;
2008         rc = established_get_first(seq);
2009
2010         while (rc && pos) {
2011                 rc = established_get_next(seq, rc);
2012                 --pos;
2013         }
2014         return rc;
2015 }
2016
2017 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2018 {
2019         void *rc;
2020         struct tcp_iter_state *st = seq->private;
2021
2022         st->state = TCP_SEQ_STATE_LISTENING;
2023         rc        = listening_get_idx(seq, &pos);
2024
2025         if (!rc) {
2026                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2027                 rc        = established_get_idx(seq, pos);
2028         }
2029
2030         return rc;
2031 }
2032
2033 static void *tcp_seek_last_pos(struct seq_file *seq)
2034 {
2035         struct tcp_iter_state *st = seq->private;
2036         int bucket = st->bucket;
2037         int offset = st->offset;
2038         int orig_num = st->num;
2039         void *rc = NULL;
2040
2041         switch (st->state) {
2042         case TCP_SEQ_STATE_LISTENING:
2043                 if (st->bucket >= INET_LHTABLE_SIZE)
2044                         break;
2045                 st->state = TCP_SEQ_STATE_LISTENING;
2046                 rc = listening_get_next(seq, NULL);
2047                 while (offset-- && rc && bucket == st->bucket)
2048                         rc = listening_get_next(seq, rc);
2049                 if (rc)
2050                         break;
2051                 st->bucket = 0;
2052                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2053                 /* Fallthrough */
2054         case TCP_SEQ_STATE_ESTABLISHED:
2055                 if (st->bucket > tcp_hashinfo.ehash_mask)
2056                         break;
2057                 rc = established_get_first(seq);
2058                 while (offset-- && rc && bucket == st->bucket)
2059                         rc = established_get_next(seq, rc);
2060         }
2061
2062         st->num = orig_num;
2063
2064         return rc;
2065 }
2066
2067 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2068 {
2069         struct tcp_iter_state *st = seq->private;
2070         void *rc;
2071
2072         if (*pos && *pos == st->last_pos) {
2073                 rc = tcp_seek_last_pos(seq);
2074                 if (rc)
2075                         goto out;
2076         }
2077
2078         st->state = TCP_SEQ_STATE_LISTENING;
2079         st->num = 0;
2080         st->bucket = 0;
2081         st->offset = 0;
2082         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2083
2084 out:
2085         st->last_pos = *pos;
2086         return rc;
2087 }
2088
2089 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2090 {
2091         struct tcp_iter_state *st = seq->private;
2092         void *rc = NULL;
2093
2094         if (v == SEQ_START_TOKEN) {
2095                 rc = tcp_get_idx(seq, 0);
2096                 goto out;
2097         }
2098
2099         switch (st->state) {
2100         case TCP_SEQ_STATE_LISTENING:
2101                 rc = listening_get_next(seq, v);
2102                 if (!rc) {
2103                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2104                         st->bucket = 0;
2105                         st->offset = 0;
2106                         rc        = established_get_first(seq);
2107                 }
2108                 break;
2109         case TCP_SEQ_STATE_ESTABLISHED:
2110                 rc = established_get_next(seq, v);
2111                 break;
2112         }
2113 out:
2114         ++*pos;
2115         st->last_pos = *pos;
2116         return rc;
2117 }
2118
2119 static void tcp_seq_stop(struct seq_file *seq, void *v)
2120 {
2121         struct tcp_iter_state *st = seq->private;
2122
2123         switch (st->state) {
2124         case TCP_SEQ_STATE_LISTENING:
2125                 if (v != SEQ_START_TOKEN)
2126                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2127                 break;
2128         case TCP_SEQ_STATE_ESTABLISHED:
2129                 if (v)
2130                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2131                 break;
2132         }
2133 }
2134
2135 int tcp_seq_open(struct inode *inode, struct file *file)
2136 {
2137         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2138         struct tcp_iter_state *s;
2139         int err;
2140
2141         err = seq_open_net(inode, file, &afinfo->seq_ops,
2142                           sizeof(struct tcp_iter_state));
2143         if (err < 0)
2144                 return err;
2145
2146         s = ((struct seq_file *)file->private_data)->private;
2147         s->family               = afinfo->family;
2148         s->last_pos             = 0;
2149         return 0;
2150 }
2151 EXPORT_SYMBOL(tcp_seq_open);
2152
2153 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2154 {
2155         int rc = 0;
2156         struct proc_dir_entry *p;
2157
2158         afinfo->seq_ops.start           = tcp_seq_start;
2159         afinfo->seq_ops.next            = tcp_seq_next;
2160         afinfo->seq_ops.stop            = tcp_seq_stop;
2161
2162         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2163                              afinfo->seq_fops, afinfo);
2164         if (!p)
2165                 rc = -ENOMEM;
2166         return rc;
2167 }
2168 EXPORT_SYMBOL(tcp_proc_register);
2169
2170 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2171 {
2172         remove_proc_entry(afinfo->name, net->proc_net);
2173 }
2174 EXPORT_SYMBOL(tcp_proc_unregister);
2175
2176 static void get_openreq4(const struct request_sock *req,
2177                          struct seq_file *f, int i)
2178 {
2179         const struct inet_request_sock *ireq = inet_rsk(req);
2180         long delta = req->rsk_timer.expires - jiffies;
2181
2182         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2183                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2184                 i,
2185                 ireq->ir_loc_addr,
2186                 ireq->ir_num,
2187                 ireq->ir_rmt_addr,
2188                 ntohs(ireq->ir_rmt_port),
2189                 TCP_SYN_RECV,
2190                 0, 0, /* could print option size, but that is af dependent. */
2191                 1,    /* timers active (only the expire timer) */
2192                 jiffies_delta_to_clock_t(delta),
2193                 req->num_timeout,
2194                 from_kuid_munged(seq_user_ns(f),
2195                                  sock_i_uid(req->rsk_listener)),
2196                 0,  /* non standard timer */
2197                 0, /* open_requests have no inode */
2198                 0,
2199                 req);
2200 }
2201
2202 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2203 {
2204         int timer_active;
2205         unsigned long timer_expires;
2206         const struct tcp_sock *tp = tcp_sk(sk);
2207         const struct inet_connection_sock *icsk = inet_csk(sk);
2208         const struct inet_sock *inet = inet_sk(sk);
2209         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2210         __be32 dest = inet->inet_daddr;
2211         __be32 src = inet->inet_rcv_saddr;
2212         __u16 destp = ntohs(inet->inet_dport);
2213         __u16 srcp = ntohs(inet->inet_sport);
2214         int rx_queue;
2215         int state;
2216
2217         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2218             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2219             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2220                 timer_active    = 1;
2221                 timer_expires   = icsk->icsk_timeout;
2222         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2223                 timer_active    = 4;
2224                 timer_expires   = icsk->icsk_timeout;
2225         } else if (timer_pending(&sk->sk_timer)) {
2226                 timer_active    = 2;
2227                 timer_expires   = sk->sk_timer.expires;
2228         } else {
2229                 timer_active    = 0;
2230                 timer_expires = jiffies;
2231         }
2232
2233         state = sk_state_load(sk);
2234         if (state == TCP_LISTEN)
2235                 rx_queue = sk->sk_ack_backlog;
2236         else
2237                 /* Because we don't lock the socket,
2238                  * we might find a transient negative value.
2239                  */
2240                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2241
2242         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2243                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2244                 i, src, srcp, dest, destp, state,
2245                 tp->write_seq - tp->snd_una,
2246                 rx_queue,
2247                 timer_active,
2248                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2249                 icsk->icsk_retransmits,
2250                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2251                 icsk->icsk_probes_out,
2252                 sock_i_ino(sk),
2253                 atomic_read(&sk->sk_refcnt), sk,
2254                 jiffies_to_clock_t(icsk->icsk_rto),
2255                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2256                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2257                 tp->snd_cwnd,
2258                 state == TCP_LISTEN ?
2259                     fastopenq->max_qlen :
2260                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2261 }
2262
2263 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2264                                struct seq_file *f, int i)
2265 {
2266         long delta = tw->tw_timer.expires - jiffies;
2267         __be32 dest, src;
2268         __u16 destp, srcp;
2269
2270         dest  = tw->tw_daddr;
2271         src   = tw->tw_rcv_saddr;
2272         destp = ntohs(tw->tw_dport);
2273         srcp  = ntohs(tw->tw_sport);
2274
2275         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2276                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2277                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2278                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2279                 atomic_read(&tw->tw_refcnt), tw);
2280 }
2281
2282 #define TMPSZ 150
2283
2284 static int tcp4_seq_show(struct seq_file *seq, void *v)
2285 {
2286         struct tcp_iter_state *st;
2287         struct sock *sk = v;
2288
2289         seq_setwidth(seq, TMPSZ - 1);
2290         if (v == SEQ_START_TOKEN) {
2291                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2292                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2293                            "inode");
2294                 goto out;
2295         }
2296         st = seq->private;
2297
2298         if (sk->sk_state == TCP_TIME_WAIT)
2299                 get_timewait4_sock(v, seq, st->num);
2300         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2301                 get_openreq4(v, seq, st->num);
2302         else
2303                 get_tcp4_sock(v, seq, st->num);
2304 out:
2305         seq_pad(seq, '\n');
2306         return 0;
2307 }
2308
2309 static const struct file_operations tcp_afinfo_seq_fops = {
2310         .owner   = THIS_MODULE,
2311         .open    = tcp_seq_open,
2312         .read    = seq_read,
2313         .llseek  = seq_lseek,
2314         .release = seq_release_net
2315 };
2316
2317 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2318         .name           = "tcp",
2319         .family         = AF_INET,
2320         .seq_fops       = &tcp_afinfo_seq_fops,
2321         .seq_ops        = {
2322                 .show           = tcp4_seq_show,
2323         },
2324 };
2325
2326 static int __net_init tcp4_proc_init_net(struct net *net)
2327 {
2328         return tcp_proc_register(net, &tcp4_seq_afinfo);
2329 }
2330
2331 static void __net_exit tcp4_proc_exit_net(struct net *net)
2332 {
2333         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2334 }
2335
2336 static struct pernet_operations tcp4_net_ops = {
2337         .init = tcp4_proc_init_net,
2338         .exit = tcp4_proc_exit_net,
2339 };
2340
2341 int __init tcp4_proc_init(void)
2342 {
2343         return register_pernet_subsys(&tcp4_net_ops);
2344 }
2345
2346 void tcp4_proc_exit(void)
2347 {
2348         unregister_pernet_subsys(&tcp4_net_ops);
2349 }
2350 #endif /* CONFIG_PROC_FS */
2351
2352 struct proto tcp_prot = {
2353         .name                   = "TCP",
2354         .owner                  = THIS_MODULE,
2355         .close                  = tcp_close,
2356         .connect                = tcp_v4_connect,
2357         .disconnect             = tcp_disconnect,
2358         .accept                 = inet_csk_accept,
2359         .ioctl                  = tcp_ioctl,
2360         .init                   = tcp_v4_init_sock,
2361         .destroy                = tcp_v4_destroy_sock,
2362         .shutdown               = tcp_shutdown,
2363         .setsockopt             = tcp_setsockopt,
2364         .getsockopt             = tcp_getsockopt,
2365         .recvmsg                = tcp_recvmsg,
2366         .sendmsg                = tcp_sendmsg,
2367         .sendpage               = tcp_sendpage,
2368         .backlog_rcv            = tcp_v4_do_rcv,
2369         .release_cb             = tcp_release_cb,
2370         .hash                   = inet_hash,
2371         .unhash                 = inet_unhash,
2372         .get_port               = inet_csk_get_port,
2373         .enter_memory_pressure  = tcp_enter_memory_pressure,
2374         .stream_memory_free     = tcp_stream_memory_free,
2375         .sockets_allocated      = &tcp_sockets_allocated,
2376         .orphan_count           = &tcp_orphan_count,
2377         .memory_allocated       = &tcp_memory_allocated,
2378         .memory_pressure        = &tcp_memory_pressure,
2379         .sysctl_mem             = sysctl_tcp_mem,
2380         .sysctl_wmem            = sysctl_tcp_wmem,
2381         .sysctl_rmem            = sysctl_tcp_rmem,
2382         .max_header             = MAX_TCP_HEADER,
2383         .obj_size               = sizeof(struct tcp_sock),
2384         .slab_flags             = SLAB_DESTROY_BY_RCU,
2385         .twsk_prot              = &tcp_timewait_sock_ops,
2386         .rsk_prot               = &tcp_request_sock_ops,
2387         .h.hashinfo             = &tcp_hashinfo,
2388         .no_autobind            = true,
2389 #ifdef CONFIG_COMPAT
2390         .compat_setsockopt      = compat_tcp_setsockopt,
2391         .compat_getsockopt      = compat_tcp_getsockopt,
2392 #endif
2393 #ifdef CONFIG_MEMCG_KMEM
2394         .init_cgroup            = tcp_init_cgroup,
2395         .destroy_cgroup         = tcp_destroy_cgroup,
2396         .proto_cgroup           = tcp_proto_cgroup,
2397 #endif
2398 };
2399 EXPORT_SYMBOL(tcp_prot);
2400
2401 static void __net_exit tcp_sk_exit(struct net *net)
2402 {
2403         int cpu;
2404
2405         for_each_possible_cpu(cpu)
2406                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2407         free_percpu(net->ipv4.tcp_sk);
2408 }
2409
2410 static int __net_init tcp_sk_init(struct net *net)
2411 {
2412         int res, cpu;
2413
2414         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2415         if (!net->ipv4.tcp_sk)
2416                 return -ENOMEM;
2417
2418         for_each_possible_cpu(cpu) {
2419                 struct sock *sk;
2420
2421                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2422                                            IPPROTO_TCP, net);
2423                 if (res)
2424                         goto fail;
2425                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2426         }
2427
2428         net->ipv4.sysctl_tcp_ecn = 2;
2429         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2430
2431         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2432         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2433         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2434         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2435
2436         return 0;
2437 fail:
2438         tcp_sk_exit(net);
2439
2440         return res;
2441 }
2442
2443 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2444 {
2445         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2446 }
2447
2448 static struct pernet_operations __net_initdata tcp_sk_ops = {
2449        .init       = tcp_sk_init,
2450        .exit       = tcp_sk_exit,
2451        .exit_batch = tcp_sk_exit_batch,
2452 };
2453
2454 void __init tcp_v4_init(void)
2455 {
2456         inet_hashinfo_init(&tcp_hashinfo);
2457         if (register_pernet_subsys(&tcp_sk_ops))
2458                 panic("Failed to create the TCP control socket.\n");
2459 }