GNU Linux-libre 4.9.309-gnu1
net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
45  *                                      year-long coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86
87 int sysctl_tcp_tw_reuse __read_mostly;
88 int sysctl_tcp_low_latency __read_mostly;
89
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97
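/* Derive the initial sequence number for a passive connection from the
 * connection 4-tuple via secure_tcp_sequence_number(), so ISNs are hard
 * for off-path attackers to predict.
 */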
98 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
99 {
100         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
101                                           ip_hdr(skb)->saddr,
102                                           tcp_hdr(skb)->dest,
103                                           tcp_hdr(skb)->source);
104 }
105
106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
107 {
108         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
109         struct tcp_sock *tp = tcp_sk(sk);
110
111         /* With PAWS, it is safe from the viewpoint
112            of data integrity. Even without PAWS it is safe provided sequence
113            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
114
115            Actually, the idea is close to VJ's, only the timestamp cache is
116            held not per host but per port pair, and the TW bucket is used as
117            the state holder.
118
119            If the TW bucket has already been destroyed we fall back to VJ's
120            scheme and use the initial timestamp retrieved from the peer table.
121          */
122         if (tcptw->tw_ts_recent_stamp &&
123             (!twp || (sysctl_tcp_tw_reuse &&
124                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
125                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
126                 if (tp->write_seq == 0)
127                         tp->write_seq = 1;
128                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
129                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
130                 sock_hold(sktw);
131                 return 1;
132         }
133
134         return 0;
135 }
136 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
137
138 /* This will initiate an outgoing connection. */
139 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
140 {
141         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
142         struct inet_sock *inet = inet_sk(sk);
143         struct tcp_sock *tp = tcp_sk(sk);
144         __be16 orig_sport, orig_dport;
145         __be32 daddr, nexthop;
146         struct flowi4 *fl4;
147         struct rtable *rt;
148         int err;
149         struct ip_options_rcu *inet_opt;
150
151         if (addr_len < sizeof(struct sockaddr_in))
152                 return -EINVAL;
153
154         if (usin->sin_family != AF_INET)
155                 return -EAFNOSUPPORT;
156
157         nexthop = daddr = usin->sin_addr.s_addr;
158         inet_opt = rcu_dereference_protected(inet->inet_opt,
159                                              lockdep_sock_is_held(sk));
160         if (inet_opt && inet_opt->opt.srr) {
161                 if (!daddr)
162                         return -EINVAL;
163                 nexthop = inet_opt->opt.faddr;
164         }
165
166         orig_sport = inet->inet_sport;
167         orig_dport = usin->sin_port;
168         fl4 = &inet->cork.fl.u.ip4;
169         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
170                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
171                               IPPROTO_TCP,
172                               orig_sport, orig_dport, sk);
173         if (IS_ERR(rt)) {
174                 err = PTR_ERR(rt);
175                 if (err == -ENETUNREACH)
176                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
177                 return err;
178         }
179
180         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
181                 ip_rt_put(rt);
182                 return -ENETUNREACH;
183         }
184
185         if (!inet_opt || !inet_opt->opt.srr)
186                 daddr = fl4->daddr;
187
188         if (!inet->inet_saddr)
189                 inet->inet_saddr = fl4->saddr;
190         sk_rcv_saddr_set(sk, inet->inet_saddr);
191
192         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
193                 /* Reset inherited state */
194                 tp->rx_opt.ts_recent       = 0;
195                 tp->rx_opt.ts_recent_stamp = 0;
196                 if (likely(!tp->repair))
197                         tp->write_seq      = 0;
198         }
199
200         if (tcp_death_row.sysctl_tw_recycle &&
201             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
202                 tcp_fetch_timewait_stamp(sk, &rt->dst);
203
204         inet->inet_dport = usin->sin_port;
205         sk_daddr_set(sk, daddr);
206
207         inet_csk(sk)->icsk_ext_hdr_len = 0;
208         if (inet_opt)
209                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
210
211         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
212
213         /* Socket identity is still unknown (sport may be zero).
214          * However we set the state to SYN-SENT and, without releasing the
215          * socket lock, select a source port, enter ourselves into the hash
216          * tables and complete initialization after this.
217          */
218         tcp_set_state(sk, TCP_SYN_SENT);
219         err = inet_hash_connect(&tcp_death_row, sk);
220         if (err)
221                 goto failure;
222
223         sk_set_txhash(sk);
224
225         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
226                                inet->inet_sport, inet->inet_dport, sk);
227         if (IS_ERR(rt)) {
228                 err = PTR_ERR(rt);
229                 rt = NULL;
230                 goto failure;
231         }
232         /* OK, now commit destination to socket.  */
233         sk->sk_gso_type = SKB_GSO_TCPV4;
234         sk_setup_caps(sk, &rt->dst);
235
236         if (!tp->write_seq && likely(!tp->repair))
237                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
238                                                            inet->inet_daddr,
239                                                            inet->inet_sport,
240                                                            usin->sin_port);
241
242         inet->inet_id = prandom_u32();
243
244         err = tcp_connect(sk);
245
246         rt = NULL;
247         if (err)
248                 goto failure;
249
250         return 0;
251
252 failure:
253         /*
254          * This unhashes the socket and releases the local port,
255          * if necessary.
256          */
257         tcp_set_state(sk, TCP_CLOSE);
258         ip_rt_put(rt);
259         sk->sk_route_caps = 0;
260         inet->inet_dport = 0;
261         return err;
262 }
263 EXPORT_SYMBOL(tcp_v4_connect);
264
265 /*
266  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
267  * It can be called through tcp_release_cb() if socket was owned by user
268  * at the time tcp_v4_err() was called to handle ICMP message.
269  */
270 void tcp_v4_mtu_reduced(struct sock *sk)
271 {
272         struct inet_sock *inet = inet_sk(sk);
273         struct dst_entry *dst;
274         u32 mtu;
275
276         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
277                 return;
278         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
279         dst = inet_csk_update_pmtu(sk, mtu);
280         if (!dst)
281                 return;
282
283         /* Something is about to be wrong... Remember the soft error
284          * in case this connection is not able to recover.
285          */
286         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
287                 sk->sk_err_soft = EMSGSIZE;
288
289         mtu = dst_mtu(dst);
290
291         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
292             ip_sk_accept_pmtu(sk) &&
293             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
294                 tcp_sync_mss(sk, mtu);
295
296                 /* Resend the TCP packet because it's
297                  * clear that the old packet has been
298                  * dropped. This is the new "fast" path mtu
299                  * discovery.
300                  */
301                 tcp_simple_retransmit(sk);
302         } /* else let the usual retransmit timer handle it */
303 }
304 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
305
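/* Propagate an ICMP redirect to this socket's cached route, if one is set. */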
306 static void do_redirect(struct sk_buff *skb, struct sock *sk)
307 {
308         struct dst_entry *dst = __sk_dst_check(sk, 0);
309
310         if (dst)
311                 dst->ops->redirect(dst, sk, skb);
312 }
313
314
315 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
316 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
317 {
318         struct request_sock *req = inet_reqsk(sk);
319         struct net *net = sock_net(sk);
320
321         /* ICMPs are not backlogged, hence we cannot get
322          * an established socket here.
323          */
324         if (seq != tcp_rsk(req)->snt_isn) {
325                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
326         } else if (abort) {
327                 /*
328                  * Still in SYN_RECV, just remove it silently.
329                  * There is no good way to pass the error to the newly
330                  * created socket, and POSIX does not want network
331                  * errors returned from accept().
332                  */
333                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
334                 tcp_listendrop(req->rsk_listener);
335         }
336         reqsk_put(req);
337 }
338 EXPORT_SYMBOL(tcp_req_err);
339
340 /*
341  * This routine is called by the ICMP module when it gets some
342  * sort of error condition.  If err < 0 then the socket should
343  * be closed and the error returned to the user.  If err > 0
344  * it's just the icmp type << 8 | icmp code.  After adjustment the
345  * header points to the first 8 bytes of the tcp header.  We need
346  * to find the appropriate port.
347  *
348  * The locking strategy used here is very "optimistic". When
349  * someone else accesses the socket the ICMP is just dropped
350  * and for some paths there is no check at all.
351  * A more general error queue to queue errors for later handling
352  * is probably better.
353  *
354  */
355
356 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
357 {
358         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
359         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
360         struct inet_connection_sock *icsk;
361         struct tcp_sock *tp;
362         struct inet_sock *inet;
363         const int type = icmp_hdr(icmp_skb)->type;
364         const int code = icmp_hdr(icmp_skb)->code;
365         struct sock *sk;
366         struct sk_buff *skb;
367         struct request_sock *fastopen;
368         __u32 seq, snd_una;
369         __u32 remaining;
370         int err;
371         struct net *net = dev_net(icmp_skb->dev);
372
373         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
374                                        th->dest, iph->saddr, ntohs(th->source),
375                                        inet_iif(icmp_skb));
376         if (!sk) {
377                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
378                 return;
379         }
380         if (sk->sk_state == TCP_TIME_WAIT) {
381                 inet_twsk_put(inet_twsk(sk));
382                 return;
383         }
384         seq = ntohl(th->seq);
385         if (sk->sk_state == TCP_NEW_SYN_RECV)
386                 return tcp_req_err(sk, seq,
387                                   type == ICMP_PARAMETERPROB ||
388                                   type == ICMP_TIME_EXCEEDED ||
389                                   (type == ICMP_DEST_UNREACH &&
390                                    (code == ICMP_NET_UNREACH ||
391                                     code == ICMP_HOST_UNREACH)));
392
393         bh_lock_sock(sk);
394         /* If too many ICMPs get dropped on busy
395          * servers this needs to be solved differently.
396          * We do take care of the PMTU discovery (RFC1191) special case:
397          * we can receive locally generated ICMP messages while the socket is held.
398          */
399         if (sock_owned_by_user(sk)) {
400                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
401                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
402         }
403         if (sk->sk_state == TCP_CLOSE)
404                 goto out;
405
406         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
407                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
408                 goto out;
409         }
410
411         icsk = inet_csk(sk);
412         tp = tcp_sk(sk);
413         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
414         fastopen = tp->fastopen_rsk;
415         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
416         if (sk->sk_state != TCP_LISTEN &&
417             !between(seq, snd_una, tp->snd_nxt)) {
418                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
419                 goto out;
420         }
421
422         switch (type) {
423         case ICMP_REDIRECT:
424                 if (!sock_owned_by_user(sk))
425                         do_redirect(icmp_skb, sk);
426                 goto out;
427         case ICMP_SOURCE_QUENCH:
428                 /* Just silently ignore these. */
429                 goto out;
430         case ICMP_PARAMETERPROB:
431                 err = EPROTO;
432                 break;
433         case ICMP_DEST_UNREACH:
434                 if (code > NR_ICMP_UNREACH)
435                         goto out;
436
437                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
438                         /* We are not interested in TCP_LISTEN and open_requests
439                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
440                          * they should go through unfragmented).
441                          */
442                         if (sk->sk_state == TCP_LISTEN)
443                                 goto out;
444
445                         WRITE_ONCE(tp->mtu_info, info);
446                         if (!sock_owned_by_user(sk)) {
447                                 tcp_v4_mtu_reduced(sk);
448                         } else {
449                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
450                                         sock_hold(sk);
451                         }
452                         goto out;
453                 }
454
455                 err = icmp_err_convert[code].errno;
456                 /* check if icmp_skb allows revert of backoff
457                  * (see draft-zimmermann-tcp-lcd) */
458                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
459                         break;
460                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
461                     !icsk->icsk_backoff || fastopen)
462                         break;
463
464                 if (sock_owned_by_user(sk))
465                         break;
466
467                 skb = tcp_write_queue_head(sk);
468                 if (WARN_ON_ONCE(!skb))
469                         break;
470
471                 icsk->icsk_backoff--;
472                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
473                                                TCP_TIMEOUT_INIT;
474                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
475
476                 remaining = icsk->icsk_rto -
477                             min(icsk->icsk_rto,
478                                 tcp_time_stamp - tcp_skb_timestamp(skb));
479
480                 if (remaining) {
481                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
482                                                   remaining, TCP_RTO_MAX);
483                 } else {
484                         /* RTO revert clocked out retransmission.
485                          * Will retransmit now */
486                         tcp_retransmit_timer(sk);
487                 }
488
489                 break;
490         case ICMP_TIME_EXCEEDED:
491                 err = EHOSTUNREACH;
492                 break;
493         default:
494                 goto out;
495         }
496
497         switch (sk->sk_state) {
498         case TCP_SYN_SENT:
499         case TCP_SYN_RECV:
500                 /* Only in fast or simultaneous open. If a fast open socket is
501                  * already accepted it is treated as a connected one below.
502                  */
503                 if (fastopen && !fastopen->sk)
504                         break;
505
506                 if (!sock_owned_by_user(sk)) {
507                         sk->sk_err = err;
508
509                         sk->sk_error_report(sk);
510
511                         tcp_done(sk);
512                 } else {
513                         sk->sk_err_soft = err;
514                 }
515                 goto out;
516         }
517
518         /* If we've already connected we will keep trying
519          * until we time out, or the user gives up.
520          *
521          * rfc1122 4.2.3.9 allows us to treat only PROTO_UNREACH and
522          * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
523          * but it is obsoleted by pmtu discovery).
524          *
525          * Note that in the modern internet, where routing is unreliable
526          * and broken firewalls sit in every dark corner, sending random
527          * errors ordered by their masters, even these two messages finally lose
528          * their original sense (even Linux sends invalid PORT_UNREACHs)
529          *
530          * Now we are in compliance with RFCs.
531          *                                                      --ANK (980905)
532          */
533
534         inet = inet_sk(sk);
535         if (!sock_owned_by_user(sk) && inet->recverr) {
536                 sk->sk_err = err;
537                 sk->sk_error_report(sk);
538         } else  { /* Only an error on timeout */
539                 sk->sk_err_soft = err;
540         }
541
542 out:
543         bh_unlock_sock(sk);
544         sock_put(sk);
545 }
546
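/* Fill in the TCP checksum for an outgoing skb.  With CHECKSUM_PARTIAL only
 * the pseudo-header sum is stored and checksum offload completes it later;
 * otherwise the full checksum is computed in software here.
 */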
547 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
548 {
549         struct tcphdr *th = tcp_hdr(skb);
550
551         if (skb->ip_summed == CHECKSUM_PARTIAL) {
552                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
553                 skb->csum_start = skb_transport_header(skb) - skb->head;
554                 skb->csum_offset = offsetof(struct tcphdr, check);
555         } else {
556                 th->check = tcp_v4_check(skb->len, saddr, daddr,
557                                          csum_partial(th,
558                                                       th->doff << 2,
559                                                       skb->csum));
560         }
561 }
562
563 /* This routine computes an IPv4 TCP checksum. */
564 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
565 {
566         const struct inet_sock *inet = inet_sk(sk);
567
568         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
569 }
570 EXPORT_SYMBOL(tcp_v4_send_check);
571
572 /*
573  *      This routine will send an RST to the other tcp.
574  *
575  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
576  *                    for the reset.
577  *      Answer: if a packet caused the RST, it is not for a socket
578  *              existing in our system; if it is matched to a socket,
579  *              it is just a duplicate segment or a bug in the other side's TCP.
580  *              So we build the reply based only on parameters
581  *              that arrived with the segment.
582  *      Exception: precedence violation. We do not implement it in any case.
583  */
584
585 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
586 {
587         const struct tcphdr *th = tcp_hdr(skb);
588         struct {
589                 struct tcphdr th;
590 #ifdef CONFIG_TCP_MD5SIG
591                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
592 #endif
593         } rep;
594         struct ip_reply_arg arg;
595 #ifdef CONFIG_TCP_MD5SIG
596         struct tcp_md5sig_key *key = NULL;
597         const __u8 *hash_location = NULL;
598         unsigned char newhash[16];
599         int genhash;
600         struct sock *sk1 = NULL;
601 #endif
602         struct net *net;
603
604         /* Never send a reset in response to a reset. */
605         if (th->rst)
606                 return;
607
608         /* If sk is not NULL, it means we did a successful lookup and the
609          * incoming route had to be correct. prequeue might have dropped our dst.
610          */
611         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
612                 return;
613
614         /* Swap the send and the receive. */
615         memset(&rep, 0, sizeof(rep));
616         rep.th.dest   = th->source;
617         rep.th.source = th->dest;
618         rep.th.doff   = sizeof(struct tcphdr) / 4;
619         rep.th.rst    = 1;
620
621         if (th->ack) {
622                 rep.th.seq = th->ack_seq;
623         } else {
624                 rep.th.ack = 1;
625                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
626                                        skb->len - (th->doff << 2));
627         }
628
629         memset(&arg, 0, sizeof(arg));
630         arg.iov[0].iov_base = (unsigned char *)&rep;
631         arg.iov[0].iov_len  = sizeof(rep.th);
632
633         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
634 #ifdef CONFIG_TCP_MD5SIG
635         rcu_read_lock();
636         hash_location = tcp_parse_md5sig_option(th);
637         if (sk && sk_fullsock(sk)) {
638                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
639                                         &ip_hdr(skb)->saddr, AF_INET);
640         } else if (hash_location) {
641                 /*
642                  * active side is lost. Try to find the listening socket through
643                  * the source port, and then find the md5 key through the listening
644                  * socket. We do not lose security here:
645                  * the incoming packet is checked with the md5 hash of the found key,
646                  * and no RST is generated if the md5 hash doesn't match.
647                  */
648                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
649                                              ip_hdr(skb)->saddr,
650                                              th->source, ip_hdr(skb)->daddr,
651                                              ntohs(th->source), inet_iif(skb));
652                 /* don't send an rst if we can't find the key */
653                 if (!sk1)
654                         goto out;
655
656                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
657                                         &ip_hdr(skb)->saddr, AF_INET);
658                 if (!key)
659                         goto out;
660
661
662                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
663                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
664                         goto out;
665
666         }
667
668         if (key) {
669                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
670                                    (TCPOPT_NOP << 16) |
671                                    (TCPOPT_MD5SIG << 8) |
672                                    TCPOLEN_MD5SIG);
673                 /* Update length and the length the header thinks exists */
674                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
675                 rep.th.doff = arg.iov[0].iov_len / 4;
676
677                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
678                                      key, ip_hdr(skb)->saddr,
679                                      ip_hdr(skb)->daddr, &rep.th);
680         }
681 #endif
682         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
683                                       ip_hdr(skb)->saddr, /* XXX */
684                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
685         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
686         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
687
688         /* When the socket is gone, all binding information is lost and
689          * routing might fail in this case. No choice here: if we choose to force
690          * the input interface, we will misroute in the case of an asymmetric route.
691          */
692         if (sk)
693                 arg.bound_dev_if = sk->sk_bound_dev_if;
694
695         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
696                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
697
698         arg.tos = ip_hdr(skb)->tos;
699         local_bh_disable();
700         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
701                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
702                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
703                               &arg, arg.iov[0].iov_len);
704
705         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
706         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
707         local_bh_enable();
708
709 #ifdef CONFIG_TCP_MD5SIG
710 out:
711         rcu_read_unlock();
712 #endif
713 }
714
715 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
716    outside of socket context, is certainly ugly. What can I do?
717  */
718
719 static void tcp_v4_send_ack(struct net *net,
720                             struct sk_buff *skb, u32 seq, u32 ack,
721                             u32 win, u32 tsval, u32 tsecr, int oif,
722                             struct tcp_md5sig_key *key,
723                             int reply_flags, u8 tos)
724 {
725         const struct tcphdr *th = tcp_hdr(skb);
726         struct {
727                 struct tcphdr th;
728                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
729 #ifdef CONFIG_TCP_MD5SIG
730                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
731 #endif
732                         ];
733         } rep;
734         struct ip_reply_arg arg;
735
736         memset(&rep.th, 0, sizeof(struct tcphdr));
737         memset(&arg, 0, sizeof(arg));
738
739         arg.iov[0].iov_base = (unsigned char *)&rep;
740         arg.iov[0].iov_len  = sizeof(rep.th);
741         if (tsecr) {
742                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
743                                    (TCPOPT_TIMESTAMP << 8) |
744                                    TCPOLEN_TIMESTAMP);
745                 rep.opt[1] = htonl(tsval);
746                 rep.opt[2] = htonl(tsecr);
747                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
748         }
749
750         /* Swap the send and the receive. */
751         rep.th.dest    = th->source;
752         rep.th.source  = th->dest;
753         rep.th.doff    = arg.iov[0].iov_len / 4;
754         rep.th.seq     = htonl(seq);
755         rep.th.ack_seq = htonl(ack);
756         rep.th.ack     = 1;
757         rep.th.window  = htons(win);
758
759 #ifdef CONFIG_TCP_MD5SIG
760         if (key) {
761                 int offset = (tsecr) ? 3 : 0;
762
763                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
764                                           (TCPOPT_NOP << 16) |
765                                           (TCPOPT_MD5SIG << 8) |
766                                           TCPOLEN_MD5SIG);
767                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
768                 rep.th.doff = arg.iov[0].iov_len/4;
769
770                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
771                                     key, ip_hdr(skb)->saddr,
772                                     ip_hdr(skb)->daddr, &rep.th);
773         }
774 #endif
775         arg.flags = reply_flags;
776         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
777                                       ip_hdr(skb)->saddr, /* XXX */
778                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
779         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
780         if (oif)
781                 arg.bound_dev_if = oif;
782         arg.tos = tos;
783         local_bh_disable();
784         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
785                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
786                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
787                               &arg, arg.iov[0].iov_len);
788
789         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
790         local_bh_enable();
791 }
792
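/* Answer a segment that hit a TIME-WAIT socket with the ACK that state
 * calls for, then drop the timewait reference.
 */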
793 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
794 {
795         struct inet_timewait_sock *tw = inet_twsk(sk);
796         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
797
798         tcp_v4_send_ack(sock_net(sk), skb,
799                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
800                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
801                         tcp_time_stamp + tcptw->tw_ts_offset,
802                         tcptw->tw_ts_recent,
803                         tw->tw_bound_dev_if,
804                         tcp_twsk_md5_key(tcptw),
805                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
806                         tw->tw_tos
807                         );
808
809         inet_twsk_put(tw);
810 }
811
812 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
813                                   struct request_sock *req)
814 {
815         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
816          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
817          */
818         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
819                                              tcp_sk(sk)->snd_nxt;
820
821         /* RFC 7323 2.3
822          * The window field (SEG.WND) of every outgoing segment, with the
823          * exception of <SYN> segments, MUST be right-shifted by
824          * Rcv.Wind.Shift bits:
825          */
826         tcp_v4_send_ack(sock_net(sk), skb, seq,
827                         tcp_rsk(req)->rcv_nxt,
828                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
829                         tcp_time_stamp,
830                         req->ts_recent,
831                         0,
832                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
833                                           AF_INET),
834                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
835                         ip_hdr(skb)->tos);
836 }
837
838 /*
839  *      Send a SYN-ACK after having received a SYN.
840  *      This still operates on a request_sock only, not on a big
841  *      socket.
842  */
843 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
844                               struct flowi *fl,
845                               struct request_sock *req,
846                               struct tcp_fastopen_cookie *foc,
847                               enum tcp_synack_type synack_type)
848 {
849         const struct inet_request_sock *ireq = inet_rsk(req);
850         struct flowi4 fl4;
851         int err = -1;
852         struct sk_buff *skb;
853
854         /* First, grab a route. */
855         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
856                 return -1;
857
858         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
859
860         if (skb) {
861                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
862
863                 rcu_read_lock();
864                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
865                                             ireq->ir_rmt_addr,
866                                             rcu_dereference(ireq->ireq_opt));
867                 rcu_read_unlock();
868                 err = net_xmit_eval(err);
869         }
870
871         return err;
872 }
873
874 /*
875  *      IPv4 request_sock destructor.
876  */
877 static void tcp_v4_reqsk_destructor(struct request_sock *req)
878 {
879         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
880 }
881
882 #ifdef CONFIG_TCP_MD5SIG
883 /*
884  * RFC2385 MD5 checksumming requires a mapping of
885  * IP address->MD5 Key.
886  * We need to maintain these in the sk structure.
887  */
888
889 /* Find the Key structure for an address.  */
890 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
891                                          const union tcp_md5_addr *addr,
892                                          int family)
893 {
894         const struct tcp_sock *tp = tcp_sk(sk);
895         struct tcp_md5sig_key *key;
896         unsigned int size = sizeof(struct in_addr);
897         const struct tcp_md5sig_info *md5sig;
898
899         /* caller either holds rcu_read_lock() or socket lock */
900         md5sig = rcu_dereference_check(tp->md5sig_info,
901                                        lockdep_sock_is_held(sk));
902         if (!md5sig)
903                 return NULL;
904 #if IS_ENABLED(CONFIG_IPV6)
905         if (family == AF_INET6)
906                 size = sizeof(struct in6_addr);
907 #endif
908         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
909                 if (key->family != family)
910                         continue;
911                 if (!memcmp(&key->addr, addr, size))
912                         return key;
913         }
914         return NULL;
915 }
916 EXPORT_SYMBOL(tcp_md5_do_lookup);
917
918 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
919                                          const struct sock *addr_sk)
920 {
921         const union tcp_md5_addr *addr;
922
923         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
924         return tcp_md5_do_lookup(sk, addr, AF_INET);
925 }
926 EXPORT_SYMBOL(tcp_v4_md5_lookup);
927
928 /* This can be called on a newly created socket, from other files */
929 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
930                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
931 {
932         /* Add Key to the list */
933         struct tcp_md5sig_key *key;
934         struct tcp_sock *tp = tcp_sk(sk);
935         struct tcp_md5sig_info *md5sig;
936
937         key = tcp_md5_do_lookup(sk, addr, family);
938         if (key) {
939                 /* Pre-existing entry - just update that one.
940                  * Note that the key might be used concurrently.
941                  */
942                 memcpy(key->key, newkey, newkeylen);
943
944                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
945                  * Also note that a reader could catch new key->keylen value
946                  * but old key->key[], this is the reason we use __GFP_ZERO
947                  * at sock_kmalloc() time below these lines.
948                  */
949                 WRITE_ONCE(key->keylen, newkeylen);
950
951                 return 0;
952         }
953
954         md5sig = rcu_dereference_protected(tp->md5sig_info,
955                                            lockdep_sock_is_held(sk));
956         if (!md5sig) {
957                 md5sig = kmalloc(sizeof(*md5sig), gfp);
958                 if (!md5sig)
959                         return -ENOMEM;
960
961                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
962                 INIT_HLIST_HEAD(&md5sig->head);
963                 rcu_assign_pointer(tp->md5sig_info, md5sig);
964         }
965
966         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
967         if (!key)
968                 return -ENOMEM;
969         if (!tcp_alloc_md5sig_pool()) {
970                 sock_kfree_s(sk, key, sizeof(*key));
971                 return -ENOMEM;
972         }
973
974         memcpy(key->key, newkey, newkeylen);
975         key->keylen = newkeylen;
976         key->family = family;
977         memcpy(&key->addr, addr,
978                (family == AF_INET6) ? sizeof(struct in6_addr) :
979                                       sizeof(struct in_addr));
980         hlist_add_head_rcu(&key->node, &md5sig->head);
981         return 0;
982 }
983 EXPORT_SYMBOL(tcp_md5_do_add);
984
985 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
986 {
987         struct tcp_md5sig_key *key;
988
989         key = tcp_md5_do_lookup(sk, addr, family);
990         if (!key)
991                 return -ENOENT;
992         hlist_del_rcu(&key->node);
993         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
994         kfree_rcu(key, rcu);
995         return 0;
996 }
997 EXPORT_SYMBOL(tcp_md5_do_del);
998
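/* Remove and free every MD5 key configured on this socket. */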
999 static void tcp_clear_md5_list(struct sock *sk)
1000 {
1001         struct tcp_sock *tp = tcp_sk(sk);
1002         struct tcp_md5sig_key *key;
1003         struct hlist_node *n;
1004         struct tcp_md5sig_info *md5sig;
1005
1006         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1007
1008         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1009                 hlist_del_rcu(&key->node);
1010                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1011                 kfree_rcu(key, rcu);
1012         }
1013 }
1014
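/* setsockopt(TCP_MD5SIG) handler: copy a struct tcp_md5sig from user space,
 * then delete the key for that peer (tcpm_keylen == 0) or install/replace it.
 * For reference, a minimal hypothetical userspace sketch (the peer address
 * and key below are illustrative only):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */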
1015 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1016                                  int optlen)
1017 {
1018         struct tcp_md5sig cmd;
1019         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1020
1021         if (optlen < sizeof(cmd))
1022                 return -EINVAL;
1023
1024         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1025                 return -EFAULT;
1026
1027         if (sin->sin_family != AF_INET)
1028                 return -EINVAL;
1029
1030         if (!cmd.tcpm_keylen)
1031                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1032                                       AF_INET);
1033
1034         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1035                 return -EINVAL;
1036
1037         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1038                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1039                               GFP_KERNEL);
1040 }
1041
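/* Feed the TCP-MD5 pseudo-header block into the hash: source/destination
 * addresses, protocol and segment length, followed by a copy of the TCP
 * header with its checksum field zeroed, as RFC 2385 requires.
 */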
1042 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1043                                    __be32 daddr, __be32 saddr,
1044                                    const struct tcphdr *th, int nbytes)
1045 {
1046         struct tcp4_pseudohdr *bp;
1047         struct scatterlist sg;
1048         struct tcphdr *_th;
1049
1050         bp = hp->scratch;
1051         bp->saddr = saddr;
1052         bp->daddr = daddr;
1053         bp->pad = 0;
1054         bp->protocol = IPPROTO_TCP;
1055         bp->len = cpu_to_be16(nbytes);
1056
1057         _th = (struct tcphdr *)(bp + 1);
1058         memcpy(_th, th, sizeof(*th));
1059         _th->check = 0;
1060
1061         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1062         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1063                                 sizeof(*bp) + sizeof(*th));
1064         return crypto_ahash_update(hp->md5_req);
1065 }
1066
1067 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1068                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1069 {
1070         struct tcp_md5sig_pool *hp;
1071         struct ahash_request *req;
1072
1073         hp = tcp_get_md5sig_pool();
1074         if (!hp)
1075                 goto clear_hash_noput;
1076         req = hp->md5_req;
1077
1078         if (crypto_ahash_init(req))
1079                 goto clear_hash;
1080         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1081                 goto clear_hash;
1082         if (tcp_md5_hash_key(hp, key))
1083                 goto clear_hash;
1084         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1085         if (crypto_ahash_final(req))
1086                 goto clear_hash;
1087
1088         tcp_put_md5sig_pool();
1089         return 0;
1090
1091 clear_hash:
1092         tcp_put_md5sig_pool();
1093 clear_hash_noput:
1094         memset(md5_hash, 0, 16);
1095         return 1;
1096 }
1097
1098 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1099                         const struct sock *sk,
1100                         const struct sk_buff *skb)
1101 {
1102         struct tcp_md5sig_pool *hp;
1103         struct ahash_request *req;
1104         const struct tcphdr *th = tcp_hdr(skb);
1105         __be32 saddr, daddr;
1106
1107         if (sk) { /* valid for established/request sockets */
1108                 saddr = sk->sk_rcv_saddr;
1109                 daddr = sk->sk_daddr;
1110         } else {
1111                 const struct iphdr *iph = ip_hdr(skb);
1112                 saddr = iph->saddr;
1113                 daddr = iph->daddr;
1114         }
1115
1116         hp = tcp_get_md5sig_pool();
1117         if (!hp)
1118                 goto clear_hash_noput;
1119         req = hp->md5_req;
1120
1121         if (crypto_ahash_init(req))
1122                 goto clear_hash;
1123
1124         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1125                 goto clear_hash;
1126         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1127                 goto clear_hash;
1128         if (tcp_md5_hash_key(hp, key))
1129                 goto clear_hash;
1130         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1131         if (crypto_ahash_final(req))
1132                 goto clear_hash;
1133
1134         tcp_put_md5sig_pool();
1135         return 0;
1136
1137 clear_hash:
1138         tcp_put_md5sig_pool();
1139 clear_hash_noput:
1140         memset(md5_hash, 0, 16);
1141         return 1;
1142 }
1143 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1144
1145 #endif
1146
1147 /* Called with rcu_read_lock() */
1148 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1149                                     const struct sk_buff *skb)
1150 {
1151 #ifdef CONFIG_TCP_MD5SIG
1152         /*
1153          * This gets called for each TCP segment that arrives
1154          * so we want to be efficient.
1155          * We have 3 drop cases:
1156          * o No MD5 hash and one expected.
1157          * o MD5 hash and we're not expecting one.
1158          * o MD5 hash and it's wrong.
1159          */
1160         const __u8 *hash_location = NULL;
1161         struct tcp_md5sig_key *hash_expected;
1162         const struct iphdr *iph = ip_hdr(skb);
1163         const struct tcphdr *th = tcp_hdr(skb);
1164         int genhash;
1165         unsigned char newhash[16];
1166
1167         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1168                                           AF_INET);
1169         hash_location = tcp_parse_md5sig_option(th);
1170
1171         /* We've parsed the options - do we have a hash? */
1172         if (!hash_expected && !hash_location)
1173                 return false;
1174
1175         if (hash_expected && !hash_location) {
1176                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1177                 return true;
1178         }
1179
1180         if (!hash_expected && hash_location) {
1181                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1182                 return true;
1183         }
1184
1185         /* Okay, so this is hash_expected and hash_location -
1186          * so we need to calculate the checksum.
1187          */
1188         genhash = tcp_v4_md5_hash_skb(newhash,
1189                                       hash_expected,
1190                                       NULL, skb);
1191
1192         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1193                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1194                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1195                                      &iph->saddr, ntohs(th->source),
1196                                      &iph->daddr, ntohs(th->dest),
1197                                      genhash ? " tcp_v4_calc_md5_hash failed"
1198                                      : "");
1199                 return true;
1200         }
1201         return false;
1202 #endif
1203         return false;
1204 }
1205
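/* Record the addresses and any saved IP options from the incoming SYN on
 * the embryonic request sock.
 */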
1206 static void tcp_v4_init_req(struct request_sock *req,
1207                             const struct sock *sk_listener,
1208                             struct sk_buff *skb)
1209 {
1210         struct inet_request_sock *ireq = inet_rsk(req);
1211
1212         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1213         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1214         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(skb));
1215 }
1216
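/* Route the SYN-ACK for this request.  When @strict is non-NULL, report
 * whether the routed destination still matches the peer address recorded
 * in the request.
 */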
1217 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1218                                           struct flowi *fl,
1219                                           const struct request_sock *req,
1220                                           bool *strict)
1221 {
1222         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1223
1224         if (strict) {
1225                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1226                         *strict = true;
1227                 else
1228                         *strict = false;
1229         }
1230
1231         return dst;
1232 }
1233
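/* Operations attached to IPv4 request socks while a connection is being
 * established.
 */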
1234 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1235         .family         =       PF_INET,
1236         .obj_size       =       sizeof(struct tcp_request_sock),
1237         .rtx_syn_ack    =       tcp_rtx_synack,
1238         .send_ack       =       tcp_v4_reqsk_send_ack,
1239         .destructor     =       tcp_v4_reqsk_destructor,
1240         .send_reset     =       tcp_v4_send_reset,
1241         .syn_ack_timeout =      tcp_syn_ack_timeout,
1242 };
1243
1244 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1245         .mss_clamp      =       TCP_MSS_DEFAULT,
1246 #ifdef CONFIG_TCP_MD5SIG
1247         .req_md5_lookup =       tcp_v4_md5_lookup,
1248         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1249 #endif
1250         .init_req       =       tcp_v4_init_req,
1251 #ifdef CONFIG_SYN_COOKIES
1252         .cookie_init_seq =      cookie_v4_init_sequence,
1253 #endif
1254         .route_req      =       tcp_v4_route_req,
1255         .init_seq       =       tcp_v4_init_sequence,
1256         .send_synack    =       tcp_v4_send_synack,
1257 };
1258
1259 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1260 {
1261         /* Never answer SYNs sent to broadcast or multicast */
1262         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1263                 goto drop;
1264
1265         return tcp_conn_request(&tcp_request_sock_ops,
1266                                 &tcp_request_sock_ipv4_ops, sk, skb);
1267
1268 drop:
1269         tcp_listendrop(sk);
1270         return 0;
1271 }
1272 EXPORT_SYMBOL(tcp_v4_conn_request);
1273
1274
1275 /*
1276  * The three-way handshake has completed - we got a valid ACK -
1277  * now create the new socket.
1278  */
1279 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1280                                   struct request_sock *req,
1281                                   struct dst_entry *dst,
1282                                   struct request_sock *req_unhash,
1283                                   bool *own_req)
1284 {
1285         struct inet_request_sock *ireq;
1286         struct inet_sock *newinet;
1287         struct tcp_sock *newtp;
1288         struct sock *newsk;
1289 #ifdef CONFIG_TCP_MD5SIG
1290         struct tcp_md5sig_key *key;
1291 #endif
1292         struct ip_options_rcu *inet_opt;
1293
1294         if (sk_acceptq_is_full(sk))
1295                 goto exit_overflow;
1296
1297         newsk = tcp_create_openreq_child(sk, req, skb);
1298         if (!newsk)
1299                 goto exit_nonewsk;
1300
1301         newsk->sk_gso_type = SKB_GSO_TCPV4;
1302         inet_sk_rx_dst_set(newsk, skb);
1303
1304         newtp                 = tcp_sk(newsk);
1305         newinet               = inet_sk(newsk);
1306         ireq                  = inet_rsk(req);
1307         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1308         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1309         newsk->sk_bound_dev_if = ireq->ir_iif;
1310         newinet->inet_saddr   = ireq->ir_loc_addr;
1311         inet_opt              = rcu_dereference(ireq->ireq_opt);
1312         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1313         newinet->mc_index     = inet_iif(skb);
1314         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1315         newinet->rcv_tos      = ip_hdr(skb)->tos;
1316         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1317         if (inet_opt)
1318                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1319         newinet->inet_id = prandom_u32();
1320
1321         if (!dst) {
1322                 dst = inet_csk_route_child_sock(sk, newsk, req);
1323                 if (!dst)
1324                         goto put_and_exit;
1325         } else {
1326                 /* syncookie case : see end of cookie_v4_check() */
1327         }
1328         sk_setup_caps(newsk, dst);
1329
1330         tcp_ca_openreq_child(newsk, dst);
1331
1332         tcp_sync_mss(newsk, dst_mtu(dst));
1333         newtp->advmss = dst_metric_advmss(dst);
1334         if (tcp_sk(sk)->rx_opt.user_mss &&
1335             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1336                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1337
1338         tcp_initialize_rcv_mss(newsk);
1339
1340 #ifdef CONFIG_TCP_MD5SIG
1341         /* Copy over the MD5 key from the original socket */
1342         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1343                                 AF_INET);
1344         if (key) {
1345                 /*
1346                  * We're using one, so create a matching key
1347                  * on the newsk structure. If we fail to get
1348                  * memory, then we end up not copying the key
1349                  * across. Shucks.
1350                  */
1351                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1352                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1353                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1354         }
1355 #endif
1356
1357         if (__inet_inherit_port(sk, newsk) < 0)
1358                 goto put_and_exit;
1359         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1360         if (likely(*own_req)) {
1361                 tcp_move_syn(newtp, req);
1362                 ireq->ireq_opt = NULL;
1363         } else {
1364                 newinet->inet_opt = NULL;
1365         }
1366         return newsk;
1367
1368 exit_overflow:
1369         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1370 exit_nonewsk:
1371         dst_release(dst);
1372 exit:
1373         tcp_listendrop(sk);
1374         return NULL;
1375 put_and_exit:
1376         newinet->inet_opt = NULL;
1377         inet_csk_prepare_forced_close(newsk);
1378         tcp_done(newsk);
1379         goto exit;
1380 }
1381 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1382
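/* On a listener, a non-SYN segment may carry a SYN cookie; let
 * cookie_v4_check() validate it and create the child socket if it does.
 */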
1383 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1384 {
1385 #ifdef CONFIG_SYN_COOKIES
1386         const struct tcphdr *th = tcp_hdr(skb);
1387
1388         if (!th->syn)
1389                 sk = cookie_v4_check(sk, skb);
1390 #endif
1391         return sk;
1392 }
1393
1394 /* The socket must have its spinlock held when we get
1395  * here, unless it is a TCP_LISTEN socket.
1396  *
1397  * We have a potential double-lock case here, so even when
1398  * doing backlog processing we use the BH locking scheme.
1399  * This is because we cannot sleep with the original spinlock
1400  * held.
1401  */
1402 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1403 {
1404         struct sock *rsk;
1405
1406         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1407                 struct dst_entry *dst = sk->sk_rx_dst;
1408
1409                 sock_rps_save_rxhash(sk, skb);
1410                 sk_mark_napi_id(sk, skb);
1411                 if (dst) {
1412                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1413                             !dst->ops->check(dst, 0)) {
1414                                 dst_release(dst);
1415                                 sk->sk_rx_dst = NULL;
1416                         }
1417                 }
1418                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1419                 return 0;
1420         }
1421
1422         if (tcp_checksum_complete(skb))
1423                 goto csum_err;
1424
1425         if (sk->sk_state == TCP_LISTEN) {
1426                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1427
1428                 if (!nsk)
1429                         goto discard;
1430                 if (nsk != sk) {
1431                         sock_rps_save_rxhash(nsk, skb);
1432                         sk_mark_napi_id(nsk, skb);
1433                         if (tcp_child_process(sk, nsk, skb)) {
1434                                 rsk = nsk;
1435                                 goto reset;
1436                         }
1437                         return 0;
1438                 }
1439         } else
1440                 sock_rps_save_rxhash(sk, skb);
1441
1442         if (tcp_rcv_state_process(sk, skb)) {
1443                 rsk = sk;
1444                 goto reset;
1445         }
1446         return 0;
1447
1448 reset:
1449         tcp_v4_send_reset(rsk, skb);
1450 discard:
1451         kfree_skb(skb);
1452         /* Be careful here. If this function gets more complicated and
1453          * gcc suffers from register pressure on the x86, sk (in %ebx)
1454          * might be destroyed here. This current version compiles correctly,
1455          * but you have been warned.
1456          */
1457         return 0;
1458
1459 csum_err:
1460         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1461         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1462         goto discard;
1463 }
1464 EXPORT_SYMBOL(tcp_v4_do_rcv);
1465
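/* Early demux, invoked from the IP receive path before routing: if the
 * segment matches an established socket, attach that socket and its cached
 * rx dst to the skb so the normal route lookup can be skipped.
 */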
1466 void tcp_v4_early_demux(struct sk_buff *skb)
1467 {
1468         const struct iphdr *iph;
1469         const struct tcphdr *th;
1470         struct sock *sk;
1471
1472         if (skb->pkt_type != PACKET_HOST)
1473                 return;
1474
1475         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1476                 return;
1477
1478         iph = ip_hdr(skb);
1479         th = tcp_hdr(skb);
1480
1481         if (th->doff < sizeof(struct tcphdr) / 4)
1482                 return;
1483
1484         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1485                                        iph->saddr, th->source,
1486                                        iph->daddr, ntohs(th->dest),
1487                                        skb->skb_iif);
1488         if (sk) {
1489                 skb->sk = sk;
1490                 skb->destructor = sock_edemux;
1491                 if (sk_fullsock(sk)) {
1492                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1493
1494                         if (dst)
1495                                 dst = dst_check(dst, 0);
1496                         if (dst &&
1497                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1498                                 skb_dst_set_noref(skb, dst);
1499                 }
1500         }
1501 }
1502
1503 /* Packet is added to VJ-style prequeue for processing in process
1504  * context, if a reader task is waiting. Apparently, this exciting
1505  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1506  * failed somewhere. Latency? Burstiness? Well, at least now we will
1507  * see why it failed. 8)8)                               --ANK
1508  *
1509  */
1510 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1511 {
1512         struct tcp_sock *tp = tcp_sk(sk);
1513
1514         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1515                 return false;
1516
1517         if (skb->len <= tcp_hdrlen(skb) &&
1518             skb_queue_len(&tp->ucopy.prequeue) == 0)
1519                 return false;
1520
1521         /* Before escaping RCU protected region, we need to take care of skb
1522          * dst. Prequeue is only enabled for established sockets.
1523          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1524          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1525          * Instead of doing a full sk_rx_dst validity check here, let's perform
1526          */
1527         if (likely(sk->sk_rx_dst))
1528                 skb_dst_drop(skb);
1529         else
1530                 skb_dst_force_safe(skb);
1531
1532         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1533         tp->ucopy.memory += skb->truesize;
1534         if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1535             tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1536                 struct sk_buff *skb1;
1537
1538                 BUG_ON(sock_owned_by_user(sk));
1539                 __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1540                                 skb_queue_len(&tp->ucopy.prequeue));
1541
1542                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1543                         sk_backlog_rcv(sk, skb1);
1544
1545                 tp->ucopy.memory = 0;
1546         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1547                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1548                                            POLLIN | POLLRDNORM | POLLRDBAND);
1549                 if (!inet_csk_ack_scheduled(sk))
1550                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1551                                                   (3 * tcp_rto_min(sk)) / 4,
1552                                                   TCP_RTO_MAX);
1553         }
1554         return true;
1555 }
1556 EXPORT_SYMBOL(tcp_prequeue);
1557
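/* Called from tcp_v4_rcv() while the socket is owned by user context.
 * Queue the skb on the backlog, limited to rcvbuf + sndbuf plus a small
 * headroom; on overflow the socket is unlocked, a TCPBacklogDrop is
 * accounted and true is returned so the caller drops the skb.
 */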
1558 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1559 {
1560         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1561
1562         /* Only the socket owner can try to collapse/prune rx queues
1563          * to reduce memory overhead, so add a little headroom here.
1564          * Only a few socket backlogs are likely to be non-empty concurrently.
1565          */
1566         limit += 64*1024;
1567
1568         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1569          * we can fix skb->truesize to its real value to avoid future drops.
1570          * This is valid because skb is not yet charged to the socket.
1571          * It has been noticed that pure SACK packets were sometimes dropped
1572          * (if cooked by drivers without the copybreak feature).
1573          */
1574         if (!skb->data_len)
1575                 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
1576
1577         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1578                 bh_unlock_sock(sk);
1579                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1580                 return true;
1581         }
1582         return false;
1583 }
1584 EXPORT_SYMBOL(tcp_add_backlog);
1585
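/* Run the attached socket filter, allowing it to trim the payload but
 * never below the TCP header itself; end_seq is adjusted by however many
 * bytes the filter removed.
 */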
1586 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1587 {
1588         struct tcphdr *th = (struct tcphdr *)skb->data;
1589         unsigned int eaten = skb->len;
1590         int err;
1591
1592         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1593         if (!err) {
1594                 eaten -= skb->len;
1595                 TCP_SKB_CB(skb)->end_seq -= eaten;
1596         }
1597         return err;
1598 }
1599 EXPORT_SYMBOL(tcp_filter);
1600
1601 /*
1602  *      From tcp_input.c
1603  */
1604
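/* Main receive routine, registered as the IPPROTO_TCP handler (af_inet.c).
 * Validates header and checksum, fills in TCP_SKB_CB(), looks up the owning
 * socket and hands the segment to the fast path, the prequeue, the backlog
 * or the request-sock/TIME_WAIT handling as appropriate.
 */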
1605 int tcp_v4_rcv(struct sk_buff *skb)
1606 {
1607         struct net *net = dev_net(skb->dev);
1608         const struct iphdr *iph;
1609         const struct tcphdr *th;
1610         bool refcounted;
1611         struct sock *sk;
1612         int ret;
1613
1614         if (skb->pkt_type != PACKET_HOST)
1615                 goto discard_it;
1616
1617         /* Count it even if it's bad */
1618         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1619
1620         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1621                 goto discard_it;
1622
1623         th = (const struct tcphdr *)skb->data;
1624
1625         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1626                 goto bad_packet;
1627         if (!pskb_may_pull(skb, th->doff * 4))
1628                 goto discard_it;
1629
1630         /* An explanation is required here, I think.
1631          * Packet length and doff are validated by header prediction,
1632          * provided the case of th->doff==0 is eliminated.
1633          * So, we defer the checks. */
1634
1635         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1636                 goto csum_error;
1637
1638         th = (const struct tcphdr *)skb->data;
1639         iph = ip_hdr(skb);
1640         /* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB().
1641          * barrier() makes sure the compiler won't play aliasing games.
1642          */
1643         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1644                 sizeof(struct inet_skb_parm));
1645         barrier();
1646
1647         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1648         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1649                                     skb->len - th->doff * 4);
1650         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1651         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1652         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1653         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1654         TCP_SKB_CB(skb)->sacked  = 0;
1655
1656 lookup:
1657         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1658                                th->dest, &refcounted);
1659         if (!sk)
1660                 goto no_tcp_socket;
1661
1662 process:
1663         if (sk->sk_state == TCP_TIME_WAIT)
1664                 goto do_time_wait;
1665
1666         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1667                 struct request_sock *req = inet_reqsk(sk);
1668                 struct sock *nsk;
1669
1670                 sk = req->rsk_listener;
1671                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1672                         sk_drops_add(sk, skb);
1673                         reqsk_put(req);
1674                         goto discard_it;
1675                 }
1676                 if (tcp_checksum_complete(skb)) {
1677                         reqsk_put(req);
1678                         goto csum_error;
1679                 }
1680                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1681                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1682                         goto lookup;
1683                 }
1684                 /* We own a reference on the listener; increase it again
1685                  * as we might lose it too soon.
1686                  */
1687                 sock_hold(sk);
1688                 refcounted = true;
1689                 nsk = tcp_check_req(sk, skb, req, false);
1690                 if (!nsk) {
1691                         reqsk_put(req);
1692                         goto discard_and_relse;
1693                 }
1694                 if (nsk == sk) {
1695                         reqsk_put(req);
1696                 } else if (tcp_child_process(sk, nsk, skb)) {
1697                         tcp_v4_send_reset(nsk, skb);
1698                         goto discard_and_relse;
1699                 } else {
1700                         sock_put(sk);
1701                         return 0;
1702                 }
1703         }
1704         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1705                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1706                 goto discard_and_relse;
1707         }
1708
1709         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1710                 goto discard_and_relse;
1711
1712         if (tcp_v4_inbound_md5_hash(sk, skb))
1713                 goto discard_and_relse;
1714
1715         nf_reset(skb);
1716
1717         if (tcp_filter(sk, skb))
1718                 goto discard_and_relse;
1719         th = (const struct tcphdr *)skb->data;
1720         iph = ip_hdr(skb);
1721
1722         skb->dev = NULL;
1723
1724         if (sk->sk_state == TCP_LISTEN) {
1725                 ret = tcp_v4_do_rcv(sk, skb);
1726                 goto put_and_return;
1727         }
1728
1729         sk_incoming_cpu_update(sk);
1730
1731         bh_lock_sock_nested(sk);
1732         tcp_segs_in(tcp_sk(sk), skb);
1733         ret = 0;
1734         if (!sock_owned_by_user(sk)) {
1735                 if (!tcp_prequeue(sk, skb))
1736                         ret = tcp_v4_do_rcv(sk, skb);
1737         } else if (tcp_add_backlog(sk, skb)) {
1738                 goto discard_and_relse;
1739         }
1740         bh_unlock_sock(sk);
1741
1742 put_and_return:
1743         if (refcounted)
1744                 sock_put(sk);
1745
1746         return ret;
1747
1748 no_tcp_socket:
1749         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1750                 goto discard_it;
1751
1752         if (tcp_checksum_complete(skb)) {
1753 csum_error:
1754                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1755 bad_packet:
1756                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1757         } else {
1758                 tcp_v4_send_reset(NULL, skb);
1759         }
1760
1761 discard_it:
1762         /* Discard frame. */
1763         kfree_skb(skb);
1764         return 0;
1765
1766 discard_and_relse:
1767         sk_drops_add(sk, skb);
1768         if (refcounted)
1769                 sock_put(sk);
1770         goto discard_it;
1771
1772 do_time_wait:
1773         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1774                 inet_twsk_put(inet_twsk(sk));
1775                 goto discard_it;
1776         }
1777
1778         if (tcp_checksum_complete(skb)) {
1779                 inet_twsk_put(inet_twsk(sk));
1780                 goto csum_error;
1781         }
1782         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1783         case TCP_TW_SYN: {
1784                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1785                                                         &tcp_hashinfo, skb,
1786                                                         __tcp_hdrlen(th),
1787                                                         iph->saddr, th->source,
1788                                                         iph->daddr, th->dest,
1789                                                         inet_iif(skb));
1790                 if (sk2) {
1791                         inet_twsk_deschedule_put(inet_twsk(sk));
1792                         sk = sk2;
1793                         refcounted = false;
1794                         goto process;
1795                 }
1796                 /* Fall through to ACK */
1797         }
1798         case TCP_TW_ACK:
1799                 tcp_v4_timewait_ack(sk, skb);
1800                 break;
1801         case TCP_TW_RST:
1802                 tcp_v4_send_reset(sk, skb);
1803                 inet_twsk_deschedule_put(inet_twsk(sk));
1804                 goto discard_it;
1805         case TCP_TW_SUCCESS:;
1806         }
1807         goto discard_it;
1808 }
1809
1810 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1811         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1812         .twsk_unique    = tcp_twsk_unique,
1813         .twsk_destructor= tcp_twsk_destructor,
1814 };
1815
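/* Cache the input route of this skb on the socket, so that later segments
 * arriving on the same interface can reuse it (see tcp_v4_early_demux() and
 * the TCP_ESTABLISHED fast path in tcp_v4_do_rcv()).
 */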
1816 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1817 {
1818         struct dst_entry *dst = skb_dst(skb);
1819
1820         if (dst && dst_hold_safe(dst)) {
1821                 sk->sk_rx_dst = dst;
1822                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1823         }
1824 }
1825 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1826
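/* Address-family specific operations installed on IPv4 TCP sockets by
 * tcp_v4_init_sock().
 */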
1827 const struct inet_connection_sock_af_ops ipv4_specific = {
1828         .queue_xmit        = ip_queue_xmit,
1829         .send_check        = tcp_v4_send_check,
1830         .rebuild_header    = inet_sk_rebuild_header,
1831         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1832         .conn_request      = tcp_v4_conn_request,
1833         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1834         .net_header_len    = sizeof(struct iphdr),
1835         .setsockopt        = ip_setsockopt,
1836         .getsockopt        = ip_getsockopt,
1837         .addr2sockaddr     = inet_csk_addr2sockaddr,
1838         .sockaddr_len      = sizeof(struct sockaddr_in),
1839         .bind_conflict     = inet_csk_bind_conflict,
1840 #ifdef CONFIG_COMPAT
1841         .compat_setsockopt = compat_ip_setsockopt,
1842         .compat_getsockopt = compat_ip_getsockopt,
1843 #endif
1844         .mtu_reduced       = tcp_v4_mtu_reduced,
1845 };
1846 EXPORT_SYMBOL(ipv4_specific);
1847
1848 #ifdef CONFIG_TCP_MD5SIG
1849 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1850         .md5_lookup             = tcp_v4_md5_lookup,
1851         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1852         .md5_parse              = tcp_v4_parse_md5_keys,
1853 };
1854 #endif
1855
1856 /* NOTE: A lot of fields are set to zero explicitly by the call to
1857  *       sk_alloc(), so they need not be initialized here.
1858  */
1859 static int tcp_v4_init_sock(struct sock *sk)
1860 {
1861         struct inet_connection_sock *icsk = inet_csk(sk);
1862
1863         tcp_init_sock(sk);
1864
1865         icsk->icsk_af_ops = &ipv4_specific;
1866
1867 #ifdef CONFIG_TCP_MD5SIG
1868         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1869 #endif
1870
1871         return 0;
1872 }
1873
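/* Per-socket teardown: stop timers, purge the write/out-of-order/prequeue
 * queues, drop MD5 keys and the bound port, and release any fastopen or
 * saved-SYN state before generic socket destruction proceeds.
 */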
1874 void tcp_v4_destroy_sock(struct sock *sk)
1875 {
1876         struct tcp_sock *tp = tcp_sk(sk);
1877
1878         tcp_clear_xmit_timers(sk);
1879
1880         tcp_cleanup_congestion_control(sk);
1881
1882         /* Clean up the write buffer. */
1883         tcp_write_queue_purge(sk);
1884
1885         /* Cleans up our, hopefully empty, out_of_order_queue. */
1886         skb_rbtree_purge(&tp->out_of_order_queue);
1887
1888 #ifdef CONFIG_TCP_MD5SIG
1889         /* Clean up the MD5 key list, if any */
1890         if (tp->md5sig_info) {
1891                 tcp_clear_md5_list(sk);
1892                 kfree_rcu(tp->md5sig_info, rcu);
1893                 tp->md5sig_info = NULL;
1894         }
1895 #endif
1896
1897         /* Clean up the prequeue; it should already be empty. */
1898         __skb_queue_purge(&tp->ucopy.prequeue);
1899
1900         /* Clean up a referenced TCP bind bucket. */
1901         if (inet_csk(sk)->icsk_bind_hash)
1902                 inet_put_port(sk);
1903
1904         BUG_ON(tp->fastopen_rsk);
1905
1906         /* If socket is aborted during connect operation */
1907         tcp_free_fastopen_req(tp);
1908         tcp_saved_syn_free(tp);
1909
1910         local_bh_disable();
1911         sk_sockets_allocated_dec(sk);
1912         local_bh_enable();
1913 }
1914 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1915
1916 #ifdef CONFIG_PROC_FS
1917 /* Proc filesystem TCP sock list dumping. */
1918
1919 /*
1920  * Get the next listener socket following cur.  If cur is NULL, get the first socket
1921  * starting from bucket given in st->bucket; when st->bucket is zero the
1922  * very first socket in the hash table is returned.
1923  */
1924 static void *listening_get_next(struct seq_file *seq, void *cur)
1925 {
1926         struct tcp_iter_state *st = seq->private;
1927         struct net *net = seq_file_net(seq);
1928         struct inet_listen_hashbucket *ilb;
1929         struct hlist_nulls_node *node;
1930         struct sock *sk = cur;
1931
1932         if (!sk) {
1933 get_head:
1934                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1935                 spin_lock_bh(&ilb->lock);
1936                 sk = sk_nulls_head(&ilb->nulls_head);
1937                 st->offset = 0;
1938                 goto get_sk;
1939         }
1940         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1941         ++st->num;
1942         ++st->offset;
1943
1944         sk = sk_nulls_next(sk);
1945 get_sk:
1946         sk_nulls_for_each_from(sk, node) {
1947                 if (!net_eq(sock_net(sk), net))
1948                         continue;
1949                 if (sk->sk_family == st->family)
1950                         return sk;
1951         }
1952         spin_unlock_bh(&ilb->lock);
1953         st->offset = 0;
1954         if (++st->bucket < INET_LHTABLE_SIZE)
1955                 goto get_head;
1956         return NULL;
1957 }
1958
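/* Position the listener walk at entry *pos by stepping forward from the
 * first socket; used when the iterator cannot resume from a cached bucket.
 */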
1959 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1960 {
1961         struct tcp_iter_state *st = seq->private;
1962         void *rc;
1963
1964         st->bucket = 0;
1965         st->offset = 0;
1966         rc = listening_get_next(seq, NULL);
1967
1968         while (rc && *pos) {
1969                 rc = listening_get_next(seq, rc);
1970                 --*pos;
1971         }
1972         return rc;
1973 }
1974
1975 static inline bool empty_bucket(const struct tcp_iter_state *st)
1976 {
1977         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1978 }
1979
1980 /*
1981  * Get first established socket starting from bucket given in st->bucket.
1982  * If st->bucket is zero, the very first socket in the hash is returned.
1983  */
1984 static void *established_get_first(struct seq_file *seq)
1985 {
1986         struct tcp_iter_state *st = seq->private;
1987         struct net *net = seq_file_net(seq);
1988         void *rc = NULL;
1989
1990         st->offset = 0;
1991         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1992                 struct sock *sk;
1993                 struct hlist_nulls_node *node;
1994                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1995
1996                 /* Lockless fast path for the common case of empty buckets */
1997                 if (empty_bucket(st))
1998                         continue;
1999
2000                 spin_lock_bh(lock);
2001                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2002                         if (sk->sk_family != st->family ||
2003                             !net_eq(sock_net(sk), net)) {
2004                                 continue;
2005                         }
2006                         rc = sk;
2007                         goto out;
2008                 }
2009                 spin_unlock_bh(lock);
2010         }
2011 out:
2012         return rc;
2013 }
2014
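/* Advance within the current ehash chain; once the chain is exhausted the
 * bucket lock is released and the scan continues from the next bucket via
 * established_get_first().
 */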
2015 static void *established_get_next(struct seq_file *seq, void *cur)
2016 {
2017         struct sock *sk = cur;
2018         struct hlist_nulls_node *node;
2019         struct tcp_iter_state *st = seq->private;
2020         struct net *net = seq_file_net(seq);
2021
2022         ++st->num;
2023         ++st->offset;
2024
2025         sk = sk_nulls_next(sk);
2026
2027         sk_nulls_for_each_from(sk, node) {
2028                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2029                         return sk;
2030         }
2031
2032         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2033         ++st->bucket;
2034         return established_get_first(seq);
2035 }
2036
2037 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2038 {
2039         struct tcp_iter_state *st = seq->private;
2040         void *rc;
2041
2042         st->bucket = 0;
2043         rc = established_get_first(seq);
2044
2045         while (rc && pos) {
2046                 rc = established_get_next(seq, rc);
2047                 --pos;
2048         }
2049         return rc;
2050 }
2051
2052 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2053 {
2054         void *rc;
2055         struct tcp_iter_state *st = seq->private;
2056
2057         st->state = TCP_SEQ_STATE_LISTENING;
2058         rc        = listening_get_idx(seq, &pos);
2059
2060         if (!rc) {
2061                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2062                 rc        = established_get_idx(seq, pos);
2063         }
2064
2065         return rc;
2066 }
2067
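/* Try to resume the dump at the bucket and offset remembered from the
 * previous read, so a large /proc/net/tcp read does not rescan the whole
 * hash on every chunk; returns NULL (forcing a full re-walk) if the
 * remembered position can no longer be reached.
 */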
2068 static void *tcp_seek_last_pos(struct seq_file *seq)
2069 {
2070         struct tcp_iter_state *st = seq->private;
2071         int bucket = st->bucket;
2072         int offset = st->offset;
2073         int orig_num = st->num;
2074         void *rc = NULL;
2075
2076         switch (st->state) {
2077         case TCP_SEQ_STATE_LISTENING:
2078                 if (st->bucket >= INET_LHTABLE_SIZE)
2079                         break;
2080                 st->state = TCP_SEQ_STATE_LISTENING;
2081                 rc = listening_get_next(seq, NULL);
2082                 while (offset-- && rc && bucket == st->bucket)
2083                         rc = listening_get_next(seq, rc);
2084                 if (rc)
2085                         break;
2086                 st->bucket = 0;
2087                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2088                 /* Fallthrough */
2089         case TCP_SEQ_STATE_ESTABLISHED:
2090                 if (st->bucket > tcp_hashinfo.ehash_mask)
2091                         break;
2092                 rc = established_get_first(seq);
2093                 while (offset-- && rc && bucket == st->bucket)
2094                         rc = established_get_next(seq, rc);
2095         }
2096
2097         st->num = orig_num;
2098
2099         return rc;
2100 }
2101
2102 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2103 {
2104         struct tcp_iter_state *st = seq->private;
2105         void *rc;
2106
2107         if (*pos && *pos == st->last_pos) {
2108                 rc = tcp_seek_last_pos(seq);
2109                 if (rc)
2110                         goto out;
2111         }
2112
2113         st->state = TCP_SEQ_STATE_LISTENING;
2114         st->num = 0;
2115         st->bucket = 0;
2116         st->offset = 0;
2117         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2118
2119 out:
2120         st->last_pos = *pos;
2121         return rc;
2122 }
2123
2124 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2125 {
2126         struct tcp_iter_state *st = seq->private;
2127         void *rc = NULL;
2128
2129         if (v == SEQ_START_TOKEN) {
2130                 rc = tcp_get_idx(seq, 0);
2131                 goto out;
2132         }
2133
2134         switch (st->state) {
2135         case TCP_SEQ_STATE_LISTENING:
2136                 rc = listening_get_next(seq, v);
2137                 if (!rc) {
2138                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2139                         st->bucket = 0;
2140                         st->offset = 0;
2141                         rc        = established_get_first(seq);
2142                 }
2143                 break;
2144         case TCP_SEQ_STATE_ESTABLISHED:
2145                 rc = established_get_next(seq, v);
2146                 break;
2147         }
2148 out:
2149         ++*pos;
2150         st->last_pos = *pos;
2151         return rc;
2152 }
2153
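/* Drop whichever bucket lock the listening or established walk left held
 * for the entry currently being shown.
 */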
2154 static void tcp_seq_stop(struct seq_file *seq, void *v)
2155 {
2156         struct tcp_iter_state *st = seq->private;
2157
2158         switch (st->state) {
2159         case TCP_SEQ_STATE_LISTENING:
2160                 if (v != SEQ_START_TOKEN)
2161                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2162                 break;
2163         case TCP_SEQ_STATE_ESTABLISHED:
2164                 if (v)
2165                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2166                 break;
2167         }
2168 }
2169
2170 int tcp_seq_open(struct inode *inode, struct file *file)
2171 {
2172         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2173         struct tcp_iter_state *s;
2174         int err;
2175
2176         err = seq_open_net(inode, file, &afinfo->seq_ops,
2177                           sizeof(struct tcp_iter_state));
2178         if (err < 0)
2179                 return err;
2180
2181         s = ((struct seq_file *)file->private_data)->private;
2182         s->family               = afinfo->family;
2183         s->last_pos             = 0;
2184         return 0;
2185 }
2186 EXPORT_SYMBOL(tcp_seq_open);
2187
2188 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2189 {
2190         int rc = 0;
2191         struct proc_dir_entry *p;
2192
2193         afinfo->seq_ops.start           = tcp_seq_start;
2194         afinfo->seq_ops.next            = tcp_seq_next;
2195         afinfo->seq_ops.stop            = tcp_seq_stop;
2196
2197         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2198                              afinfo->seq_fops, afinfo);
2199         if (!p)
2200                 rc = -ENOMEM;
2201         return rc;
2202 }
2203 EXPORT_SYMBOL(tcp_proc_register);
2204
2205 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2206 {
2207         remove_proc_entry(afinfo->name, net->proc_net);
2208 }
2209 EXPORT_SYMBOL(tcp_proc_unregister);
2210
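/* Format one SYN_RECV request socket as a row of /proc/net/tcp; several
 * columns are fixed because a request_sock has no inode or option state.
 */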
2211 static void get_openreq4(const struct request_sock *req,
2212                          struct seq_file *f, int i)
2213 {
2214         const struct inet_request_sock *ireq = inet_rsk(req);
2215         long delta = req->rsk_timer.expires - jiffies;
2216
2217         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2218                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2219                 i,
2220                 ireq->ir_loc_addr,
2221                 ireq->ir_num,
2222                 ireq->ir_rmt_addr,
2223                 ntohs(ireq->ir_rmt_port),
2224                 TCP_SYN_RECV,
2225                 0, 0, /* could print option size, but that is af dependent. */
2226                 1,    /* timers active (only the expire timer) */
2227                 jiffies_delta_to_clock_t(delta),
2228                 req->num_timeout,
2229                 from_kuid_munged(seq_user_ns(f),
2230                                  sock_i_uid(req->rsk_listener)),
2231                 0,  /* non standard timer */
2232                 0, /* open_requests have no inode */
2233                 0,
2234                 req);
2235 }
2236
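/* Format one full socket.  The "tr" column encodes which timer is pending:
 * 1 retransmit/loss probe, 2 keepalive, 4 zero-window probe, 0 none; the
 * following field is its expiry expressed in clock ticks.
 */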
2237 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2238 {
2239         int timer_active;
2240         unsigned long timer_expires;
2241         const struct tcp_sock *tp = tcp_sk(sk);
2242         const struct inet_connection_sock *icsk = inet_csk(sk);
2243         const struct inet_sock *inet = inet_sk(sk);
2244         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2245         __be32 dest = inet->inet_daddr;
2246         __be32 src = inet->inet_rcv_saddr;
2247         __u16 destp = ntohs(inet->inet_dport);
2248         __u16 srcp = ntohs(inet->inet_sport);
2249         int rx_queue;
2250         int state;
2251
2252         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2253             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2254             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2255                 timer_active    = 1;
2256                 timer_expires   = icsk->icsk_timeout;
2257         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2258                 timer_active    = 4;
2259                 timer_expires   = icsk->icsk_timeout;
2260         } else if (timer_pending(&sk->sk_timer)) {
2261                 timer_active    = 2;
2262                 timer_expires   = sk->sk_timer.expires;
2263         } else {
2264                 timer_active    = 0;
2265                 timer_expires = jiffies;
2266         }
2267
2268         state = sk_state_load(sk);
2269         if (state == TCP_LISTEN)
2270                 rx_queue = sk->sk_ack_backlog;
2271         else
2272                 /* Because we don't lock the socket,
2273                  * we might find a transient negative value.
2274                  */
2275                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2276
2277         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2278                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2279                 i, src, srcp, dest, destp, state,
2280                 tp->write_seq - tp->snd_una,
2281                 rx_queue,
2282                 timer_active,
2283                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2284                 icsk->icsk_retransmits,
2285                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2286                 icsk->icsk_probes_out,
2287                 sock_i_ino(sk),
2288                 atomic_read(&sk->sk_refcnt), sk,
2289                 jiffies_to_clock_t(icsk->icsk_rto),
2290                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2291                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2292                 tp->snd_cwnd,
2293                 state == TCP_LISTEN ?
2294                     fastopenq->max_qlen :
2295                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2296 }
2297
2298 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2299                                struct seq_file *f, int i)
2300 {
2301         long delta = tw->tw_timer.expires - jiffies;
2302         __be32 dest, src;
2303         __u16 destp, srcp;
2304
2305         dest  = tw->tw_daddr;
2306         src   = tw->tw_rcv_saddr;
2307         destp = ntohs(tw->tw_dport);
2308         srcp  = ntohs(tw->tw_sport);
2309
2310         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2311                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2312                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2313                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2314                 atomic_read(&tw->tw_refcnt), tw);
2315 }
2316
2317 #define TMPSZ 150
2318
2319 static int tcp4_seq_show(struct seq_file *seq, void *v)
2320 {
2321         struct tcp_iter_state *st;
2322         struct sock *sk = v;
2323
2324         seq_setwidth(seq, TMPSZ - 1);
2325         if (v == SEQ_START_TOKEN) {
2326                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2327                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2328                            "inode");
2329                 goto out;
2330         }
2331         st = seq->private;
2332
2333         if (sk->sk_state == TCP_TIME_WAIT)
2334                 get_timewait4_sock(v, seq, st->num);
2335         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2336                 get_openreq4(v, seq, st->num);
2337         else
2338                 get_tcp4_sock(v, seq, st->num);
2339 out:
2340         seq_pad(seq, '\n');
2341         return 0;
2342 }
2343
2344 static const struct file_operations tcp_afinfo_seq_fops = {
2345         .owner   = THIS_MODULE,
2346         .open    = tcp_seq_open,
2347         .read    = seq_read,
2348         .llseek  = seq_lseek,
2349         .release = seq_release_net
2350 };
2351
2352 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2353         .name           = "tcp",
2354         .family         = AF_INET,
2355         .seq_fops       = &tcp_afinfo_seq_fops,
2356         .seq_ops        = {
2357                 .show           = tcp4_seq_show,
2358         },
2359 };
2360
2361 static int __net_init tcp4_proc_init_net(struct net *net)
2362 {
2363         return tcp_proc_register(net, &tcp4_seq_afinfo);
2364 }
2365
2366 static void __net_exit tcp4_proc_exit_net(struct net *net)
2367 {
2368         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2369 }
2370
2371 static struct pernet_operations tcp4_net_ops = {
2372         .init = tcp4_proc_init_net,
2373         .exit = tcp4_proc_exit_net,
2374 };
2375
2376 int __init tcp4_proc_init(void)
2377 {
2378         return register_pernet_subsys(&tcp4_net_ops);
2379 }
2380
2381 void tcp4_proc_exit(void)
2382 {
2383         unregister_pernet_subsys(&tcp4_net_ops);
2384 }
2385 #endif /* CONFIG_PROC_FS */
2386
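/* The protocol method table for IPv4 TCP sockets, hooked up to the socket
 * layer when the inet family registers its SOCK_STREAM entry (af_inet.c).
 */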
2387 struct proto tcp_prot = {
2388         .name                   = "TCP",
2389         .owner                  = THIS_MODULE,
2390         .close                  = tcp_close,
2391         .connect                = tcp_v4_connect,
2392         .disconnect             = tcp_disconnect,
2393         .accept                 = inet_csk_accept,
2394         .ioctl                  = tcp_ioctl,
2395         .init                   = tcp_v4_init_sock,
2396         .destroy                = tcp_v4_destroy_sock,
2397         .shutdown               = tcp_shutdown,
2398         .setsockopt             = tcp_setsockopt,
2399         .getsockopt             = tcp_getsockopt,
2400         .recvmsg                = tcp_recvmsg,
2401         .sendmsg                = tcp_sendmsg,
2402         .sendpage               = tcp_sendpage,
2403         .backlog_rcv            = tcp_v4_do_rcv,
2404         .release_cb             = tcp_release_cb,
2405         .hash                   = inet_hash,
2406         .unhash                 = inet_unhash,
2407         .get_port               = inet_csk_get_port,
2408         .enter_memory_pressure  = tcp_enter_memory_pressure,
2409         .stream_memory_free     = tcp_stream_memory_free,
2410         .sockets_allocated      = &tcp_sockets_allocated,
2411         .orphan_count           = &tcp_orphan_count,
2412         .memory_allocated       = &tcp_memory_allocated,
2413         .memory_pressure        = &tcp_memory_pressure,
2414         .sysctl_mem             = sysctl_tcp_mem,
2415         .sysctl_wmem            = sysctl_tcp_wmem,
2416         .sysctl_rmem            = sysctl_tcp_rmem,
2417         .max_header             = MAX_TCP_HEADER,
2418         .obj_size               = sizeof(struct tcp_sock),
2419         .slab_flags             = SLAB_DESTROY_BY_RCU,
2420         .twsk_prot              = &tcp_timewait_sock_ops,
2421         .rsk_prot               = &tcp_request_sock_ops,
2422         .h.hashinfo             = &tcp_hashinfo,
2423         .no_autobind            = true,
2424 #ifdef CONFIG_COMPAT
2425         .compat_setsockopt      = compat_tcp_setsockopt,
2426         .compat_getsockopt      = compat_tcp_getsockopt,
2427 #endif
2428         .diag_destroy           = tcp_abort,
2429 };
2430 EXPORT_SYMBOL(tcp_prot);
2431
2432 static void __net_exit tcp_sk_exit(struct net *net)
2433 {
2434         int cpu;
2435
2436         for_each_possible_cpu(cpu)
2437                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2438         free_percpu(net->ipv4.tcp_sk);
2439 }
2440
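/* Per-namespace setup: create one control socket per possible CPU (used for
 * sending RSTs and ACKs on behalf of no socket) and seed the per-netns TCP
 * sysctls, which surface as the net.ipv4.tcp_* entries for this namespace.
 */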
2441 static int __net_init tcp_sk_init(struct net *net)
2442 {
2443         int res, cpu;
2444
2445         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2446         if (!net->ipv4.tcp_sk)
2447                 return -ENOMEM;
2448
2449         for_each_possible_cpu(cpu) {
2450                 struct sock *sk;
2451
2452                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2453                                            IPPROTO_TCP, net);
2454                 if (res)
2455                         goto fail;
2456                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2457
2458                 /* Please enforce IP_DF and IPID==0 for RST and
2459                  * ACK sent in SYN-RECV and TIME-WAIT state.
2460                  */
2461                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2462
2463                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2464         }
2465
2466         net->ipv4.sysctl_tcp_ecn = 2;
2467         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2468
2469         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2470         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2471         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2472         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2473
2474         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2475         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2476         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2477
2478         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2479         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2480         net->ipv4.sysctl_tcp_syncookies = 1;
2481         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2482         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2483         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2484         net->ipv4.sysctl_tcp_orphan_retries = 0;
2485         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2486         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2487
2488         return 0;
2489 fail:
2490         tcp_sk_exit(net);
2491
2492         return res;
2493 }
2494
2495 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2496 {
2497         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2498 }
2499
2500 static struct pernet_operations __net_initdata tcp_sk_ops = {
2501        .init       = tcp_sk_init,
2502        .exit       = tcp_sk_exit,
2503        .exit_batch = tcp_sk_exit_batch,
2504 };
2505
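/* Called once at boot from inet_init(): initialize the global hash tables
 * and register the per-namespace init/exit hooks above.
 */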
2506 void __init tcp_v4_init(void)
2507 {
2508         inet_hashinfo_init(&tcp_hashinfo);
2509         if (register_pernet_subsys(&tcp_sk_ops))
2510                 panic("Failed to create the TCP control socket.\n");
2511 }