/* net/ipv4/tcp_ipv4.c - GNU Linux-libre 4.14.266-gnu1 */
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 *                                      coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to
 *                                      bind a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

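/* Initial sequence number and timestamp-offset generation. As implemented
 * by secure_tcp_seq() (a keyed hash of the connection 4-tuple plus a clock
 * component, in the spirit of RFC 6528), ISNs are unpredictable to off-path
 * attackers while remaining monotonic per 4-tuple.
 */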
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
        return secure_tcp_seq(ip_hdr(skb)->daddr,
                              ip_hdr(skb)->saddr,
                              tcp_hdr(skb)->dest,
                              tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
        return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);

        /* With PAWS, it is safe from the viewpoint
           of data integrity. Even without PAWS it is safe provided sequence
           spaces do not overlap i.e. at data rates <= 80Mbit/sec.

           Actually, the idea is close to VJ's one, only timestamp cache is
           held not per host, but per port pair and TW bucket is used as state
           holder.

           If TW bucket has been already destroyed we fall back to VJ's scheme
           and use initial timestamp retrieved from peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
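                /* Start past the largest possible unscaled window (65535)
                 * so segments from the old incarnation cannot be mistaken
                 * for new data.
                 */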
                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                if (tp->write_seq == 0)
                        tp->write_seq = 1;
                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                sock_hold(sktw);
                return 1;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        __be16 orig_sport, orig_dport;
        __be32 daddr, nexthop;
        struct flowi4 *fl4;
        struct rtable *rt;
        int err;
        struct ip_options_rcu *inet_opt;
        struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        inet_opt = rcu_dereference_protected(inet->inet_opt,
                                             lockdep_sock_is_held(sk));
        if (inet_opt && inet_opt->opt.srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet_opt->opt.faddr;
        }

        orig_sport = inet->inet_sport;
        orig_dport = usin->sin_port;
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                              IPPROTO_TCP,
                              orig_sport, orig_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                if (err == -ENETUNREACH)
                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
                return err;
        }

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet_opt || !inet_opt->opt.srr)
                daddr = fl4->daddr;

        if (!inet->inet_saddr)
                inet->inet_saddr = fl4->saddr;
        sk_rcv_saddr_set(sk, inet->inet_saddr);

        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                if (likely(!tp->repair))
                        tp->write_seq      = 0;
        }

        inet->inet_dport = usin->sin_port;
        sk_daddr_set(sk, daddr);

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and, without releasing the
         * socket lock, select a source port, enter ourselves into the
         * hash tables and complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(tcp_death_row, sk);
        if (err)
                goto failure;

        sk_set_txhash(sk);

        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                               inet->inet_sport, inet->inet_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                rt = NULL;
                goto failure;
        }
        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->dst);
        rt = NULL;

        if (likely(!tp->repair)) {
                if (!tp->write_seq)
                        tp->write_seq = secure_tcp_seq(inet->inet_saddr,
                                                       inet->inet_daddr,
                                                       inet->inet_sport,
                                                       usin->sin_port);
                tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
                                                 inet->inet_saddr,
                                                 inet->inet_daddr);
        }

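        /* Randomise the starting IP ID so IDs do not continue a
         * predictable sequence from a previous connection.
         */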
        inet->inet_id = prandom_u32();

        if (tcp_fastopen_defer_connect(sk, &err))
                return err;
        if (err)
                goto failure;

        err = tcp_connect(sk);

        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
        return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
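
/* Illustrative userspace sketch (not part of this file): a connect() on an
 * AF_INET stream socket reaches tcp_v4_connect() through
 * inet_stream_connect(). The address below is a documentation value.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */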

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct dst_entry *dst;
        u32 mtu;

        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
                return;
        mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
        dst = inet_csk_update_pmtu(sk, mtu);
        if (!dst)
                return;

        /* Something is about to go wrong... Remember the soft error
         * in case this connection is not able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            ip_sk_accept_pmtu(sk) &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_check(sk, 0);

        if (dst)
                dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
        struct request_sock *req = inet_reqsk(sk);
        struct net *net = sock_net(sk);

        /* ICMPs are not backlogged, hence we cannot get
         * an established socket here.
         */
        if (seq != tcp_rsk(req)->snt_isn) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
        } else if (abort) {
                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
                tcp_listendrop(req->rsk_listener);
        }
        reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
        struct inet_connection_sock *icsk;
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(icmp_skb)->type;
        const int code = icmp_hdr(icmp_skb)->code;
        struct sock *sk;
        struct sk_buff *skb;
        struct request_sock *fastopen;
        u32 seq, snd_una;
        s32 remaining;
        u32 delta_us;
        int err;
        struct net *net = dev_net(icmp_skb->dev);

        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
                                       th->dest, iph->saddr, ntohs(th->source),
                                       inet_iif(icmp_skb), 0);
        if (!sk) {
                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }
        seq = ntohl(th->seq);
        if (sk->sk_state == TCP_NEW_SYN_RECV)
                return tcp_req_err(sk, seq,
                                  type == ICMP_PARAMETERPROB ||
                                  type == ICMP_TIME_EXCEEDED ||
                                  (type == ICMP_DEST_UNREACH &&
                                   (code == ICMP_NET_UNREACH ||
                                    code == ICMP_HOST_UNREACH)));

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         * We do take care of PMTU discovery (RFC1191) special case :
         * we can receive locally generated ICMP messages while socket is held.
         */
        if (sock_owned_by_user(sk)) {
                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
        }
        if (sk->sk_state == TCP_CLOSE)
                goto out;

        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
                goto out;
        }

        icsk = inet_csk(sk);
        tp = tcp_sk(sk);
        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
        fastopen = tp->fastopen_rsk;
        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, snd_una, tp->snd_nxt)) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_REDIRECT:
                if (!sock_owned_by_user(sk))
                        do_redirect(icmp_skb, sk);
                goto out;
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        /* We are not interested in TCP_LISTEN and open_requests
                         * (SYN-ACKs sent out by Linux are always <576 bytes so
                         * they should go through unfragmented).
                         */
                        if (sk->sk_state == TCP_LISTEN)
                                goto out;

                        WRITE_ONCE(tp->mtu_info, info);
                        if (!sock_owned_by_user(sk)) {
                                tcp_v4_mtu_reduced(sk);
                        } else {
                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
                                        sock_hold(sk);
                        }
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                /* check if icmp_skb allows revert of backoff
                 * (see draft-zimmermann-tcp-lcd)
                 */
                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
                        break;
                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
                    !icsk->icsk_backoff || fastopen)
                        break;

                if (sock_owned_by_user(sk))
                        break;

                skb = tcp_write_queue_head(sk);
                if (WARN_ON_ONCE(!skb))
                        break;

                icsk->icsk_backoff--;
                icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
                                               TCP_TIMEOUT_INIT;
                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

                tcp_mstamp_refresh(tp);
                delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
                remaining = icsk->icsk_rto -
                            usecs_to_jiffies(delta_us);

                if (remaining > 0) {
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  remaining, TCP_RTO_MAX);
                } else {
                        /* RTO revert clocked out retransmission.
                         * Will retransmit now.
                         */
                        tcp_retransmit_timer(sk);
                }

                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
        case TCP_SYN_SENT:
        case TCP_SYN_RECV:
                /* Only in fast or simultaneous open. If a fast open socket
                 * is already accepted it is treated as a connected one below.
                 */
                if (fastopen && !fastopen->sk)
                        break;

                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note, that in modern internet, where routing is unreliable
         * and in each dark corner broken firewalls sit, sending random
         * errors ordered by their masters even these two messages finally lose
         * their original sense (even Linux sends invalid PORT_UNREACHs)
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else  { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}

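/* Compute th->check for an outgoing segment. With CHECKSUM_PARTIAL only
 * the pseudo-header sum is stored here, and csum_start/csum_offset tell
 * the device (or the software fallback) where to finish the one's
 * complement sum; otherwise the full checksum is computed in software.
 */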
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct tcphdr *th = tcp_hdr(skb);

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
                skb->csum_start = skb_transport_header(skb) - skb->head;
                skb->csum_offset = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(skb->len, saddr, daddr,
                                         csum_partial(th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
        const struct inet_sock *inet = inet_sk(sk);

        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *                    for reset.
 *      Answer: if a packet caused RST, it is not for a socket
 *              existing in our system, if it is matched to a socket,
 *              it is just duplicate segment or bug in other side's TCP.
 *              So we build the reply based only on parameters
 *              arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key = NULL;
        const __u8 *hash_location = NULL;
        unsigned char newhash[16];
        int genhash;
        struct sock *sk1 = NULL;
#endif
        struct net *net;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        /* If sk not NULL, it means we did a successful lookup and incoming
         * route had to be correct. prequeue might have dropped our dst.
         */
        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
        rcu_read_lock();
        hash_location = tcp_parse_md5sig_option(th);
        if (sk && sk_fullsock(sk)) {
                key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
        } else if (hash_location) {
                /*
                 * The active side is lost. Try to find the listening socket
                 * through the source port, and then find the md5 key through
                 * the listening socket. We do not lose security here:
                 * the incoming packet is checked against the md5 hash of the
                 * found key, and no RST is generated if the hash doesn't match.
                 */
                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
                                             ip_hdr(skb)->saddr,
                                             th->source, ip_hdr(skb)->daddr,
                                             ntohs(th->source), inet_iif(skb),
                                             tcp_v4_sdif(skb));
                /* don't send rst if it can't find key */
                if (!sk1)
                        goto out;

                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
                if (!key)
                        goto out;

                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
                if (genhash || memcmp(hash_location, newhash, 16) != 0)
                        goto out;

        }

        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
                                     key, ip_hdr(skb)->saddr,
                                     ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

        /* When socket is gone, all binding information is lost.
         * routing might fail in this case. No choice here, if we choose to force
         * input interface, we will misroute in case of asymmetric route.
         */
        if (sk)
                arg.bound_dev_if = sk->sk_bound_dev_if;

        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

        arg.tos = ip_hdr(skb)->tos;
        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);

        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
        local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
        rcu_read_unlock();
#endif
}

/* The code below, sending ACKs in SYN-RECV and TIME-WAIT states outside
 * socket context, is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags, u8 tos)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct net *net = sock_net(sk);
        struct ip_reply_arg arg;

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (tsecr) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tsval);
                rep.opt[2] = htonl(tsecr);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        if (key) {
                int offset = (tsecr) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len/4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
                                    key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.flags = reply_flags;
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (oif)
                arg.bound_dev_if = oif;
        arg.tos = tos;
        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);

        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(sk, skb,
                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcp_time_stamp_raw() + tcptw->tw_ts_offset,
                        tcptw->tw_ts_recent,
                        tw->tw_bound_dev_if,
                        tcp_twsk_md5_key(tcptw),
                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        tw->tw_tos
                        );

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req)
{
        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
         */
        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
                                             tcp_sk(sk)->snd_nxt;

        /* RFC 7323 2.3
         * The window field (SEG.WND) of every outgoing segment, with the
         * exception of <SYN> segments, MUST be right-shifted by
         * Rcv.Wind.Shift bits:
         */
        tcp_v4_send_ack(sk, skb, seq,
                        tcp_rsk(req)->rcv_nxt,
                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
                        tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
                        req->ts_recent,
                        0,
                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
                                          AF_INET),
                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        ip_hdr(skb)->tos);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
                              struct flowi *fl,
                              struct request_sock *req,
                              struct tcp_fastopen_cookie *foc,
                              enum tcp_synack_type synack_type)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct flowi4 fl4;
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
                return -1;

        skb = tcp_make_synack(sk, dst, req, foc, synack_type);

        if (skb) {
                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

                rcu_read_lock();
                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
                                            ireq->ir_rmt_addr,
                                            rcu_dereference(ireq->ireq_opt));
                rcu_read_unlock();
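                /* Local congestion (NET_XMIT_CN) is folded to success by
                 * net_xmit_eval(); only real transmit errors propagate.
                 */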
                err = net_xmit_eval(err);
        }

        return err;
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
                                         const union tcp_md5_addr *addr,
                                         int family)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        const struct tcp_md5sig_info *md5sig;
        __be32 mask;
        struct tcp_md5sig_key *best_match = NULL;
        bool match;

        /* caller either holds rcu_read_lock() or socket lock */
        md5sig = rcu_dereference_check(tp->md5sig_info,
                                       lockdep_sock_is_held(sk));
        if (!md5sig)
                return NULL;

        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
                if (key->family != family)
                        continue;

                if (family == AF_INET) {
                        mask = inet_make_mask(key->prefixlen);
                        match = (key->addr.a4.s_addr & mask) ==
                                (addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
                } else if (family == AF_INET6) {
                        match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
                                                  key->prefixlen);
#endif
                } else {
                        match = false;
                }

                if (match && (!best_match ||
                              key->prefixlen > best_match->prefixlen))
                        best_match = key;
        }
        return best_match;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

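/* Unlike tcp_md5_do_lookup() above, which returns the best (longest
 * prefix) match, this helper demands an exact address/prefixlen match;
 * it backs key add/del below.
 */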
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
                                                      const union tcp_md5_addr *addr,
                                                      int family, u8 prefixlen)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        unsigned int size = sizeof(struct in_addr);
        const struct tcp_md5sig_info *md5sig;

        /* caller either holds rcu_read_lock() or socket lock */
        md5sig = rcu_dereference_check(tp->md5sig_info,
                                       lockdep_sock_is_held(sk));
        if (!md5sig)
                return NULL;
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                size = sizeof(struct in6_addr);
#endif
        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
                if (key->family != family)
                        continue;
                if (!memcmp(&key->addr, addr, size) &&
                    key->prefixlen == prefixlen)
                        return key;
        }
        return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
                                         const struct sock *addr_sk)
{
        const union tcp_md5_addr *addr;

        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
        return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
                   gfp_t gfp)
{
        /* Add Key to the list */
        struct tcp_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_info *md5sig;

        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
        if (key) {
                /* Pre-existing entry - just update that one.
                 * Note that the key might be used concurrently.
                 */
                memcpy(key->key, newkey, newkeylen);

                /* Pairs with READ_ONCE() in tcp_md5_hash_key().
                 * Also note that a reader could catch new key->keylen value
                 * but old key->key[], this is the reason we use __GFP_ZERO
                 * at sock_kmalloc() time below these lines.
                 */
                WRITE_ONCE(key->keylen, newkeylen);

                return 0;
        }

        md5sig = rcu_dereference_protected(tp->md5sig_info,
                                           lockdep_sock_is_held(sk));
        if (!md5sig) {
                md5sig = kmalloc(sizeof(*md5sig), gfp);
                if (!md5sig)
                        return -ENOMEM;

                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                INIT_HLIST_HEAD(&md5sig->head);
                rcu_assign_pointer(tp->md5sig_info, md5sig);
        }

        key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
        if (!key)
                return -ENOMEM;
        if (!tcp_alloc_md5sig_pool()) {
                sock_kfree_s(sk, key, sizeof(*key));
                return -ENOMEM;
        }

        memcpy(key->key, newkey, newkeylen);
        key->keylen = newkeylen;
        key->family = family;
        key->prefixlen = prefixlen;
        memcpy(&key->addr, addr,
               (family == AF_INET6) ? sizeof(struct in6_addr) :
                                      sizeof(struct in_addr));
        hlist_add_head_rcu(&key->node, &md5sig->head);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
                   u8 prefixlen)
{
        struct tcp_md5sig_key *key;

        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
        if (!key)
                return -ENOENT;
        hlist_del_rcu(&key->node);
        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
        kfree_rcu(key, rcu);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        struct hlist_node *n;
        struct tcp_md5sig_info *md5sig;

        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
                hlist_del_rcu(&key->node);
                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
                kfree_rcu(key, rcu);
        }
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
                                 char __user *optval, int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
        u8 prefixlen = 32;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (optname == TCP_MD5SIG_EXT &&
            cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
                prefixlen = cmd.tcpm_prefixlen;
                if (prefixlen > 32)
                        return -EINVAL;
        }

        if (!cmd.tcpm_keylen)
                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                                      AF_INET, prefixlen);

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                              AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
                              GFP_KERNEL);
}
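
/* Illustrative userspace sketch (not part of this file): keys are
 * installed with the TCP_MD5SIG socket option. The address below is a
 * documentation value.
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */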
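
/* RFC 2385 signs the IPv4 pseudo-header, the TCP header with its checksum
 * field zeroed, the segment data, and finally the key. The helper below
 * feeds the first two pieces to the MD5 transform.
 */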
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
                                   __be32 daddr, __be32 saddr,
                                   const struct tcphdr *th, int nbytes)
{
        struct tcp4_pseudohdr *bp;
        struct scatterlist sg;
        struct tcphdr *_th;

        bp = hp->scratch;
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = IPPROTO_TCP;
        bp->len = cpu_to_be16(nbytes);

        _th = (struct tcphdr *)(bp + 1);
        memcpy(_th, th, sizeof(*th));
        _th->check = 0;

        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
                                sizeof(*bp) + sizeof(*th));
        return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;
        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
                        const struct sock *sk,
                        const struct sk_buff *skb)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;
        const struct tcphdr *th = tcp_hdr(skb);
        __be32 saddr, daddr;

        if (sk) { /* valid for established/request sockets */
                saddr = sk->sk_rcv_saddr;
                daddr = sk->sk_daddr;
        } else {
                const struct iphdr *iph = ip_hdr(skb);
                saddr = iph->saddr;
                daddr = iph->daddr;
        }

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;

        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
                goto clear_hash;
        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif
1215
1216 /* Called with rcu_read_lock() */
1217 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1218                                     const struct sk_buff *skb)
1219 {
1220 #ifdef CONFIG_TCP_MD5SIG
1221         /*
1222          * This gets called for each TCP segment that arrives
1223          * so we want to be efficient.
1224          * We have 3 drop cases:
1225          * o No MD5 hash and one expected.
1226          * o MD5 hash and we're not expecting one.
1227          * o MD5 hash and its wrong.
1228          */
1229         const __u8 *hash_location = NULL;
1230         struct tcp_md5sig_key *hash_expected;
1231         const struct iphdr *iph = ip_hdr(skb);
1232         const struct tcphdr *th = tcp_hdr(skb);
1233         int genhash;
1234         unsigned char newhash[16];
1235
1236         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1237                                           AF_INET);
1238         hash_location = tcp_parse_md5sig_option(th);
1239
1240         /* We've parsed the options - do we have a hash? */
1241         if (!hash_expected && !hash_location)
1242                 return false;
1243
1244         if (hash_expected && !hash_location) {
1245                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1246                 return true;
1247         }
1248
1249         if (!hash_expected && hash_location) {
1250                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1251                 return true;
1252         }
1253
1254         /* Okay, so this is hash_expected and hash_location -
1255          * so we need to calculate the checksum.
1256          */
1257         genhash = tcp_v4_md5_hash_skb(newhash,
1258                                       hash_expected,
1259                                       NULL, skb);
1260
1261         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1262                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1263                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1264                                      &iph->saddr, ntohs(th->source),
1265                                      &iph->daddr, ntohs(th->dest),
1266                                      genhash ? " tcp_v4_calc_md5_hash failed"
1267                                      : "");
1268                 return true;
1269         }
1270         return false;
1271 #endif
1272         return false;
1273 }
1274
1275 static void tcp_v4_init_req(struct request_sock *req,
1276                             const struct sock *sk_listener,
1277                             struct sk_buff *skb)
1278 {
1279         struct inet_request_sock *ireq = inet_rsk(req);
1280         struct net *net = sock_net(sk_listener);
1281
1282         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1283         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1284         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1285 }
1286
1287 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1288                                           struct flowi *fl,
1289                                           const struct request_sock *req)
1290 {
1291         return inet_csk_route_req(sk, &fl->u.ip4, req);
1292 }
1293
1294 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1295         .family         =       PF_INET,
1296         .obj_size       =       sizeof(struct tcp_request_sock),
1297         .rtx_syn_ack    =       tcp_rtx_synack,
1298         .send_ack       =       tcp_v4_reqsk_send_ack,
1299         .destructor     =       tcp_v4_reqsk_destructor,
1300         .send_reset     =       tcp_v4_send_reset,
1301         .syn_ack_timeout =      tcp_syn_ack_timeout,
1302 };
1303
1304 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1305         .mss_clamp      =       TCP_MSS_DEFAULT,
1306 #ifdef CONFIG_TCP_MD5SIG
1307         .req_md5_lookup =       tcp_v4_md5_lookup,
1308         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1309 #endif
1310         .init_req       =       tcp_v4_init_req,
1311 #ifdef CONFIG_SYN_COOKIES
1312         .cookie_init_seq =      cookie_v4_init_sequence,
1313 #endif
1314         .route_req      =       tcp_v4_route_req,
1315         .init_seq       =       tcp_v4_init_seq,
1316         .init_ts_off    =       tcp_v4_init_ts_off,
1317         .send_synack    =       tcp_v4_send_synack,
1318 };
1319
1320 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1321 {
1322         /* Never answer to SYNs send to broadcast or multicast */
1323         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1324                 goto drop;
1325
1326         return tcp_conn_request(&tcp_request_sock_ops,
1327                                 &tcp_request_sock_ipv4_ops, sk, skb);
1328
1329 drop:
1330         tcp_listendrop(sk);
1331         return 0;
1332 }
1333 EXPORT_SYMBOL(tcp_v4_conn_request);
1334
1335
1336 /*
1337  * The three way handshake has completed - we got a valid synack -
1338  * now create the new socket.
1339  */
1340 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1341                                   struct request_sock *req,
1342                                   struct dst_entry *dst,
1343                                   struct request_sock *req_unhash,
1344                                   bool *own_req)
1345 {
1346         struct inet_request_sock *ireq;
1347         struct inet_sock *newinet;
1348         struct tcp_sock *newtp;
1349         struct sock *newsk;
1350 #ifdef CONFIG_TCP_MD5SIG
1351         struct tcp_md5sig_key *key;
1352 #endif
1353         struct ip_options_rcu *inet_opt;
1354
1355         if (sk_acceptq_is_full(sk))
1356                 goto exit_overflow;
1357
1358         newsk = tcp_create_openreq_child(sk, req, skb);
1359         if (!newsk)
1360                 goto exit_nonewsk;
1361
1362         newsk->sk_gso_type = SKB_GSO_TCPV4;
1363         inet_sk_rx_dst_set(newsk, skb);
1364
1365         newtp                 = tcp_sk(newsk);
1366         newinet               = inet_sk(newsk);
1367         ireq                  = inet_rsk(req);
1368         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1369         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1370         newsk->sk_bound_dev_if = ireq->ir_iif;
1371         newinet->inet_saddr   = ireq->ir_loc_addr;
1372         inet_opt              = rcu_dereference(ireq->ireq_opt);
1373         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1374         newinet->mc_index     = inet_iif(skb);
1375         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1376         newinet->rcv_tos      = ip_hdr(skb)->tos;
1377         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1378         if (inet_opt)
1379                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1380         newinet->inet_id = prandom_u32();
1381
1382         if (!dst) {
1383                 dst = inet_csk_route_child_sock(sk, newsk, req);
1384                 if (!dst)
1385                         goto put_and_exit;
1386         } else {
1387                 /* syncookie case : see end of cookie_v4_check() */
1388         }
1389         sk_setup_caps(newsk, dst);
1390
1391         tcp_ca_openreq_child(newsk, dst);
1392
1393         tcp_sync_mss(newsk, dst_mtu(dst));
1394         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1395
1396         tcp_initialize_rcv_mss(newsk);
1397
1398 #ifdef CONFIG_TCP_MD5SIG
1399         /* Copy over the MD5 key from the original socket */
1400         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1401                                 AF_INET);
1402         if (key) {
1403                 /*
1404                  * We're using one, so create a matching key
1405                  * on the newsk structure. If we fail to get
1406                  * memory, then we end up not copying the key
1407                  * across. Shucks.
1408                  */
1409                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1410                                AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1411                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1412         }
1413 #endif
1414
1415         if (__inet_inherit_port(sk, newsk) < 0)
1416                 goto put_and_exit;
1417         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1418         if (likely(*own_req)) {
1419                 tcp_move_syn(newtp, req);
1420                 ireq->ireq_opt = NULL;
1421         } else {
1422                 newinet->inet_opt = NULL;
1423         }
1424         return newsk;
1425
1426 exit_overflow:
1427         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1428 exit_nonewsk:
1429         dst_release(dst);
1430 exit:
1431         tcp_listendrop(sk);
1432         return NULL;
1433 put_and_exit:
1434         newinet->inet_opt = NULL;
1435         inet_csk_prepare_forced_close(newsk);
1436         tcp_done(newsk);
1437         goto exit;
1438 }
1439 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1440
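/*
 * In outline: with syncookies a listener keeps no state for an embryonic
 * connection, so the final ACK of the handshake (a segment without the
 * SYN flag) must carry everything needed in its sequence/ack numbers.
 * cookie_v4_check() tries to reconstruct a socket from it, returning the
 * listener unchanged when no cookie handling is needed, or NULL on
 * failure.  Plain SYNs fall through untouched.
 */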
1441 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1442 {
1443 #ifdef CONFIG_SYN_COOKIES
1444         const struct tcphdr *th = tcp_hdr(skb);
1445
1446         if (!th->syn)
1447                 sk = cookie_v4_check(sk, skb);
1448 #endif
1449         return sk;
1450 }
1451
1452 /* The socket must have its spinlock held when we get
1453  * here, unless it is a TCP_LISTEN socket.
1454  *
1455  * We have a potential double-lock case here, so even when
1456  * doing backlog processing we use the BH locking scheme.
1457  * This is because we cannot sleep with the original spinlock
1458  * held.
1459  */
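/*
 * A minimal sketch of the caller pattern, as used by tcp_v4_rcv() below:
 *
 *      bh_lock_sock_nested(sk);
 *      if (!sock_owned_by_user(sk))
 *              ret = tcp_v4_do_rcv(sk, skb);
 *      else if (tcp_add_backlog(sk, skb))
 *              goto discard_and_relse;
 *      bh_unlock_sock(sk);
 */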
1460 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1461 {
1462         struct sock *rsk;
1463
1464         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1465                 struct dst_entry *dst = sk->sk_rx_dst;
1466
1467                 sock_rps_save_rxhash(sk, skb);
1468                 sk_mark_napi_id(sk, skb);
1469                 if (dst) {
1470                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1471                             !dst->ops->check(dst, 0)) {
1472                                 dst_release(dst);
1473                                 sk->sk_rx_dst = NULL;
1474                         }
1475                 }
1476                 tcp_rcv_established(sk, skb, tcp_hdr(skb));
1477                 return 0;
1478         }
1479
1480         if (tcp_checksum_complete(skb))
1481                 goto csum_err;
1482
1483         if (sk->sk_state == TCP_LISTEN) {
1484                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1485
1486                 if (!nsk)
1487                         goto discard;
1488                 if (nsk != sk) {
1489                         if (tcp_child_process(sk, nsk, skb)) {
1490                                 rsk = nsk;
1491                                 goto reset;
1492                         }
1493                         return 0;
1494                 }
1495         } else
1496                 sock_rps_save_rxhash(sk, skb);
1497
1498         if (tcp_rcv_state_process(sk, skb)) {
1499                 rsk = sk;
1500                 goto reset;
1501         }
1502         return 0;
1503
1504 reset:
1505         tcp_v4_send_reset(rsk, skb);
1506 discard:
1507         kfree_skb(skb);
1508         /* Be careful here. If this function gets more complicated and
1509          * gcc suffers from register pressure on the x86, sk (in %ebx)
1510          * might be destroyed here. The current version compiles correctly,
1511          * but you have been warned.
1512          */
1513         return 0;
1514
1515 csum_err:
1516         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1517         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1518         goto discard;
1519 }
1520 EXPORT_SYMBOL(tcp_v4_do_rcv);
1521
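/*
 * Early demux runs from the IP input path, before routing.  If the
 * segment matches an established socket we attach that socket to the
 * skb and, when the cached rx route is still valid for the incoming
 * interface, its dst as well, so the normal receive path can skip a
 * second lookup.  This is purely an optimization; returning 0 never
 * drops the packet.
 */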
1522 int tcp_v4_early_demux(struct sk_buff *skb)
1523 {
1524         const struct iphdr *iph;
1525         const struct tcphdr *th;
1526         struct sock *sk;
1527
1528         if (skb->pkt_type != PACKET_HOST)
1529                 return 0;
1530
1531         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1532                 return 0;
1533
1534         iph = ip_hdr(skb);
1535         th = tcp_hdr(skb);
1536
1537         if (th->doff < sizeof(struct tcphdr) / 4)
1538                 return 0;
1539
1540         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1541                                        iph->saddr, th->source,
1542                                        iph->daddr, ntohs(th->dest),
1543                                        skb->skb_iif, inet_sdif(skb));
1544         if (sk) {
1545                 skb->sk = sk;
1546                 skb->destructor = sock_edemux;
1547                 if (sk_fullsock(sk)) {
1548                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1549
1550                         if (dst)
1551                                 dst = dst_check(dst, 0);
1552                         if (dst &&
1553                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1554                                 skb_dst_set_noref(skb, dst);
1555                 }
1556         }
1557         return 0;
1558 }
1559
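/*
 * Called when the socket is owned by user context: queue the skb on the
 * backlog instead of processing it.  The limit is roughly
 * sk_rcvbuf + sk_sndbuf plus 64KB of headroom; beyond that the segment
 * is dropped and counted as a TCPBacklogDrop.
 */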
1560 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1561 {
1562         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1563
1564         /* Only the socket owner can try to collapse/prune the rx queues
1565          * to reduce memory overhead, so add a little headroom here.
1566          * Only a few socket backlogs are likely to be non-empty at once.
1567          */
1568         limit += 64*1024;
1569
1570         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1571          * we can fix skb->truesize to its real value to avoid future drops.
1572          * This is valid because skb is not yet charged to the socket.
1573          * It has been noticed that pure SACK packets were sometimes dropped
1574          * (if cooked by drivers without the copybreak feature).
1575          */
1576         skb_condense(skb);
1577
1578         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1579                 bh_unlock_sock(sk);
1580                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1581                 return true;
1582         }
1583         return false;
1584 }
1585 EXPORT_SYMBOL(tcp_add_backlog);
1586
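/*
 * Run the socket's attached filter; sk_filter_trim_cap() may trim the
 * skb but never below the TCP header length (th->doff * 4).
 */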
1587 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1588 {
1589         struct tcphdr *th = (struct tcphdr *)skb->data;
1590
1591         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1592 }
1593 EXPORT_SYMBOL(tcp_filter);
1594
1595 static void tcp_v4_restore_cb(struct sk_buff *skb)
1596 {
1597         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1598                 sizeof(struct inet_skb_parm));
1599 }
1600
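/*
 * Populate TCP_SKB_CB() from the headers.  Note that end_seq counts the
 * payload plus one sequence number each for SYN and FIN, since both
 * flags consume sequence space.
 */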
1601 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1602                            const struct tcphdr *th)
1603 {
1604         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1605          * barrier() makes sure the compiler won't play fool^Waliasing games.
1606          */
1607         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1608                 sizeof(struct inet_skb_parm));
1609         barrier();
1610
1611         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1612         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1613                                     skb->len - th->doff * 4);
1614         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1615         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1616         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1617         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1618         TCP_SKB_CB(skb)->sacked  = 0;
1619         TCP_SKB_CB(skb)->has_rxtstamp =
1620                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1621 }
1622
1623 /*
1624  *      From tcp_input.c
1625  */
1626
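/*
 * Main IPv4 receive path: validate header and checksum, look the segment
 * up in the socket hash tables, then deliver it via tcp_v4_do_rcv(),
 * either directly or through the socket backlog, with dedicated handling
 * for TCP_NEW_SYN_RECV request socks and TIME_WAIT minisocks.
 */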
1627 int tcp_v4_rcv(struct sk_buff *skb)
1628 {
1629         struct net *net = dev_net(skb->dev);
1630         int sdif = inet_sdif(skb);
1631         const struct iphdr *iph;
1632         const struct tcphdr *th;
1633         bool refcounted;
1634         struct sock *sk;
1635         int ret;
1636
1637         if (skb->pkt_type != PACKET_HOST)
1638                 goto discard_it;
1639
1640         /* Count it even if it's bad */
1641         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1642
1643         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1644                 goto discard_it;
1645
1646         th = (const struct tcphdr *)skb->data;
1647
1648         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1649                 goto bad_packet;
1650         if (!pskb_may_pull(skb, th->doff * 4))
1651                 goto discard_it;
1652
1653         /* An explanation is required here, I think.
1654          * Packet length and doff are validated by header prediction,
1655          * provided the case of th->doff == 0 is eliminated.
1656          * So, we defer the checks. */
1657
1658         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1659                 goto csum_error;
1660
1661         th = (const struct tcphdr *)skb->data;
1662         iph = ip_hdr(skb);
1663 lookup:
1664         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1665                                th->dest, sdif, &refcounted);
1666         if (!sk)
1667                 goto no_tcp_socket;
1668
1669 process:
1670         if (sk->sk_state == TCP_TIME_WAIT)
1671                 goto do_time_wait;
1672
1673         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1674                 struct request_sock *req = inet_reqsk(sk);
1675                 struct sock *nsk;
1676
1677                 sk = req->rsk_listener;
1678                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1679                         sk_drops_add(sk, skb);
1680                         reqsk_put(req);
1681                         goto discard_it;
1682                 }
1683                 if (tcp_checksum_complete(skb)) {
1684                         reqsk_put(req);
1685                         goto csum_error;
1686                 }
1687                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1688                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1689                         goto lookup;
1690                 }
1691                 /* We own a reference on the listener; increase it again
1692                  * as we might lose it too soon.
1693                  */
1694                 sock_hold(sk);
1695                 refcounted = true;
1696                 nsk = NULL;
1697                 if (!tcp_filter(sk, skb)) {
1698                         th = (const struct tcphdr *)skb->data;
1699                         iph = ip_hdr(skb);
1700                         tcp_v4_fill_cb(skb, iph, th);
1701                         nsk = tcp_check_req(sk, skb, req, false);
1702                 }
1703                 if (!nsk) {
1704                         reqsk_put(req);
1705                         goto discard_and_relse;
1706                 }
1707                 if (nsk == sk) {
1708                         reqsk_put(req);
1709                         tcp_v4_restore_cb(skb);
1710                 } else if (tcp_child_process(sk, nsk, skb)) {
1711                         tcp_v4_send_reset(nsk, skb);
1712                         goto discard_and_relse;
1713                 } else {
1714                         sock_put(sk);
1715                         return 0;
1716                 }
1717         }
1718         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1719                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1720                 goto discard_and_relse;
1721         }
1722
1723         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1724                 goto discard_and_relse;
1725
1726         if (tcp_v4_inbound_md5_hash(sk, skb))
1727                 goto discard_and_relse;
1728
1729         nf_reset(skb);
1730
1731         if (tcp_filter(sk, skb))
1732                 goto discard_and_relse;
1733         th = (const struct tcphdr *)skb->data;
1734         iph = ip_hdr(skb);
1735         tcp_v4_fill_cb(skb, iph, th);
1736
1737         skb->dev = NULL;
1738
1739         if (sk->sk_state == TCP_LISTEN) {
1740                 ret = tcp_v4_do_rcv(sk, skb);
1741                 goto put_and_return;
1742         }
1743
1744         sk_incoming_cpu_update(sk);
1745
1746         bh_lock_sock_nested(sk);
1747         tcp_segs_in(tcp_sk(sk), skb);
1748         ret = 0;
1749         if (!sock_owned_by_user(sk)) {
1750                 ret = tcp_v4_do_rcv(sk, skb);
1751         } else if (tcp_add_backlog(sk, skb)) {
1752                 goto discard_and_relse;
1753         }
1754         bh_unlock_sock(sk);
1755
1756 put_and_return:
1757         if (refcounted)
1758                 sock_put(sk);
1759
1760         return ret;
1761
1762 no_tcp_socket:
1763         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1764                 goto discard_it;
1765
1766         tcp_v4_fill_cb(skb, iph, th);
1767
1768         if (tcp_checksum_complete(skb)) {
1769 csum_error:
1770                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1771 bad_packet:
1772                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1773         } else {
1774                 tcp_v4_send_reset(NULL, skb);
1775         }
1776
1777 discard_it:
1778         /* Discard frame. */
1779         kfree_skb(skb);
1780         return 0;
1781
1782 discard_and_relse:
1783         sk_drops_add(sk, skb);
1784         if (refcounted)
1785                 sock_put(sk);
1786         goto discard_it;
1787
1788 do_time_wait:
1789         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1790                 inet_twsk_put(inet_twsk(sk));
1791                 goto discard_it;
1792         }
1793
1794         tcp_v4_fill_cb(skb, iph, th);
1795
1796         if (tcp_checksum_complete(skb)) {
1797                 inet_twsk_put(inet_twsk(sk));
1798                 goto csum_error;
1799         }
1800         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1801         case TCP_TW_SYN: {
1802                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1803                                                         &tcp_hashinfo, skb,
1804                                                         __tcp_hdrlen(th),
1805                                                         iph->saddr, th->source,
1806                                                         iph->daddr, th->dest,
1807                                                         inet_iif(skb),
1808                                                         sdif);
1809                 if (sk2) {
1810                         inet_twsk_deschedule_put(inet_twsk(sk));
1811                         sk = sk2;
1812                         tcp_v4_restore_cb(skb);
1813                         refcounted = false;
1814                         goto process;
1815                 }
1816                 /* Fall through to ACK */
1817         }
1818         case TCP_TW_ACK:
1819                 tcp_v4_timewait_ack(sk, skb);
1820                 break;
1821         case TCP_TW_RST:
1822                 tcp_v4_send_reset(sk, skb);
1823                 inet_twsk_deschedule_put(inet_twsk(sk));
1824                 goto discard_it;
1825         case TCP_TW_SUCCESS:;
1826         }
1827         goto discard_it;
1828 }
1829
1830 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1831         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1832         .twsk_unique    = tcp_twsk_unique,
1833         .twsk_destructor= tcp_twsk_destructor,
1834 };
1835
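/*
 * Cache the input route of a validated skb on the socket, together with
 * the interface it arrived on; the established fast path and early demux
 * revalidate this pair before reusing the dst.
 */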
1836 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1837 {
1838         struct dst_entry *dst = skb_dst(skb);
1839
1840         if (dst && dst_hold_safe(dst)) {
1841                 sk->sk_rx_dst = dst;
1842                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1843         }
1844 }
1845 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1846
1847 const struct inet_connection_sock_af_ops ipv4_specific = {
1848         .queue_xmit        = ip_queue_xmit,
1849         .send_check        = tcp_v4_send_check,
1850         .rebuild_header    = inet_sk_rebuild_header,
1851         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1852         .conn_request      = tcp_v4_conn_request,
1853         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1854         .net_header_len    = sizeof(struct iphdr),
1855         .setsockopt        = ip_setsockopt,
1856         .getsockopt        = ip_getsockopt,
1857         .addr2sockaddr     = inet_csk_addr2sockaddr,
1858         .sockaddr_len      = sizeof(struct sockaddr_in),
1859 #ifdef CONFIG_COMPAT
1860         .compat_setsockopt = compat_ip_setsockopt,
1861         .compat_getsockopt = compat_ip_getsockopt,
1862 #endif
1863         .mtu_reduced       = tcp_v4_mtu_reduced,
1864 };
1865 EXPORT_SYMBOL(ipv4_specific);
1866
1867 #ifdef CONFIG_TCP_MD5SIG
1868 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1869         .md5_lookup             = tcp_v4_md5_lookup,
1870         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1871         .md5_parse              = tcp_v4_parse_md5_keys,
1872 };
1873 #endif
1874
1875 /* NOTE: A lot of things are set to zero explicitly by the call to
1876  *       sk_alloc(), so they need not be done here.
1877  */
1878 static int tcp_v4_init_sock(struct sock *sk)
1879 {
1880         struct inet_connection_sock *icsk = inet_csk(sk);
1881
1882         tcp_init_sock(sk);
1883
1884         icsk->icsk_af_ops = &ipv4_specific;
1885
1886 #ifdef CONFIG_TCP_MD5SIG
1887         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1888 #endif
1889
1890         return 0;
1891 }
1892
1893 void tcp_v4_destroy_sock(struct sock *sk)
1894 {
1895         struct tcp_sock *tp = tcp_sk(sk);
1896
1897         tcp_clear_xmit_timers(sk);
1898
1899         tcp_cleanup_congestion_control(sk);
1900
1901         tcp_cleanup_ulp(sk);
1902
1903         /* Clean up the write buffer. */
1904         tcp_write_queue_purge(sk);
1905
1906         /* Check if we want to disable active TFO */
1907         tcp_fastopen_active_disable_ofo_check(sk);
1908
1909         /* Clean up our, hopefully empty, out_of_order_queue. */
1910         skb_rbtree_purge(&tp->out_of_order_queue);
1911
1912 #ifdef CONFIG_TCP_MD5SIG
1913         /* Clean up the MD5 key list, if any */
1914         if (tp->md5sig_info) {
1915                 tcp_clear_md5_list(sk);
1916                 kfree_rcu(tp->md5sig_info, rcu);
1917                 tp->md5sig_info = NULL;
1918         }
1919 #endif
1920
1921         /* Clean up a referenced TCP bind bucket. */
1922         if (inet_csk(sk)->icsk_bind_hash)
1923                 inet_put_port(sk);
1924
1925         BUG_ON(tp->fastopen_rsk);
1926
1927         /* If the socket was aborted during the connect operation */
1928         tcp_free_fastopen_req(tp);
1929         tcp_saved_syn_free(tp);
1930
1931         sk_sockets_allocated_dec(sk);
1932 }
1933 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1934
1935 #ifdef CONFIG_PROC_FS
1936 /* Proc filesystem TCP sock list dumping. */
1937
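/*
 * The dump walks the listening hash first and the established hash
 * second.  st->bucket and st->offset record the current position so a
 * later read() can resume via tcp_seek_last_pos() instead of rescanning
 * from the beginning.
 */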
1938 /*
1939  * Get the next listener socket following cur.  If cur is NULL, get the
1940  * first socket, starting from the bucket given in st->bucket; when
1941  * st->bucket is zero, the very first socket in the hash table is returned.
1942  */
1943 static void *listening_get_next(struct seq_file *seq, void *cur)
1944 {
1945         struct tcp_iter_state *st = seq->private;
1946         struct net *net = seq_file_net(seq);
1947         struct inet_listen_hashbucket *ilb;
1948         struct hlist_nulls_node *node;
1949         struct sock *sk = cur;
1950
1951         if (!sk) {
1952 get_head:
1953                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1954                 spin_lock(&ilb->lock);
1955                 sk = sk_nulls_head(&ilb->nulls_head);
1956                 st->offset = 0;
1957                 goto get_sk;
1958         }
1959         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1960         ++st->num;
1961         ++st->offset;
1962
1963         sk = sk_nulls_next(sk);
1964 get_sk:
1965         sk_nulls_for_each_from(sk, node) {
1966                 if (!net_eq(sock_net(sk), net))
1967                         continue;
1968                 if (sk->sk_family == st->family)
1969                         return sk;
1970         }
1971         spin_unlock(&ilb->lock);
1972         st->offset = 0;
1973         if (++st->bucket < INET_LHTABLE_SIZE)
1974                 goto get_head;
1975         return NULL;
1976 }
1977
1978 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1979 {
1980         struct tcp_iter_state *st = seq->private;
1981         void *rc;
1982
1983         st->bucket = 0;
1984         st->offset = 0;
1985         rc = listening_get_next(seq, NULL);
1986
1987         while (rc && *pos) {
1988                 rc = listening_get_next(seq, rc);
1989                 --*pos;
1990         }
1991         return rc;
1992 }
1993
1994 static inline bool empty_bucket(const struct tcp_iter_state *st)
1995 {
1996         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1997 }
1998
1999 /*
2000  * Get the first established socket, starting from the bucket given in st->bucket.
2001  * If st->bucket is zero, the very first socket in the hash is returned.
2002  */
2003 static void *established_get_first(struct seq_file *seq)
2004 {
2005         struct tcp_iter_state *st = seq->private;
2006         struct net *net = seq_file_net(seq);
2007         void *rc = NULL;
2008
2009         st->offset = 0;
2010         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2011                 struct sock *sk;
2012                 struct hlist_nulls_node *node;
2013                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2014
2015                 /* Lockless fast path for the common case of empty buckets */
2016                 if (empty_bucket(st))
2017                         continue;
2018
2019                 spin_lock_bh(lock);
2020                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2021                         if (sk->sk_family != st->family ||
2022                             !net_eq(sock_net(sk), net)) {
2023                                 continue;
2024                         }
2025                         rc = sk;
2026                         goto out;
2027                 }
2028                 spin_unlock_bh(lock);
2029         }
2030 out:
2031         return rc;
2032 }
2033
2034 static void *established_get_next(struct seq_file *seq, void *cur)
2035 {
2036         struct sock *sk = cur;
2037         struct hlist_nulls_node *node;
2038         struct tcp_iter_state *st = seq->private;
2039         struct net *net = seq_file_net(seq);
2040
2041         ++st->num;
2042         ++st->offset;
2043
2044         sk = sk_nulls_next(sk);
2045
2046         sk_nulls_for_each_from(sk, node) {
2047                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2048                         return sk;
2049         }
2050
2051         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2052         ++st->bucket;
2053         return established_get_first(seq);
2054 }
2055
2056 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2057 {
2058         struct tcp_iter_state *st = seq->private;
2059         void *rc;
2060
2061         st->bucket = 0;
2062         rc = established_get_first(seq);
2063
2064         while (rc && pos) {
2065                 rc = established_get_next(seq, rc);
2066                 --pos;
2067         }
2068         return rc;
2069 }
2070
2071 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2072 {
2073         void *rc;
2074         struct tcp_iter_state *st = seq->private;
2075
2076         st->state = TCP_SEQ_STATE_LISTENING;
2077         rc        = listening_get_idx(seq, &pos);
2078
2079         if (!rc) {
2080                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2081                 rc        = established_get_idx(seq, pos);
2082         }
2083
2084         return rc;
2085 }
2086
2087 static void *tcp_seek_last_pos(struct seq_file *seq)
2088 {
2089         struct tcp_iter_state *st = seq->private;
2090         int bucket = st->bucket;
2091         int offset = st->offset;
2092         int orig_num = st->num;
2093         void *rc = NULL;
2094
2095         switch (st->state) {
2096         case TCP_SEQ_STATE_LISTENING:
2097                 if (st->bucket >= INET_LHTABLE_SIZE)
2098                         break;
2099                 st->state = TCP_SEQ_STATE_LISTENING;
2100                 rc = listening_get_next(seq, NULL);
2101                 while (offset-- && rc && bucket == st->bucket)
2102                         rc = listening_get_next(seq, rc);
2103                 if (rc)
2104                         break;
2105                 st->bucket = 0;
2106                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2107                 /* Fallthrough */
2108         case TCP_SEQ_STATE_ESTABLISHED:
2109                 if (st->bucket > tcp_hashinfo.ehash_mask)
2110                         break;
2111                 rc = established_get_first(seq);
2112                 while (offset-- && rc && bucket == st->bucket)
2113                         rc = established_get_next(seq, rc);
2114         }
2115
2116         st->num = orig_num;
2117
2118         return rc;
2119 }
2120
2121 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2122 {
2123         struct tcp_iter_state *st = seq->private;
2124         void *rc;
2125
2126         if (*pos && *pos == st->last_pos) {
2127                 rc = tcp_seek_last_pos(seq);
2128                 if (rc)
2129                         goto out;
2130         }
2131
2132         st->state = TCP_SEQ_STATE_LISTENING;
2133         st->num = 0;
2134         st->bucket = 0;
2135         st->offset = 0;
2136         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2137
2138 out:
2139         st->last_pos = *pos;
2140         return rc;
2141 }
2142
2143 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2144 {
2145         struct tcp_iter_state *st = seq->private;
2146         void *rc = NULL;
2147
2148         if (v == SEQ_START_TOKEN) {
2149                 rc = tcp_get_idx(seq, 0);
2150                 goto out;
2151         }
2152
2153         switch (st->state) {
2154         case TCP_SEQ_STATE_LISTENING:
2155                 rc = listening_get_next(seq, v);
2156                 if (!rc) {
2157                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2158                         st->bucket = 0;
2159                         st->offset = 0;
2160                         rc        = established_get_first(seq);
2161                 }
2162                 break;
2163         case TCP_SEQ_STATE_ESTABLISHED:
2164                 rc = established_get_next(seq, v);
2165                 break;
2166         }
2167 out:
2168         ++*pos;
2169         st->last_pos = *pos;
2170         return rc;
2171 }
2172
2173 static void tcp_seq_stop(struct seq_file *seq, void *v)
2174 {
2175         struct tcp_iter_state *st = seq->private;
2176
2177         switch (st->state) {
2178         case TCP_SEQ_STATE_LISTENING:
2179                 if (v != SEQ_START_TOKEN)
2180                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2181                 break;
2182         case TCP_SEQ_STATE_ESTABLISHED:
2183                 if (v)
2184                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2185                 break;
2186         }
2187 }
2188
2189 int tcp_seq_open(struct inode *inode, struct file *file)
2190 {
2191         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2192         struct tcp_iter_state *s;
2193         int err;
2194
2195         err = seq_open_net(inode, file, &afinfo->seq_ops,
2196                           sizeof(struct tcp_iter_state));
2197         if (err < 0)
2198                 return err;
2199
2200         s = ((struct seq_file *)file->private_data)->private;
2201         s->family               = afinfo->family;
2202         s->last_pos             = 0;
2203         return 0;
2204 }
2205 EXPORT_SYMBOL(tcp_seq_open);
2206
2207 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2208 {
2209         int rc = 0;
2210         struct proc_dir_entry *p;
2211
2212         afinfo->seq_ops.start           = tcp_seq_start;
2213         afinfo->seq_ops.next            = tcp_seq_next;
2214         afinfo->seq_ops.stop            = tcp_seq_stop;
2215
2216         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2217                              afinfo->seq_fops, afinfo);
2218         if (!p)
2219                 rc = -ENOMEM;
2220         return rc;
2221 }
2222 EXPORT_SYMBOL(tcp_proc_register);
2223
2224 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2225 {
2226         remove_proc_entry(afinfo->name, net->proc_net);
2227 }
2228 EXPORT_SYMBOL(tcp_proc_unregister);
2229
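/*
 * The helpers below each print one /proc/net/tcp entry: addresses and
 * ports in hex, state, queue sizes, timer info, uid, inode and refcount.
 * get_openreq4() handles request socks, get_tcp4_sock() full sockets and
 * get_timewait4_sock() TIME_WAIT entries.
 */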
2230 static void get_openreq4(const struct request_sock *req,
2231                          struct seq_file *f, int i)
2232 {
2233         const struct inet_request_sock *ireq = inet_rsk(req);
2234         long delta = req->rsk_timer.expires - jiffies;
2235
2236         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2237                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2238                 i,
2239                 ireq->ir_loc_addr,
2240                 ireq->ir_num,
2241                 ireq->ir_rmt_addr,
2242                 ntohs(ireq->ir_rmt_port),
2243                 TCP_SYN_RECV,
2244                 0, 0, /* could print option size, but that is af dependent. */
2245                 1,    /* timers active (only the expire timer) */
2246                 jiffies_delta_to_clock_t(delta),
2247                 req->num_timeout,
2248                 from_kuid_munged(seq_user_ns(f),
2249                                  sock_i_uid(req->rsk_listener)),
2250                 0,  /* non-standard timer */
2251                 0, /* open_requests have no inode */
2252                 0,
2253                 req);
2254 }
2255
2256 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2257 {
2258         int timer_active;
2259         unsigned long timer_expires;
2260         const struct tcp_sock *tp = tcp_sk(sk);
2261         const struct inet_connection_sock *icsk = inet_csk(sk);
2262         const struct inet_sock *inet = inet_sk(sk);
2263         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2264         __be32 dest = inet->inet_daddr;
2265         __be32 src = inet->inet_rcv_saddr;
2266         __u16 destp = ntohs(inet->inet_dport);
2267         __u16 srcp = ntohs(inet->inet_sport);
2268         int rx_queue;
2269         int state;
2270
2271         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2272             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2273             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2274                 timer_active    = 1;
2275                 timer_expires   = icsk->icsk_timeout;
2276         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2277                 timer_active    = 4;
2278                 timer_expires   = icsk->icsk_timeout;
2279         } else if (timer_pending(&sk->sk_timer)) {
2280                 timer_active    = 2;
2281                 timer_expires   = sk->sk_timer.expires;
2282         } else {
2283                 timer_active    = 0;
2284                 timer_expires = jiffies;
2285         }
2286
2287         state = sk_state_load(sk);
2288         if (state == TCP_LISTEN)
2289                 rx_queue = sk->sk_ack_backlog;
2290         else
2291                 /* Because we don't lock the socket,
2292                  * we might find a transient negative value.
2293                  */
2294                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2295
2296         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2297                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2298                 i, src, srcp, dest, destp, state,
2299                 tp->write_seq - tp->snd_una,
2300                 rx_queue,
2301                 timer_active,
2302                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2303                 icsk->icsk_retransmits,
2304                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2305                 icsk->icsk_probes_out,
2306                 sock_i_ino(sk),
2307                 refcount_read(&sk->sk_refcnt), sk,
2308                 jiffies_to_clock_t(icsk->icsk_rto),
2309                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2310                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2311                 tp->snd_cwnd,
2312                 state == TCP_LISTEN ?
2313                     fastopenq->max_qlen :
2314                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2315 }
2316
2317 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2318                                struct seq_file *f, int i)
2319 {
2320         long delta = tw->tw_timer.expires - jiffies;
2321         __be32 dest, src;
2322         __u16 destp, srcp;
2323
2324         dest  = tw->tw_daddr;
2325         src   = tw->tw_rcv_saddr;
2326         destp = ntohs(tw->tw_dport);
2327         srcp  = ntohs(tw->tw_sport);
2328
2329         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2330                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2331                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2332                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2333                 refcount_read(&tw->tw_refcnt), tw);
2334 }
2335
2336 #define TMPSZ 150
2337
2338 static int tcp4_seq_show(struct seq_file *seq, void *v)
2339 {
2340         struct tcp_iter_state *st;
2341         struct sock *sk = v;
2342
2343         seq_setwidth(seq, TMPSZ - 1);
2344         if (v == SEQ_START_TOKEN) {
2345                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2346                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2347                            "inode");
2348                 goto out;
2349         }
2350         st = seq->private;
2351
2352         if (sk->sk_state == TCP_TIME_WAIT)
2353                 get_timewait4_sock(v, seq, st->num);
2354         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2355                 get_openreq4(v, seq, st->num);
2356         else
2357                 get_tcp4_sock(v, seq, st->num);
2358 out:
2359         seq_pad(seq, '\n');
2360         return 0;
2361 }
2362
2363 static const struct file_operations tcp_afinfo_seq_fops = {
2364         .owner   = THIS_MODULE,
2365         .open    = tcp_seq_open,
2366         .read    = seq_read,
2367         .llseek  = seq_lseek,
2368         .release = seq_release_net
2369 };
2370
2371 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2372         .name           = "tcp",
2373         .family         = AF_INET,
2374         .seq_fops       = &tcp_afinfo_seq_fops,
2375         .seq_ops        = {
2376                 .show           = tcp4_seq_show,
2377         },
2378 };
2379
2380 static int __net_init tcp4_proc_init_net(struct net *net)
2381 {
2382         return tcp_proc_register(net, &tcp4_seq_afinfo);
2383 }
2384
2385 static void __net_exit tcp4_proc_exit_net(struct net *net)
2386 {
2387         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2388 }
2389
2390 static struct pernet_operations tcp4_net_ops = {
2391         .init = tcp4_proc_init_net,
2392         .exit = tcp4_proc_exit_net,
2393 };
2394
2395 int __init tcp4_proc_init(void)
2396 {
2397         return register_pernet_subsys(&tcp4_net_ops);
2398 }
2399
2400 void tcp4_proc_exit(void)
2401 {
2402         unregister_pernet_subsys(&tcp4_net_ops);
2403 }
2404 #endif /* CONFIG_PROC_FS */
2405
2406 struct proto tcp_prot = {
2407         .name                   = "TCP",
2408         .owner                  = THIS_MODULE,
2409         .close                  = tcp_close,
2410         .connect                = tcp_v4_connect,
2411         .disconnect             = tcp_disconnect,
2412         .accept                 = inet_csk_accept,
2413         .ioctl                  = tcp_ioctl,
2414         .init                   = tcp_v4_init_sock,
2415         .destroy                = tcp_v4_destroy_sock,
2416         .shutdown               = tcp_shutdown,
2417         .setsockopt             = tcp_setsockopt,
2418         .getsockopt             = tcp_getsockopt,
2419         .keepalive              = tcp_set_keepalive,
2420         .recvmsg                = tcp_recvmsg,
2421         .sendmsg                = tcp_sendmsg,
2422         .sendpage               = tcp_sendpage,
2423         .backlog_rcv            = tcp_v4_do_rcv,
2424         .release_cb             = tcp_release_cb,
2425         .hash                   = inet_hash,
2426         .unhash                 = inet_unhash,
2427         .get_port               = inet_csk_get_port,
2428         .enter_memory_pressure  = tcp_enter_memory_pressure,
2429         .leave_memory_pressure  = tcp_leave_memory_pressure,
2430         .stream_memory_free     = tcp_stream_memory_free,
2431         .sockets_allocated      = &tcp_sockets_allocated,
2432         .orphan_count           = &tcp_orphan_count,
2433         .memory_allocated       = &tcp_memory_allocated,
2434         .memory_pressure        = &tcp_memory_pressure,
2435         .sysctl_mem             = sysctl_tcp_mem,
2436         .sysctl_wmem            = sysctl_tcp_wmem,
2437         .sysctl_rmem            = sysctl_tcp_rmem,
2438         .max_header             = MAX_TCP_HEADER,
2439         .obj_size               = sizeof(struct tcp_sock),
2440         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2441         .twsk_prot              = &tcp_timewait_sock_ops,
2442         .rsk_prot               = &tcp_request_sock_ops,
2443         .h.hashinfo             = &tcp_hashinfo,
2444         .no_autobind            = true,
2445 #ifdef CONFIG_COMPAT
2446         .compat_setsockopt      = compat_tcp_setsockopt,
2447         .compat_getsockopt      = compat_tcp_getsockopt,
2448 #endif
2449         .diag_destroy           = tcp_abort,
2450 };
2451 EXPORT_SYMBOL(tcp_prot);
2452
2453 static void __net_exit tcp_sk_exit(struct net *net)
2454 {
2455         int cpu;
2456
2457         for_each_possible_cpu(cpu)
2458                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2459         free_percpu(net->ipv4.tcp_sk);
2460 }
2461
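/*
 * Per-namespace setup: create one control socket per possible CPU (used
 * to send RSTs and ACKs on behalf of sockets we do not own) and seed the
 * namespace's TCP sysctl defaults.
 */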
2462 static int __net_init tcp_sk_init(struct net *net)
2463 {
2464         int res, cpu, cnt;
2465
2466         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2467         if (!net->ipv4.tcp_sk)
2468                 return -ENOMEM;
2469
2470         for_each_possible_cpu(cpu) {
2471                 struct sock *sk;
2472
2473                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2474                                            IPPROTO_TCP, net);
2475                 if (res)
2476                         goto fail;
2477                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2478
2479                 /* Please enforce IP_DF and IPID==0 for RSTs and
2480                  * ACKs sent in the SYN-RECV and TIME-WAIT states.
2481                  */
2482                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2483
2484                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2485         }
2486
2487         net->ipv4.sysctl_tcp_ecn = 2;
2488         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2489
2490         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2491         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2492         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2493         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2494
2495         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2496         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2497         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2498
2499         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2500         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2501         net->ipv4.sysctl_tcp_syncookies = 1;
2502         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2503         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2504         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2505         net->ipv4.sysctl_tcp_orphan_retries = 0;
2506         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2507         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2508         net->ipv4.sysctl_tcp_tw_reuse = 0;
2509
2510         cnt = tcp_hashinfo.ehash_mask + 1;
2511         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2512         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2513
2514         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2515         net->ipv4.sysctl_tcp_sack = 1;
2516         net->ipv4.sysctl_tcp_window_scaling = 1;
2517         net->ipv4.sysctl_tcp_timestamps = 1;
2518
2519         return 0;
2520 fail:
2521         tcp_sk_exit(net);
2522
2523         return res;
2524 }
2525
2526 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2527 {
2528         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2529 }
2530
2531 static struct pernet_operations __net_initdata tcp_sk_ops = {
2532        .init       = tcp_sk_init,
2533        .exit       = tcp_sk_exit,
2534        .exit_batch = tcp_sk_exit_batch,
2535 };
2536
2537 void __init tcp_v4_init(void)
2538 {
2539         if (register_pernet_subsys(&tcp_sk_ops))
2540                 panic("Failed to create the TCP control socket.\n");
2541 }