GNU Linux-libre 4.19.264-gnu1
[releases.git] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87
88 #include <trace/events/tcp.h>
89
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100         return secure_tcp_seq(ip_hdr(skb)->daddr,
101                               ip_hdr(skb)->saddr,
102                               tcp_hdr(skb)->dest,
103                               tcp_hdr(skb)->source);
104 }
105
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113         int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
114         const struct inet_timewait_sock *tw = inet_twsk(sktw);
115         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
116         struct tcp_sock *tp = tcp_sk(sk);
117
118         if (reuse == 2) {
119                 /* Still does not detect *everything* that goes through
120                  * lo, since we require a loopback src or dst address
121                  * or direct binding to 'lo' interface.
122                  */
123                 bool loopback = false;
124                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
125                         loopback = true;
126 #if IS_ENABLED(CONFIG_IPV6)
127                 if (tw->tw_family == AF_INET6) {
128                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129                             (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
130                              (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
131                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132                             (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
133                              (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
134                                 loopback = true;
135                 } else
136 #endif
137                 {
138                         if (ipv4_is_loopback(tw->tw_daddr) ||
139                             ipv4_is_loopback(tw->tw_rcv_saddr))
140                                 loopback = true;
141                 }
142                 if (!loopback)
143                         reuse = 0;
144         }
145
146         /* With PAWS, it is safe from the viewpoint
147            of data integrity. Even without PAWS it is safe provided sequence
148            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
149
150            Actually, the idea is close to VJ's, only the timestamp cache is
151            held not per host but per port pair, and the TW bucket is used as the
152            state holder.
153
154            If the TW bucket has already been destroyed we fall back to VJ's
155            scheme and use the initial timestamp retrieved from the peer table.
156          */
157         if (tcptw->tw_ts_recent_stamp &&
158             (!twp || (reuse && time_after32(ktime_get_seconds(),
159                                             tcptw->tw_ts_recent_stamp)))) {
160                 /* In case of repair and re-using TIME-WAIT sockets we still
161                  * want to be sure that it is safe as above but honor the
162                  * sequence numbers and time stamps set as part of the repair
163                  * process.
164                  *
165                  * Without this check re-using a TIME-WAIT socket with TCP
166                  * repair would accumulate a -1 on the repair assigned
167                  * sequence number. The first time it is reused the sequence
168                  * is -1, the second time -2, etc. This fixes that issue
169                  * without appearing to create any others.
170                  */
171                 if (likely(!tp->repair)) {
172                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
173
174                         if (!seq)
175                                 seq = 1;
176                         WRITE_ONCE(tp->write_seq, seq);
177                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
178                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
179                 }
180                 sock_hold(sktw);
181                 return 1;
182         }
183
184         return 0;
185 }
186 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
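/* Editor's note, added for illustration (not in the upstream file): the
 * "reuse" value read at the top of tcp_twsk_unique() comes from the
 * net.ipv4.tcp_tw_reuse sysctl.  As documented in ip-sysctl.txt, 0 disables
 * TIME-WAIT reuse for outgoing connections, 1 enables it when it is safe
 * from the protocol viewpoint, and 2 restricts it to loopback traffic,
 * which is what the loopback-detection block above implements.
 * A sketch of selecting the loopback-only mode from user space:
 *
 *	sysctl -w net.ipv4.tcp_tw_reuse=2
 */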
187
188 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
189                               int addr_len)
190 {
191         /* This check is replicated from tcp_v4_connect() and intended to
192          * prevent the BPF program called below from accessing bytes that are
193          * outside the bound specified by the user in addr_len.
194          */
195         if (addr_len < sizeof(struct sockaddr_in))
196                 return -EINVAL;
197
198         sock_owned_by_me(sk);
199
200         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
201 }
202
203 /* This will initiate an outgoing connection. */
204 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
205 {
206         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
207         struct inet_sock *inet = inet_sk(sk);
208         struct tcp_sock *tp = tcp_sk(sk);
209         __be16 orig_sport, orig_dport;
210         __be32 daddr, nexthop;
211         struct flowi4 *fl4;
212         struct rtable *rt;
213         int err;
214         struct ip_options_rcu *inet_opt;
215         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
216
217         if (addr_len < sizeof(struct sockaddr_in))
218                 return -EINVAL;
219
220         if (usin->sin_family != AF_INET)
221                 return -EAFNOSUPPORT;
222
223         nexthop = daddr = usin->sin_addr.s_addr;
224         inet_opt = rcu_dereference_protected(inet->inet_opt,
225                                              lockdep_sock_is_held(sk));
226         if (inet_opt && inet_opt->opt.srr) {
227                 if (!daddr)
228                         return -EINVAL;
229                 nexthop = inet_opt->opt.faddr;
230         }
231
232         orig_sport = inet->inet_sport;
233         orig_dport = usin->sin_port;
234         fl4 = &inet->cork.fl.u.ip4;
235         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
236                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
237                               IPPROTO_TCP,
238                               orig_sport, orig_dport, sk);
239         if (IS_ERR(rt)) {
240                 err = PTR_ERR(rt);
241                 if (err == -ENETUNREACH)
242                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
243                 return err;
244         }
245
246         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
247                 ip_rt_put(rt);
248                 return -ENETUNREACH;
249         }
250
251         if (!inet_opt || !inet_opt->opt.srr)
252                 daddr = fl4->daddr;
253
254         if (!inet->inet_saddr)
255                 inet->inet_saddr = fl4->saddr;
256         sk_rcv_saddr_set(sk, inet->inet_saddr);
257
258         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
259                 /* Reset inherited state */
260                 tp->rx_opt.ts_recent       = 0;
261                 tp->rx_opt.ts_recent_stamp = 0;
262                 if (likely(!tp->repair))
263                         WRITE_ONCE(tp->write_seq, 0);
264         }
265
266         inet->inet_dport = usin->sin_port;
267         sk_daddr_set(sk, daddr);
268
269         inet_csk(sk)->icsk_ext_hdr_len = 0;
270         if (inet_opt)
271                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
272
273         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
274
275         /* Socket identity is still unknown (sport may be zero).
276          * However we set the state to SYN-SENT and, while not releasing the
277          * socket lock, select a source port, enter ourselves into the hash
278          * tables and complete initialization after this.
279          */
280         tcp_set_state(sk, TCP_SYN_SENT);
281         err = inet_hash_connect(tcp_death_row, sk);
282         if (err)
283                 goto failure;
284
285         sk_set_txhash(sk);
286
287         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
288                                inet->inet_sport, inet->inet_dport, sk);
289         if (IS_ERR(rt)) {
290                 err = PTR_ERR(rt);
291                 rt = NULL;
292                 goto failure;
293         }
294         /* OK, now commit destination to socket.  */
295         sk->sk_gso_type = SKB_GSO_TCPV4;
296         sk_setup_caps(sk, &rt->dst);
297         rt = NULL;
298
299         if (likely(!tp->repair)) {
300                 if (!tp->write_seq)
301                         WRITE_ONCE(tp->write_seq,
302                                    secure_tcp_seq(inet->inet_saddr,
303                                                   inet->inet_daddr,
304                                                   inet->inet_sport,
305                                                   usin->sin_port));
306                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
307                                                  inet->inet_saddr,
308                                                  inet->inet_daddr);
309         }
310
311         inet->inet_id = prandom_u32();
312
313         if (tcp_fastopen_defer_connect(sk, &err))
314                 return err;
315         if (err)
316                 goto failure;
317
318         err = tcp_connect(sk);
319
320         if (err)
321                 goto failure;
322
323         return 0;
324
325 failure:
326         /*
327          * This unhashes the socket and releases the local port,
328          * if necessary.
329          */
330         tcp_set_state(sk, TCP_CLOSE);
331         ip_rt_put(rt);
332         sk->sk_route_caps = 0;
333         inet->inet_dport = 0;
334         return err;
335 }
336 EXPORT_SYMBOL(tcp_v4_connect);
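/* Usage sketch, added for illustration (not upstream code): tcp_v4_connect()
 * is reached from the connect(2) system call on an IPv4 TCP socket, roughly:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *		.sin_addr   = { .s_addr = inet_addr("192.0.2.1") },
 *	};
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * addr_len above is the sizeof(dst) passed by user space, which is why the
 * function starts by checking it against sizeof(struct sockaddr_in).
 */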
337
338 /*
339  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
340  * It can be called through tcp_release_cb() if socket was owned by user
341  * at the time tcp_v4_err() was called to handle ICMP message.
342  */
343 void tcp_v4_mtu_reduced(struct sock *sk)
344 {
345         struct inet_sock *inet = inet_sk(sk);
346         struct dst_entry *dst;
347         u32 mtu;
348
349         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
350                 return;
351         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
352         dst = inet_csk_update_pmtu(sk, mtu);
353         if (!dst)
354                 return;
355
356         /* Something is about to go wrong... Remember the soft error
357          * in case this connection is not able to recover.
358          */
359         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
360                 sk->sk_err_soft = EMSGSIZE;
361
362         mtu = dst_mtu(dst);
363
364         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
365             ip_sk_accept_pmtu(sk) &&
366             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
367                 tcp_sync_mss(sk, mtu);
368
369                 /* Resend the TCP packet because it's
370                  * clear that the old packet has been
371                  * dropped. This is the new "fast" path mtu
372                  * discovery.
373                  */
374                 tcp_simple_retransmit(sk);
375         } /* else let the usual retransmit timer handle it */
376 }
377 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
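/* Illustrative note (added commentary, not upstream): if icsk_pmtu_cookie was
 * 1500 and the ICMP-reported MTU is 1400, the tcp_sync_mss() call above lowers
 * the cached MSS to roughly the new MTU minus IP/TCP header overhead, and
 * tcp_simple_retransmit() resends the queued segments at the smaller size.
 */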
378
379 static void do_redirect(struct sk_buff *skb, struct sock *sk)
380 {
381         struct dst_entry *dst = __sk_dst_check(sk, 0);
382
383         if (dst)
384                 dst->ops->redirect(dst, sk, skb);
385 }
386
387
388 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
389 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
390 {
391         struct request_sock *req = inet_reqsk(sk);
392         struct net *net = sock_net(sk);
393
394         /* ICMPs are not backlogged, hence we cannot get
395          * an established socket here.
396          */
397         if (seq != tcp_rsk(req)->snt_isn) {
398                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
399         } else if (abort) {
400                 /*
401                  * Still in SYN_RECV, just remove it silently.
402                  * There is no good way to pass the error to the newly
403                  * created socket, and POSIX does not want network
404                  * errors returned from accept().
405                  */
406                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
407                 tcp_listendrop(req->rsk_listener);
408         }
409         reqsk_put(req);
410 }
411 EXPORT_SYMBOL(tcp_req_err);
412
413 /*
414  * This routine is called by the ICMP module when it gets some
415  * sort of error condition.  If err < 0 then the socket should
416  * be closed and the error returned to the user.  If err > 0
417  * it's just the icmp type << 8 | icmp code.  After adjustment
418  * header points to the first 8 bytes of the tcp header.  We need
419  * to find the appropriate port.
420  *
421  * The locking strategy used here is very "optimistic". When
422  * someone else accesses the socket the ICMP is just dropped
423  * and for some paths there is no check at all.
424  * A more general error queue to queue errors for later handling
425  * is probably better.
426  *
427  */
428
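/* Worked example, added commentary only: for an ICMP "destination
 * unreachable, port unreachable" message, type == ICMP_DEST_UNREACH (3) and
 * code == ICMP_PORT_UNREACH (3), so the err > 0 encoding described above is
 * (3 << 8) | 3 == 0x0303; a hard local failure would instead be a plain
 * negative errno.
 */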
429 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
430 {
431         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
432         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
433         struct inet_connection_sock *icsk;
434         struct tcp_sock *tp;
435         struct inet_sock *inet;
436         const int type = icmp_hdr(icmp_skb)->type;
437         const int code = icmp_hdr(icmp_skb)->code;
438         struct sock *sk;
439         struct sk_buff *skb;
440         struct request_sock *fastopen;
441         u32 seq, snd_una;
442         s32 remaining;
443         u32 delta_us;
444         int err;
445         struct net *net = dev_net(icmp_skb->dev);
446
447         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
448                                        th->dest, iph->saddr, ntohs(th->source),
449                                        inet_iif(icmp_skb), 0);
450         if (!sk) {
451                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
452                 return;
453         }
454         if (sk->sk_state == TCP_TIME_WAIT) {
455                 inet_twsk_put(inet_twsk(sk));
456                 return;
457         }
458         seq = ntohl(th->seq);
459         if (sk->sk_state == TCP_NEW_SYN_RECV)
460                 return tcp_req_err(sk, seq,
461                                   type == ICMP_PARAMETERPROB ||
462                                   type == ICMP_TIME_EXCEEDED ||
463                                   (type == ICMP_DEST_UNREACH &&
464                                    (code == ICMP_NET_UNREACH ||
465                                     code == ICMP_HOST_UNREACH)));
466
467         bh_lock_sock(sk);
468         /* If too many ICMPs get dropped on busy
469          * servers this needs to be solved differently.
470          * We do take care of the PMTU discovery (RFC1191) special case:
471          * we can receive locally generated ICMP messages while socket is held.
472          */
473         if (sock_owned_by_user(sk)) {
474                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
475                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
476         }
477         if (sk->sk_state == TCP_CLOSE)
478                 goto out;
479
480         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
481                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
482                 goto out;
483         }
484
485         icsk = inet_csk(sk);
486         tp = tcp_sk(sk);
487         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
488         fastopen = tp->fastopen_rsk;
489         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
490         if (sk->sk_state != TCP_LISTEN &&
491             !between(seq, snd_una, tp->snd_nxt)) {
492                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
493                 goto out;
494         }
495
496         switch (type) {
497         case ICMP_REDIRECT:
498                 if (!sock_owned_by_user(sk))
499                         do_redirect(icmp_skb, sk);
500                 goto out;
501         case ICMP_SOURCE_QUENCH:
502                 /* Just silently ignore these. */
503                 goto out;
504         case ICMP_PARAMETERPROB:
505                 err = EPROTO;
506                 break;
507         case ICMP_DEST_UNREACH:
508                 if (code > NR_ICMP_UNREACH)
509                         goto out;
510
511                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
512                         /* We are not interested in TCP_LISTEN and open_requests
513                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
514                          * they should go through unfragmented).
515                          */
516                         if (sk->sk_state == TCP_LISTEN)
517                                 goto out;
518
519                         WRITE_ONCE(tp->mtu_info, info);
520                         if (!sock_owned_by_user(sk)) {
521                                 tcp_v4_mtu_reduced(sk);
522                         } else {
523                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
524                                         sock_hold(sk);
525                         }
526                         goto out;
527                 }
528
529                 err = icmp_err_convert[code].errno;
530                 /* check if icmp_skb allows revert of backoff
531                  * (see draft-zimmermann-tcp-lcd) */
532                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
533                         break;
534                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
535                     !icsk->icsk_backoff || fastopen)
536                         break;
537
538                 if (sock_owned_by_user(sk))
539                         break;
540
541                 skb = tcp_rtx_queue_head(sk);
542                 if (WARN_ON_ONCE(!skb))
543                         break;
544
545                 icsk->icsk_backoff--;
546                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
547                                                TCP_TIMEOUT_INIT;
548                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
549
550                 tcp_mstamp_refresh(tp);
551                 delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
552                 remaining = icsk->icsk_rto -
553                             usecs_to_jiffies(delta_us);
554
555                 if (remaining > 0) {
556                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
557                                                   remaining, TCP_RTO_MAX);
558                 } else {
559                         /* RTO revert clocked out retransmission.
560                          * Will retransmit now */
561                         tcp_retransmit_timer(sk);
562                 }
563
564                 break;
565         case ICMP_TIME_EXCEEDED:
566                 err = EHOSTUNREACH;
567                 break;
568         default:
569                 goto out;
570         }
571
572         switch (sk->sk_state) {
573         case TCP_SYN_SENT:
574         case TCP_SYN_RECV:
575                 /* Only in fast or simultaneous open. If a fast open socket
576                  * is already accepted it is treated as a connected one below.
577                  */
578                 if (fastopen && !fastopen->sk)
579                         break;
580
581                 if (!sock_owned_by_user(sk)) {
582                         sk->sk_err = err;
583
584                         sk->sk_error_report(sk);
585
586                         tcp_done(sk);
587                 } else {
588                         sk->sk_err_soft = err;
589                 }
590                 goto out;
591         }
592
593         /* If we've already connected we will keep trying
594          * until we time out, or the user gives up.
595          *
596          * rfc1122 4.2.3.9 allows us to consider as hard errors
597          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
598          * but it is obsoleted by pmtu discovery).
599          *
600          * Note that in the modern internet, where routing is unreliable
601          * and broken firewalls sit in every dark corner sending random
602          * errors ordered by their masters, even these two messages finally
603          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
604          *
605          * Now we are in compliance with RFCs.
606          *                                                      --ANK (980905)
607          */
608
609         inet = inet_sk(sk);
610         if (!sock_owned_by_user(sk) && inet->recverr) {
611                 sk->sk_err = err;
612                 sk->sk_error_report(sk);
613         } else  { /* Only an error on timeout */
614                 sk->sk_err_soft = err;
615         }
616
617 out:
618         bh_unlock_sock(sk);
619         sock_put(sk);
620 }
621
622 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
623 {
624         struct tcphdr *th = tcp_hdr(skb);
625
626         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
627         skb->csum_start = skb_transport_header(skb) - skb->head;
628         skb->csum_offset = offsetof(struct tcphdr, check);
629 }
630
631 /* This routine computes an IPv4 TCP checksum. */
632 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
633 {
634         const struct inet_sock *inet = inet_sk(sk);
635
636         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
637 }
638 EXPORT_SYMBOL(tcp_v4_send_check);
639
640 /*
641  *      This routine will send an RST to the other tcp.
642  *
643  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
644  *                    for the reset.
645  *      Answer: if a packet caused an RST, it is not for a socket
646  *              existing in our system; if it is matched to a socket,
647  *              it is just a duplicate segment or a bug in the other side's TCP.
648  *              So we build the reply based only on parameters that
649  *              arrived with the segment.
650  *      Exception: precedence violation. We do not implement it in any case.
651  */
652
653 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
654 {
655         const struct tcphdr *th = tcp_hdr(skb);
656         struct {
657                 struct tcphdr th;
658 #ifdef CONFIG_TCP_MD5SIG
659                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
660 #endif
661         } rep;
662         struct ip_reply_arg arg;
663 #ifdef CONFIG_TCP_MD5SIG
664         struct tcp_md5sig_key *key = NULL;
665         const __u8 *hash_location = NULL;
666         unsigned char newhash[16];
667         int genhash;
668         struct sock *sk1 = NULL;
669 #endif
670         struct net *net;
671         struct sock *ctl_sk;
672
673         /* Never send a reset in response to a reset. */
674         if (th->rst)
675                 return;
676
677         /* If sk is not NULL, it means we did a successful lookup and the incoming
678          * route had to be correct. The prequeue might have dropped our dst.
679          */
680         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
681                 return;
682
683         /* Swap the send and the receive. */
684         memset(&rep, 0, sizeof(rep));
685         rep.th.dest   = th->source;
686         rep.th.source = th->dest;
687         rep.th.doff   = sizeof(struct tcphdr) / 4;
688         rep.th.rst    = 1;
689
690         if (th->ack) {
691                 rep.th.seq = th->ack_seq;
692         } else {
693                 rep.th.ack = 1;
694                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
695                                        skb->len - (th->doff << 2));
696         }
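        /* Added commentary, not upstream: this follows the RST generation rule
         * of RFC 793.  If the offending segment carried an ACK, the reset
         * borrows its sequence number from that ACK; otherwise the reset ACKs
         * everything the segment occupied.  E.g. a bare SYN with seq 1000 and
         * no payload gets rep.th.ack_seq = htonl(1001) while rep.th.seq stays
         * 0 from the memset() above.
         */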
697
698         memset(&arg, 0, sizeof(arg));
699         arg.iov[0].iov_base = (unsigned char *)&rep;
700         arg.iov[0].iov_len  = sizeof(rep.th);
701
702         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
703 #ifdef CONFIG_TCP_MD5SIG
704         rcu_read_lock();
705         hash_location = tcp_parse_md5sig_option(th);
706         if (sk && sk_fullsock(sk)) {
707                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
708                                         &ip_hdr(skb)->saddr, AF_INET);
709         } else if (hash_location) {
710                 /*
711                  * The active side is lost. Try to find the listening socket through
712                  * the source port, and then find the md5 key through the listening
713                  * socket. We do not lose security here:
714                  * the incoming packet is checked against the md5 hash of the key found,
715                  * and no RST is generated if the md5 hash doesn't match.
716                  */
717                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
718                                              ip_hdr(skb)->saddr,
719                                              th->source, ip_hdr(skb)->daddr,
720                                              ntohs(th->source), inet_iif(skb),
721                                              tcp_v4_sdif(skb));
722                 /* don't send rst if it can't find key */
723                 if (!sk1)
724                         goto out;
725
726                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
727                                         &ip_hdr(skb)->saddr, AF_INET);
728                 if (!key)
729                         goto out;
730
731
732                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
733                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
734                         goto out;
735
736         }
737
738         if (key) {
739                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
740                                    (TCPOPT_NOP << 16) |
741                                    (TCPOPT_MD5SIG << 8) |
742                                    TCPOLEN_MD5SIG);
743                 /* Update length and the length the header thinks exists */
744                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
745                 rep.th.doff = arg.iov[0].iov_len / 4;
746
747                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
748                                      key, ip_hdr(skb)->saddr,
749                                      ip_hdr(skb)->daddr, &rep.th);
750         }
751 #endif
752         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
753                                       ip_hdr(skb)->saddr, /* XXX */
754                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
755         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
756         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
757
758         /* When the socket is gone, all binding information is lost.
759          * Routing might fail in this case. No choice here: if we choose to force
760          * the input interface, we will misroute in case of an asymmetric route.
761          */
762         if (sk) {
763                 arg.bound_dev_if = sk->sk_bound_dev_if;
764                 if (sk_fullsock(sk))
765                         trace_tcp_send_reset(sk, skb);
766         }
767
768         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
769                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
770
771         arg.tos = ip_hdr(skb)->tos;
772         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
773         local_bh_disable();
774         ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
775         if (sk)
776                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
777                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
778         ip_send_unicast_reply(ctl_sk,
779                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
780                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
781                               &arg, arg.iov[0].iov_len);
782
783         ctl_sk->sk_mark = 0;
784         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
785         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
786         local_bh_enable();
787
788 #ifdef CONFIG_TCP_MD5SIG
789 out:
790         rcu_read_unlock();
791 #endif
792 }
793
794 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
795    outside socket context, is certainly ugly. What can I do?
796  */
797
798 static void tcp_v4_send_ack(const struct sock *sk,
799                             struct sk_buff *skb, u32 seq, u32 ack,
800                             u32 win, u32 tsval, u32 tsecr, int oif,
801                             struct tcp_md5sig_key *key,
802                             int reply_flags, u8 tos)
803 {
804         const struct tcphdr *th = tcp_hdr(skb);
805         struct {
806                 struct tcphdr th;
807                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
808 #ifdef CONFIG_TCP_MD5SIG
809                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
810 #endif
811                         ];
812         } rep;
813         struct net *net = sock_net(sk);
814         struct ip_reply_arg arg;
815         struct sock *ctl_sk;
816
817         memset(&rep.th, 0, sizeof(struct tcphdr));
818         memset(&arg, 0, sizeof(arg));
819
820         arg.iov[0].iov_base = (unsigned char *)&rep;
821         arg.iov[0].iov_len  = sizeof(rep.th);
822         if (tsecr) {
823                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
824                                    (TCPOPT_TIMESTAMP << 8) |
825                                    TCPOLEN_TIMESTAMP);
826                 rep.opt[1] = htonl(tsval);
827                 rep.opt[2] = htonl(tsecr);
828                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
829         }
830
831         /* Swap the send and the receive. */
832         rep.th.dest    = th->source;
833         rep.th.source  = th->dest;
834         rep.th.doff    = arg.iov[0].iov_len / 4;
835         rep.th.seq     = htonl(seq);
836         rep.th.ack_seq = htonl(ack);
837         rep.th.ack     = 1;
838         rep.th.window  = htons(win);
839
840 #ifdef CONFIG_TCP_MD5SIG
841         if (key) {
842                 int offset = (tsecr) ? 3 : 0;
843
844                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
845                                           (TCPOPT_NOP << 16) |
846                                           (TCPOPT_MD5SIG << 8) |
847                                           TCPOLEN_MD5SIG);
848                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
849                 rep.th.doff = arg.iov[0].iov_len/4;
850
851                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
852                                     key, ip_hdr(skb)->saddr,
853                                     ip_hdr(skb)->daddr, &rep.th);
854         }
855 #endif
856         arg.flags = reply_flags;
857         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
858                                       ip_hdr(skb)->saddr, /* XXX */
859                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
860         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
861         if (oif)
862                 arg.bound_dev_if = oif;
863         arg.tos = tos;
864         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
865         local_bh_disable();
866         ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
867         if (sk)
868                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
869                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
870         ip_send_unicast_reply(ctl_sk,
871                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
872                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
873                               &arg, arg.iov[0].iov_len);
874
875         ctl_sk->sk_mark = 0;
876         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
877         local_bh_enable();
878 }
879
880 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
881 {
882         struct inet_timewait_sock *tw = inet_twsk(sk);
883         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
884
885         tcp_v4_send_ack(sk, skb,
886                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
887                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
888                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
889                         tcptw->tw_ts_recent,
890                         tw->tw_bound_dev_if,
891                         tcp_twsk_md5_key(tcptw),
892                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
893                         tw->tw_tos
894                         );
895
896         inet_twsk_put(tw);
897 }
898
899 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
900                                   struct request_sock *req)
901 {
902         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
903          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
904          */
905         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
906                                              tcp_sk(sk)->snd_nxt;
907
908         /* RFC 7323 2.3
909          * The window field (SEG.WND) of every outgoing segment, with the
910          * exception of <SYN> segments, MUST be right-shifted by
911          * Rcv.Wind.Shift bits:
912          */
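        /* Worked example (added for illustration): with rsk_rcv_wnd == 1048576
         * and rcv_wscale == 7, the ACK built below advertises
         * 1048576 >> 7 == 8192 in the 16-bit window field, and the peer
         * multiplies it back up by 1 << 7 on receipt.
         */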
913         tcp_v4_send_ack(sk, skb, seq,
914                         tcp_rsk(req)->rcv_nxt,
915                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
916                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
917                         req->ts_recent,
918                         0,
919                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
920                                           AF_INET),
921                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
922                         ip_hdr(skb)->tos);
923 }
924
925 /*
926  *      Send a SYN-ACK after having received a SYN.
927  *      This still operates on a request_sock only, not on a big
928  *      socket.
929  */
930 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
931                               struct flowi *fl,
932                               struct request_sock *req,
933                               struct tcp_fastopen_cookie *foc,
934                               enum tcp_synack_type synack_type)
935 {
936         const struct inet_request_sock *ireq = inet_rsk(req);
937         struct flowi4 fl4;
938         int err = -1;
939         struct sk_buff *skb;
940
941         /* First, grab a route. */
942         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
943                 return -1;
944
945         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
946
947         if (skb) {
948                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
949
950                 rcu_read_lock();
951                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
952                                             ireq->ir_rmt_addr,
953                                             rcu_dereference(ireq->ireq_opt));
954                 rcu_read_unlock();
955                 err = net_xmit_eval(err);
956         }
957
958         return err;
959 }
960
961 /*
962  *      IPv4 request_sock destructor.
963  */
964 static void tcp_v4_reqsk_destructor(struct request_sock *req)
965 {
966         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
967 }
968
969 #ifdef CONFIG_TCP_MD5SIG
970 /*
971  * RFC2385 MD5 checksumming requires a mapping of
972  * IP address->MD5 Key.
973  * We need to maintain these in the sk structure.
974  */
975
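/* Usage sketch, added for illustration (not upstream code): the per-socket
 * keys maintained below are installed from user space with the TCP_MD5SIG
 * socket option, roughly:
 *
 *	struct tcp_md5sig md5 = { 0 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family      = AF_INET;
 *	peer->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	md5.tcpm_keylen       = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * The kernel side of that setsockopt() is tcp_v4_parse_md5_keys() further
 * down, which lands in tcp_md5_do_add()/tcp_md5_do_del() below.
 */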
976 /* Find the Key structure for an address.  */
977 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
978                                          const union tcp_md5_addr *addr,
979                                          int family)
980 {
981         const struct tcp_sock *tp = tcp_sk(sk);
982         struct tcp_md5sig_key *key;
983         const struct tcp_md5sig_info *md5sig;
984         __be32 mask;
985         struct tcp_md5sig_key *best_match = NULL;
986         bool match;
987
988         /* caller either holds rcu_read_lock() or socket lock */
989         md5sig = rcu_dereference_check(tp->md5sig_info,
990                                        lockdep_sock_is_held(sk));
991         if (!md5sig)
992                 return NULL;
993
994         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
995                 if (key->family != family)
996                         continue;
997
998                 if (family == AF_INET) {
999                         mask = inet_make_mask(key->prefixlen);
1000                         match = (key->addr.a4.s_addr & mask) ==
1001                                 (addr->a4.s_addr & mask);
1002 #if IS_ENABLED(CONFIG_IPV6)
1003                 } else if (family == AF_INET6) {
1004                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1005                                                   key->prefixlen);
1006 #endif
1007                 } else {
1008                         match = false;
1009                 }
1010
1011                 if (match && (!best_match ||
1012                               key->prefixlen > best_match->prefixlen))
1013                         best_match = key;
1014         }
1015         return best_match;
1016 }
1017 EXPORT_SYMBOL(tcp_md5_do_lookup);
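/* Worked example (added commentary): the loop above does a longest-prefix
 * match.  If keys are installed for 10.0.0.0/8 and 10.1.2.0/24, a peer at
 * 10.1.2.3 matches both, and the /24 key wins because its prefixlen is
 * larger; a peer at 10.9.9.9 matches only the /8 key.
 */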
1018
1019 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1020                                                       const union tcp_md5_addr *addr,
1021                                                       int family, u8 prefixlen)
1022 {
1023         const struct tcp_sock *tp = tcp_sk(sk);
1024         struct tcp_md5sig_key *key;
1025         unsigned int size = sizeof(struct in_addr);
1026         const struct tcp_md5sig_info *md5sig;
1027
1028         /* caller either holds rcu_read_lock() or socket lock */
1029         md5sig = rcu_dereference_check(tp->md5sig_info,
1030                                        lockdep_sock_is_held(sk));
1031         if (!md5sig)
1032                 return NULL;
1033 #if IS_ENABLED(CONFIG_IPV6)
1034         if (family == AF_INET6)
1035                 size = sizeof(struct in6_addr);
1036 #endif
1037         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1038                 if (key->family != family)
1039                         continue;
1040                 if (!memcmp(&key->addr, addr, size) &&
1041                     key->prefixlen == prefixlen)
1042                         return key;
1043         }
1044         return NULL;
1045 }
1046
1047 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1048                                          const struct sock *addr_sk)
1049 {
1050         const union tcp_md5_addr *addr;
1051
1052         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1053         return tcp_md5_do_lookup(sk, addr, AF_INET);
1054 }
1055 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1056
1057 /* This can be called on a newly created socket, from other files */
1058 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1059                    int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1060                    gfp_t gfp)
1061 {
1062         /* Add Key to the list */
1063         struct tcp_md5sig_key *key;
1064         struct tcp_sock *tp = tcp_sk(sk);
1065         struct tcp_md5sig_info *md5sig;
1066
1067         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1068         if (key) {
1069                 /* Pre-existing entry - just update that one.
1070                  * Note that the key might be used concurrently.
1071                  */
1072                 memcpy(key->key, newkey, newkeylen);
1073
1074                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1075                  * Also note that a reader could catch new key->keylen value
1076                  * but old key->key[], this is the reason we use __GFP_ZERO
1077                  * at sock_kmalloc() time below these lines.
1078                  */
1079                 WRITE_ONCE(key->keylen, newkeylen);
1080
1081                 return 0;
1082         }
1083
1084         md5sig = rcu_dereference_protected(tp->md5sig_info,
1085                                            lockdep_sock_is_held(sk));
1086         if (!md5sig) {
1087                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1088                 if (!md5sig)
1089                         return -ENOMEM;
1090
1091                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1092                 INIT_HLIST_HEAD(&md5sig->head);
1093                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1094         }
1095
1096         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1097         if (!key)
1098                 return -ENOMEM;
1099         if (!tcp_alloc_md5sig_pool()) {
1100                 sock_kfree_s(sk, key, sizeof(*key));
1101                 return -ENOMEM;
1102         }
1103
1104         memcpy(key->key, newkey, newkeylen);
1105         key->keylen = newkeylen;
1106         key->family = family;
1107         key->prefixlen = prefixlen;
1108         memcpy(&key->addr, addr,
1109                (family == AF_INET6) ? sizeof(struct in6_addr) :
1110                                       sizeof(struct in_addr));
1111         hlist_add_head_rcu(&key->node, &md5sig->head);
1112         return 0;
1113 }
1114 EXPORT_SYMBOL(tcp_md5_do_add);
1115
1116 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1117                    u8 prefixlen)
1118 {
1119         struct tcp_md5sig_key *key;
1120
1121         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1122         if (!key)
1123                 return -ENOENT;
1124         hlist_del_rcu(&key->node);
1125         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1126         kfree_rcu(key, rcu);
1127         return 0;
1128 }
1129 EXPORT_SYMBOL(tcp_md5_do_del);
1130
1131 static void tcp_clear_md5_list(struct sock *sk)
1132 {
1133         struct tcp_sock *tp = tcp_sk(sk);
1134         struct tcp_md5sig_key *key;
1135         struct hlist_node *n;
1136         struct tcp_md5sig_info *md5sig;
1137
1138         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1139
1140         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1141                 hlist_del_rcu(&key->node);
1142                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1143                 kfree_rcu(key, rcu);
1144         }
1145 }
1146
1147 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1148                                  char __user *optval, int optlen)
1149 {
1150         struct tcp_md5sig cmd;
1151         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1152         u8 prefixlen = 32;
1153
1154         if (optlen < sizeof(cmd))
1155                 return -EINVAL;
1156
1157         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1158                 return -EFAULT;
1159
1160         if (sin->sin_family != AF_INET)
1161                 return -EINVAL;
1162
1163         if (optname == TCP_MD5SIG_EXT &&
1164             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1165                 prefixlen = cmd.tcpm_prefixlen;
1166                 if (prefixlen > 32)
1167                         return -EINVAL;
1168         }
1169
1170         if (!cmd.tcpm_keylen)
1171                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1172                                       AF_INET, prefixlen);
1173
1174         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1175                 return -EINVAL;
1176
1177         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1178                               AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1179                               GFP_KERNEL);
1180 }
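/* Hedged sketch (not upstream code): the TCP_MD5SIG_EXT path parsed above
 * lets user space attach one key to a whole prefix rather than a single
 * address, roughly:
 *
 *	md5.tcpm_flags     = TCP_MD5SIG_FLAG_PREFIX;
 *	md5.tcpm_prefixlen = 24;
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG_EXT, &md5, sizeof(md5));
 *
 * which ends up in tcp_md5_do_add() above with prefixlen == 24, and is then
 * matched by the longest-prefix lookup in tcp_md5_do_lookup().
 */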
1181
1182 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1183                                    __be32 daddr, __be32 saddr,
1184                                    const struct tcphdr *th, int nbytes)
1185 {
1186         struct tcp4_pseudohdr *bp;
1187         struct scatterlist sg;
1188         struct tcphdr *_th;
1189
1190         bp = hp->scratch;
1191         bp->saddr = saddr;
1192         bp->daddr = daddr;
1193         bp->pad = 0;
1194         bp->protocol = IPPROTO_TCP;
1195         bp->len = cpu_to_be16(nbytes);
1196
1197         _th = (struct tcphdr *)(bp + 1);
1198         memcpy(_th, th, sizeof(*th));
1199         _th->check = 0;
1200
1201         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1202         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1203                                 sizeof(*bp) + sizeof(*th));
1204         return crypto_ahash_update(hp->md5_req);
1205 }
1206
1207 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1208                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1209 {
1210         struct tcp_md5sig_pool *hp;
1211         struct ahash_request *req;
1212
1213         hp = tcp_get_md5sig_pool();
1214         if (!hp)
1215                 goto clear_hash_noput;
1216         req = hp->md5_req;
1217
1218         if (crypto_ahash_init(req))
1219                 goto clear_hash;
1220         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1221                 goto clear_hash;
1222         if (tcp_md5_hash_key(hp, key))
1223                 goto clear_hash;
1224         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1225         if (crypto_ahash_final(req))
1226                 goto clear_hash;
1227
1228         tcp_put_md5sig_pool();
1229         return 0;
1230
1231 clear_hash:
1232         tcp_put_md5sig_pool();
1233 clear_hash_noput:
1234         memset(md5_hash, 0, 16);
1235         return 1;
1236 }
1237
1238 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1239                         const struct sock *sk,
1240                         const struct sk_buff *skb)
1241 {
1242         struct tcp_md5sig_pool *hp;
1243         struct ahash_request *req;
1244         const struct tcphdr *th = tcp_hdr(skb);
1245         __be32 saddr, daddr;
1246
1247         if (sk) { /* valid for establish/request sockets */
1248                 saddr = sk->sk_rcv_saddr;
1249                 daddr = sk->sk_daddr;
1250         } else {
1251                 const struct iphdr *iph = ip_hdr(skb);
1252                 saddr = iph->saddr;
1253                 daddr = iph->daddr;
1254         }
1255
1256         hp = tcp_get_md5sig_pool();
1257         if (!hp)
1258                 goto clear_hash_noput;
1259         req = hp->md5_req;
1260
1261         if (crypto_ahash_init(req))
1262                 goto clear_hash;
1263
1264         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1265                 goto clear_hash;
1266         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1267                 goto clear_hash;
1268         if (tcp_md5_hash_key(hp, key))
1269                 goto clear_hash;
1270         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1271         if (crypto_ahash_final(req))
1272                 goto clear_hash;
1273
1274         tcp_put_md5sig_pool();
1275         return 0;
1276
1277 clear_hash:
1278         tcp_put_md5sig_pool();
1279 clear_hash_noput:
1280         memset(md5_hash, 0, 16);
1281         return 1;
1282 }
1283 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1284
1285 #endif
1286
1287 /* Called with rcu_read_lock() */
1288 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1289                                     const struct sk_buff *skb)
1290 {
1291 #ifdef CONFIG_TCP_MD5SIG
1292         /*
1293          * This gets called for each TCP segment that arrives
1294          * so we want to be efficient.
1295          * We have 3 drop cases:
1296          * o No MD5 hash and one expected.
1297          * o MD5 hash and we're not expecting one.
1298          * o MD5 hash and it's wrong.
1299          */
1300         const __u8 *hash_location = NULL;
1301         struct tcp_md5sig_key *hash_expected;
1302         const struct iphdr *iph = ip_hdr(skb);
1303         const struct tcphdr *th = tcp_hdr(skb);
1304         int genhash;
1305         unsigned char newhash[16];
1306
1307         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1308                                           AF_INET);
1309         hash_location = tcp_parse_md5sig_option(th);
1310
1311         /* We've parsed the options - do we have a hash? */
1312         if (!hash_expected && !hash_location)
1313                 return false;
1314
1315         if (hash_expected && !hash_location) {
1316                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1317                 return true;
1318         }
1319
1320         if (!hash_expected && hash_location) {
1321                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1322                 return true;
1323         }
1324
1325         /* Okay, so this is hash_expected and hash_location -
1326          * so we need to calculate the checksum.
1327          */
1328         genhash = tcp_v4_md5_hash_skb(newhash,
1329                                       hash_expected,
1330                                       NULL, skb);
1331
1332         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1333                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1334                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1335                                      &iph->saddr, ntohs(th->source),
1336                                      &iph->daddr, ntohs(th->dest),
1337                                      genhash ? " tcp_v4_calc_md5_hash failed"
1338                                      : "");
1339                 return true;
1340         }
1341         return false;
1342 #endif
1343         return false;
1344 }
1345
1346 static void tcp_v4_init_req(struct request_sock *req,
1347                             const struct sock *sk_listener,
1348                             struct sk_buff *skb)
1349 {
1350         struct inet_request_sock *ireq = inet_rsk(req);
1351         struct net *net = sock_net(sk_listener);
1352
1353         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1354         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1355         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1356 }
1357
1358 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1359                                           struct flowi *fl,
1360                                           const struct request_sock *req)
1361 {
1362         return inet_csk_route_req(sk, &fl->u.ip4, req);
1363 }
1364
1365 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1366         .family         =       PF_INET,
1367         .obj_size       =       sizeof(struct tcp_request_sock),
1368         .rtx_syn_ack    =       tcp_rtx_synack,
1369         .send_ack       =       tcp_v4_reqsk_send_ack,
1370         .destructor     =       tcp_v4_reqsk_destructor,
1371         .send_reset     =       tcp_v4_send_reset,
1372         .syn_ack_timeout =      tcp_syn_ack_timeout,
1373 };
1374
1375 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1376         .mss_clamp      =       TCP_MSS_DEFAULT,
1377 #ifdef CONFIG_TCP_MD5SIG
1378         .req_md5_lookup =       tcp_v4_md5_lookup,
1379         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1380 #endif
1381         .init_req       =       tcp_v4_init_req,
1382 #ifdef CONFIG_SYN_COOKIES
1383         .cookie_init_seq =      cookie_v4_init_sequence,
1384 #endif
1385         .route_req      =       tcp_v4_route_req,
1386         .init_seq       =       tcp_v4_init_seq,
1387         .init_ts_off    =       tcp_v4_init_ts_off,
1388         .send_synack    =       tcp_v4_send_synack,
1389 };
1390
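/* Listener entry point for an incoming SYN: drop anything that arrived
 * via broadcast/multicast, then hand off to the protocol-independent
 * tcp_conn_request() with the IPv4 ops tables defined above.
 */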
1391 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1392 {
1393         /* Never answer SYNs sent to broadcast or multicast addresses */
1394         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1395                 goto drop;
1396
1397         return tcp_conn_request(&tcp_request_sock_ops,
1398                                 &tcp_request_sock_ipv4_ops, sk, skb);
1399
1400 drop:
1401         tcp_listendrop(sk);
1402         return 0;
1403 }
1404 EXPORT_SYMBOL(tcp_v4_conn_request);
1405
1406
1407 /*
1408  * The three-way handshake has completed - we got a valid synack -
1409  * now create the new socket.
1410  */
1411 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1412                                   struct request_sock *req,
1413                                   struct dst_entry *dst,
1414                                   struct request_sock *req_unhash,
1415                                   bool *own_req)
1416 {
1417         struct inet_request_sock *ireq;
1418         bool found_dup_sk = false;
1419         struct inet_sock *newinet;
1420         struct tcp_sock *newtp;
1421         struct sock *newsk;
1422 #ifdef CONFIG_TCP_MD5SIG
1423         struct tcp_md5sig_key *key;
1424 #endif
1425         struct ip_options_rcu *inet_opt;
1426
1427         if (sk_acceptq_is_full(sk))
1428                 goto exit_overflow;
1429
1430         newsk = tcp_create_openreq_child(sk, req, skb);
1431         if (!newsk)
1432                 goto exit_nonewsk;
1433
1434         newsk->sk_gso_type = SKB_GSO_TCPV4;
1435         inet_sk_rx_dst_set(newsk, skb);
1436
1437         newtp                 = tcp_sk(newsk);
1438         newinet               = inet_sk(newsk);
1439         ireq                  = inet_rsk(req);
1440         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1441         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1442         newsk->sk_bound_dev_if = ireq->ir_iif;
1443         newinet->inet_saddr   = ireq->ir_loc_addr;
1444         inet_opt              = rcu_dereference(ireq->ireq_opt);
1445         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1446         newinet->mc_index     = inet_iif(skb);
1447         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1448         newinet->rcv_tos      = ip_hdr(skb)->tos;
1449         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1450         if (inet_opt)
1451                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1452         newinet->inet_id = prandom_u32();
1453
1454         if (!dst) {
1455                 dst = inet_csk_route_child_sock(sk, newsk, req);
1456                 if (!dst)
1457                         goto put_and_exit;
1458         } else {
1459                 /* syncookie case: see end of cookie_v4_check() */
1460         }
1461         sk_setup_caps(newsk, dst);
1462
1463         tcp_ca_openreq_child(newsk, dst);
1464
1465         tcp_sync_mss(newsk, dst_mtu(dst));
1466         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1467
1468         tcp_initialize_rcv_mss(newsk);
1469
1470 #ifdef CONFIG_TCP_MD5SIG
1471         /* Copy over the MD5 key from the original socket */
1472         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1473                                 AF_INET);
1474         if (key) {
1475                 /*
1476                  * We're using one, so create a matching key
1477                  * on the newsk structure. If we fail to get
1478                  * memory, then we end up not copying the key
1479                  * across. Shucks.
1480                  */
1481                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1482                                AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1483                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1484         }
1485 #endif
1486
1487         if (__inet_inherit_port(sk, newsk) < 0)
1488                 goto put_and_exit;
1489         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1490                                        &found_dup_sk);
1491         if (likely(*own_req)) {
1492                 tcp_move_syn(newtp, req);
1493                 ireq->ireq_opt = NULL;
1494         } else {
1495                 newinet->inet_opt = NULL;
1496
1497                 if (!req_unhash && found_dup_sk) {
1498                         /* This code path should only be executed in the
1499                          * syncookie case
1500                          */
1501                         bh_unlock_sock(newsk);
1502                         sock_put(newsk);
1503                         newsk = NULL;
1504                 }
1505         }
1506         return newsk;
1507
1508 exit_overflow:
1509         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1510 exit_nonewsk:
1511         dst_release(dst);
1512 exit:
1513         tcp_listendrop(sk);
1514         return NULL;
1515 put_and_exit:
1516         newinet->inet_opt = NULL;
1517         inet_csk_prepare_forced_close(newsk);
1518         tcp_done(newsk);
1519         goto exit;
1520 }
1521 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1522
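/* With CONFIG_SYN_COOKIES, a non-SYN segment hitting a listener may be
 * the ACK that completes a cookie-based handshake; cookie_v4_check()
 * validates the cookie and may return a freshly created child socket.
 * Otherwise the listener itself is returned unchanged.
 */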
1523 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1524 {
1525 #ifdef CONFIG_SYN_COOKIES
1526         const struct tcphdr *th = tcp_hdr(skb);
1527
1528         if (!th->syn)
1529                 sk = cookie_v4_check(sk, skb);
1530 #endif
1531         return sk;
1532 }
1533
1534 /* The socket must have its spinlock held when we get
1535  * here, unless it is a TCP_LISTEN socket.
1536  *
1537  * We have a potential double-lock case here, so even when
1538  * doing backlog processing we use the BH locking scheme.
1539  * This is because we cannot sleep with the original spinlock
1540  * held.
1541  */
1542 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1543 {
1544         struct sock *rsk;
1545
1546         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1547                 struct dst_entry *dst;
1548
1549                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1550                                                 lockdep_sock_is_held(sk));
1551
1552                 sock_rps_save_rxhash(sk, skb);
1553                 sk_mark_napi_id(sk, skb);
1554                 if (dst) {
1555                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1556                             !dst->ops->check(dst, 0)) {
1557                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1558                                 dst_release(dst);
1559                         }
1560                 }
1561                 tcp_rcv_established(sk, skb);
1562                 return 0;
1563         }
1564
1565         if (tcp_checksum_complete(skb))
1566                 goto csum_err;
1567
1568         if (sk->sk_state == TCP_LISTEN) {
1569                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1570
1571                 if (!nsk)
1572                         goto discard;
1573                 if (nsk != sk) {
1574                         if (tcp_child_process(sk, nsk, skb)) {
1575                                 rsk = nsk;
1576                                 goto reset;
1577                         }
1578                         return 0;
1579                 }
1580         } else
1581                 sock_rps_save_rxhash(sk, skb);
1582
1583         if (tcp_rcv_state_process(sk, skb)) {
1584                 rsk = sk;
1585                 goto reset;
1586         }
1587         return 0;
1588
1589 reset:
1590         tcp_v4_send_reset(rsk, skb);
1591 discard:
1592         kfree_skb(skb);
1593         /* Be careful here. If this function gets more complicated and
1594          * gcc suffers from register pressure on the x86, sk (in %ebx)
1595          * might be destroyed here. This current version compiles correctly,
1596          * but you have been warned.
1597          */
1598         return 0;
1599
1600 csum_err:
1601         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1602         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1603         goto discard;
1604 }
1605 EXPORT_SYMBOL(tcp_v4_do_rcv);
1606
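/* Early demux: invoked before the routing decision, it looks up an
 * established socket for this 4-tuple so that the route cached on the
 * socket (sk_rx_dst) can be reused, skipping a full FIB lookup in the
 * common case.
 */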
1607 int tcp_v4_early_demux(struct sk_buff *skb)
1608 {
1609         const struct iphdr *iph;
1610         const struct tcphdr *th;
1611         struct sock *sk;
1612
1613         if (skb->pkt_type != PACKET_HOST)
1614                 return 0;
1615
1616         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1617                 return 0;
1618
1619         iph = ip_hdr(skb);
1620         th = tcp_hdr(skb);
1621
1622         if (th->doff < sizeof(struct tcphdr) / 4)
1623                 return 0;
1624
1625         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1626                                        iph->saddr, th->source,
1627                                        iph->daddr, ntohs(th->dest),
1628                                        skb->skb_iif, inet_sdif(skb));
1629         if (sk) {
1630                 skb->sk = sk;
1631                 skb->destructor = sock_edemux;
1632                 if (sk_fullsock(sk)) {
1633                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1634
1635                         if (dst)
1636                                 dst = dst_check(dst, 0);
1637                         if (dst &&
1638                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1639                                 skb_dst_set_noref(skb, dst);
1640                 }
1641         }
1642         return 0;
1643 }
1644
1645 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1646 {
1647         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1648
1649         /* Only the socket owner can try to collapse/prune rx queues
1650          * to reduce memory overhead, so add a little headroom here.
1651          * Only a few socket backlogs are likely to be non-empty at once.
1652          */
1653         limit += 64*1024;
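        /* Roughly: for a socket still at its initial buffer sizes this is
         * about sysctl_tcp_rmem[1] + sysctl_tcp_wmem[1] + 64KB; receive
         * buffer autotuning can grow the limit well beyond that.
         */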
1654
1655         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1656          * we can fix skb->truesize to its real value to avoid future drops.
1657          * This is valid because skb is not yet charged to the socket.
1658          * It has been noticed that pure SACK packets were sometimes dropped
1659          * (if cooked by drivers without the copybreak feature).
1660          */
1661         skb_condense(skb);
1662
1663         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1664                 bh_unlock_sock(sk);
1665                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1666                 return true;
1667         }
1668         return false;
1669 }
1670 EXPORT_SYMBOL(tcp_add_backlog);
1671
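/* Run the socket's attached BPF filter on the skb; the filter may trim
 * the payload but never below the TCP header (th->doff * 4 bytes).
 */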
1672 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1673 {
1674         struct tcphdr *th = (struct tcphdr *)skb->data;
1675
1676         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1677 }
1678 EXPORT_SYMBOL(tcp_filter);
1679
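/* TCP reuses the IP layer's skb->cb area (IPCB) for TCP_SKB_CB, see
 * tcp_v4_fill_cb() below.  Before an skb is handed back to code that
 * expects IPCB to be intact (e.g. when we redo the socket lookup), the
 * saved inet_skb_parm must be restored.
 */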
1680 static void tcp_v4_restore_cb(struct sk_buff *skb)
1681 {
1682         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1683                 sizeof(struct inet_skb_parm));
1684 }
1685
1686 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1687                            const struct tcphdr *th)
1688 {
1689         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1690          * barrier() makes sure the compiler won't play fool^Waliasing games.
1691          */
1692         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1693                 sizeof(struct inet_skb_parm));
1694         barrier();
1695
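        /* SYN and FIN each consume one unit of sequence space, hence the
         * th->syn + th->fin terms in the end_seq computation below.
         */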
1696         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1697         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1698                                     skb->len - th->doff * 4);
1699         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1700         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1701         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1702         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1703         TCP_SKB_CB(skb)->sacked  = 0;
1704         TCP_SKB_CB(skb)->has_rxtstamp =
1705                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1706 }
1707
1708 /*
1709  *      From tcp_input.c
1710  */
1711
1712 int tcp_v4_rcv(struct sk_buff *skb)
1713 {
1714         struct net *net = dev_net(skb->dev);
1715         int sdif = inet_sdif(skb);
1716         const struct iphdr *iph;
1717         const struct tcphdr *th;
1718         bool refcounted;
1719         struct sock *sk;
1720         int ret;
1721
1722         if (skb->pkt_type != PACKET_HOST)
1723                 goto discard_it;
1724
1725         /* Count it even if it's bad */
1726         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1727
1728         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1729                 goto discard_it;
1730
1731         th = (const struct tcphdr *)skb->data;
1732
1733         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1734                 goto bad_packet;
1735         if (!pskb_may_pull(skb, th->doff * 4))
1736                 goto discard_it;
1737
1738         /* An explanation is required here, I think.
1739          * Packet length and doff are validated by header prediction,
1740          * provided the case of th->doff==0 is eliminated.
1741          * So, we defer the checks. */
1742
1743         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1744                 goto csum_error;
1745
1746         th = (const struct tcphdr *)skb->data;
1747         iph = ip_hdr(skb);
1748 lookup:
1749         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1750                                th->dest, sdif, &refcounted);
1751         if (!sk)
1752                 goto no_tcp_socket;
1753
1754 process:
1755         if (sk->sk_state == TCP_TIME_WAIT)
1756                 goto do_time_wait;
1757
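        /* A TCP_NEW_SYN_RECV "socket" is really a request_sock.  Validate
         * MD5 and checksum against its listener, then let tcp_check_req()
         * either promote it to a full child socket or report that another
         * CPU stole the request, in which case we redo the lookup.
         */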
1758         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1759                 struct request_sock *req = inet_reqsk(sk);
1760                 bool req_stolen = false;
1761                 struct sock *nsk;
1762
1763                 sk = req->rsk_listener;
1764                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1765                         sk_drops_add(sk, skb);
1766                         reqsk_put(req);
1767                         goto discard_it;
1768                 }
1769                 if (tcp_checksum_complete(skb)) {
1770                         reqsk_put(req);
1771                         goto csum_error;
1772                 }
1773                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1774                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1775                         goto lookup;
1776                 }
1777                 /* We own a reference on the listener, increase it again
1778                  * as we might lose it too soon.
1779                  */
1780                 sock_hold(sk);
1781                 refcounted = true;
1782                 nsk = NULL;
1783                 if (!tcp_filter(sk, skb)) {
1784                         th = (const struct tcphdr *)skb->data;
1785                         iph = ip_hdr(skb);
1786                         tcp_v4_fill_cb(skb, iph, th);
1787                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1788                 }
1789                 if (!nsk) {
1790                         reqsk_put(req);
1791                         if (req_stolen) {
1792                                 /* Another cpu got exclusive access to req
1793                                  * and created a full-blown socket.
1794                                  * Try to feed this packet to this socket
1795                                  * instead of discarding it.
1796                                  */
1797                                 tcp_v4_restore_cb(skb);
1798                                 sock_put(sk);
1799                                 goto lookup;
1800                         }
1801                         goto discard_and_relse;
1802                 }
1803                 if (nsk == sk) {
1804                         reqsk_put(req);
1805                         tcp_v4_restore_cb(skb);
1806                 } else if (tcp_child_process(sk, nsk, skb)) {
1807                         tcp_v4_send_reset(nsk, skb);
1808                         goto discard_and_relse;
1809                 } else {
1810                         sock_put(sk);
1811                         return 0;
1812                 }
1813         }
1814         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1815                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1816                 goto discard_and_relse;
1817         }
1818
1819         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1820                 goto discard_and_relse;
1821
1822         if (tcp_v4_inbound_md5_hash(sk, skb))
1823                 goto discard_and_relse;
1824
1825         nf_reset(skb);
1826
1827         if (tcp_filter(sk, skb))
1828                 goto discard_and_relse;
1829         th = (const struct tcphdr *)skb->data;
1830         iph = ip_hdr(skb);
1831         tcp_v4_fill_cb(skb, iph, th);
1832
1833         skb->dev = NULL;
1834
1835         if (sk->sk_state == TCP_LISTEN) {
1836                 ret = tcp_v4_do_rcv(sk, skb);
1837                 goto put_and_return;
1838         }
1839
1840         sk_incoming_cpu_update(sk);
1841
1842         bh_lock_sock_nested(sk);
1843         tcp_segs_in(tcp_sk(sk), skb);
1844         ret = 0;
1845         if (!sock_owned_by_user(sk)) {
1846                 ret = tcp_v4_do_rcv(sk, skb);
1847         } else if (tcp_add_backlog(sk, skb)) {
1848                 goto discard_and_relse;
1849         }
1850         bh_unlock_sock(sk);
1851
1852 put_and_return:
1853         if (refcounted)
1854                 sock_put(sk);
1855
1856         return ret;
1857
1858 no_tcp_socket:
1859         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1860                 goto discard_it;
1861
1862         tcp_v4_fill_cb(skb, iph, th);
1863
1864         if (tcp_checksum_complete(skb)) {
1865 csum_error:
1866                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1867 bad_packet:
1868                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1869         } else {
1870                 tcp_v4_send_reset(NULL, skb);
1871         }
1872
1873 discard_it:
1874         /* Discard frame. */
1875         kfree_skb(skb);
1876         return 0;
1877
1878 discard_and_relse:
1879         sk_drops_add(sk, skb);
1880         if (refcounted)
1881                 sock_put(sk);
1882         goto discard_it;
1883
1884 do_time_wait:
1885         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1886                 inet_twsk_put(inet_twsk(sk));
1887                 goto discard_it;
1888         }
1889
1890         tcp_v4_fill_cb(skb, iph, th);
1891
1892         if (tcp_checksum_complete(skb)) {
1893                 inet_twsk_put(inet_twsk(sk));
1894                 goto csum_error;
1895         }
1896         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1897         case TCP_TW_SYN: {
1898                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1899                                                         &tcp_hashinfo, skb,
1900                                                         __tcp_hdrlen(th),
1901                                                         iph->saddr, th->source,
1902                                                         iph->daddr, th->dest,
1903                                                         inet_iif(skb),
1904                                                         sdif);
1905                 if (sk2) {
1906                         inet_twsk_deschedule_put(inet_twsk(sk));
1907                         sk = sk2;
1908                         tcp_v4_restore_cb(skb);
1909                         refcounted = false;
1910                         goto process;
1911                 }
1912         }
1913                 /* to ACK */
1914                 /* fall through */
1915         case TCP_TW_ACK:
1916                 tcp_v4_timewait_ack(sk, skb);
1917                 break;
1918         case TCP_TW_RST:
1919                 tcp_v4_send_reset(sk, skb);
1920                 inet_twsk_deschedule_put(inet_twsk(sk));
1921                 goto discard_it;
1922         case TCP_TW_SUCCESS:;
1923         }
1924         goto discard_it;
1925 }
1926
1927 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1928         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1929         .twsk_unique    = tcp_twsk_unique,
1930         .twsk_destructor= tcp_twsk_destructor,
1931 };
1932
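/* Cache the input route on the socket so that tcp_v4_do_rcv() and early
 * demux can reuse it instead of doing a route lookup per packet.  A
 * reference is taken only if dst_hold_safe() succeeds.
 */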
1933 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1934 {
1935         struct dst_entry *dst = skb_dst(skb);
1936
1937         if (dst && dst_hold_safe(dst)) {
1938                 rcu_assign_pointer(sk->sk_rx_dst, dst);
1939                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1940         }
1941 }
1942 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1943
1944 const struct inet_connection_sock_af_ops ipv4_specific = {
1945         .queue_xmit        = ip_queue_xmit,
1946         .send_check        = tcp_v4_send_check,
1947         .rebuild_header    = inet_sk_rebuild_header,
1948         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1949         .conn_request      = tcp_v4_conn_request,
1950         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1951         .net_header_len    = sizeof(struct iphdr),
1952         .setsockopt        = ip_setsockopt,
1953         .getsockopt        = ip_getsockopt,
1954         .addr2sockaddr     = inet_csk_addr2sockaddr,
1955         .sockaddr_len      = sizeof(struct sockaddr_in),
1956 #ifdef CONFIG_COMPAT
1957         .compat_setsockopt = compat_ip_setsockopt,
1958         .compat_getsockopt = compat_ip_getsockopt,
1959 #endif
1960         .mtu_reduced       = tcp_v4_mtu_reduced,
1961 };
1962 EXPORT_SYMBOL(ipv4_specific);
1963
1964 #ifdef CONFIG_TCP_MD5SIG
1965 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1966         .md5_lookup             = tcp_v4_md5_lookup,
1967         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1968         .md5_parse              = tcp_v4_parse_md5_keys,
1969 };
1970 #endif
1971
1972 /* NOTE: A lot of things are set to zero explicitly by the call to
1973  *       sk_alloc(), so they need not be done here.
1974  */
1975 static int tcp_v4_init_sock(struct sock *sk)
1976 {
1977         struct inet_connection_sock *icsk = inet_csk(sk);
1978
1979         tcp_init_sock(sk);
1980
1981         icsk->icsk_af_ops = &ipv4_specific;
1982
1983 #ifdef CONFIG_TCP_MD5SIG
1984         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1985 #endif
1986
1987         return 0;
1988 }
1989
1990 void tcp_v4_destroy_sock(struct sock *sk)
1991 {
1992         struct tcp_sock *tp = tcp_sk(sk);
1993
1994         trace_tcp_destroy_sock(sk);
1995
1996         tcp_clear_xmit_timers(sk);
1997
1998         tcp_cleanup_congestion_control(sk);
1999
2000         tcp_cleanup_ulp(sk);
2001
2002         /* Clean up the write buffer. */
2003         tcp_write_queue_purge(sk);
2004
2005         /* Check if we want to disable active TFO */
2006         tcp_fastopen_active_disable_ofo_check(sk);
2007
2008         /* Cleans up our, hopefully empty, out_of_order_queue. */
2009         skb_rbtree_purge(&tp->out_of_order_queue);
2010
2011 #ifdef CONFIG_TCP_MD5SIG
2012         /* Clean up the MD5 key list, if any */
2013         if (tp->md5sig_info) {
2014                 tcp_clear_md5_list(sk);
2015                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2016                 tp->md5sig_info = NULL;
2017         }
2018 #endif
2019
2020         /* Clean up a referenced TCP bind bucket. */
2021         if (inet_csk(sk)->icsk_bind_hash)
2022                 inet_put_port(sk);
2023
2024         BUG_ON(tp->fastopen_rsk);
2025
2026         /* If the socket is aborted during the connect operation */
2027         tcp_free_fastopen_req(tp);
2028         tcp_fastopen_destroy_cipher(sk);
2029         tcp_saved_syn_free(tp);
2030
2031         sk_sockets_allocated_dec(sk);
2032 }
2033 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2034
2035 #ifdef CONFIG_PROC_FS
2036 /* Proc filesystem TCP sock list dumping. */
2037
2038 /*
2039  * Get the next listener socket following cur.  If cur is NULL, get the first
2040  * socket starting from the bucket given in st->bucket; when st->bucket is zero the
2041  * very first socket in the hash table is returned.
2042  */
2043 static void *listening_get_next(struct seq_file *seq, void *cur)
2044 {
2045         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2046         struct tcp_iter_state *st = seq->private;
2047         struct net *net = seq_file_net(seq);
2048         struct inet_listen_hashbucket *ilb;
2049         struct hlist_nulls_node *node;
2050         struct sock *sk = cur;
2051
2052         if (!sk) {
2053 get_head:
2054                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2055                 spin_lock(&ilb->lock);
2056                 sk = sk_nulls_head(&ilb->nulls_head);
2057                 st->offset = 0;
2058                 goto get_sk;
2059         }
2060         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2061         ++st->num;
2062         ++st->offset;
2063
2064         sk = sk_nulls_next(sk);
2065 get_sk:
2066         sk_nulls_for_each_from(sk, node) {
2067                 if (!net_eq(sock_net(sk), net))
2068                         continue;
2069                 if (sk->sk_family == afinfo->family)
2070                         return sk;
2071         }
2072         spin_unlock(&ilb->lock);
2073         st->offset = 0;
2074         if (++st->bucket < INET_LHTABLE_SIZE)
2075                 goto get_head;
2076         return NULL;
2077 }
2078
2079 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2080 {
2081         struct tcp_iter_state *st = seq->private;
2082         void *rc;
2083
2084         st->bucket = 0;
2085         st->offset = 0;
2086         rc = listening_get_next(seq, NULL);
2087
2088         while (rc && *pos) {
2089                 rc = listening_get_next(seq, rc);
2090                 --*pos;
2091         }
2092         return rc;
2093 }
2094
2095 static inline bool empty_bucket(const struct tcp_iter_state *st)
2096 {
2097         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2098 }
2099
2100 /*
2101  * Get first established socket starting from bucket given in st->bucket.
2102  * If st->bucket is zero, the very first socket in the hash is returned.
2103  */
2104 static void *established_get_first(struct seq_file *seq)
2105 {
2106         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2107         struct tcp_iter_state *st = seq->private;
2108         struct net *net = seq_file_net(seq);
2109         void *rc = NULL;
2110
2111         st->offset = 0;
2112         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2113                 struct sock *sk;
2114                 struct hlist_nulls_node *node;
2115                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2116
2117                 /* Lockless fast path for the common case of empty buckets */
2118                 if (empty_bucket(st))
2119                         continue;
2120
2121                 spin_lock_bh(lock);
2122                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2123                         if (sk->sk_family != afinfo->family ||
2124                             !net_eq(sock_net(sk), net)) {
2125                                 continue;
2126                         }
2127                         rc = sk;
2128                         goto out;
2129                 }
2130                 spin_unlock_bh(lock);
2131         }
2132 out:
2133         return rc;
2134 }
2135
2136 static void *established_get_next(struct seq_file *seq, void *cur)
2137 {
2138         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2139         struct sock *sk = cur;
2140         struct hlist_nulls_node *node;
2141         struct tcp_iter_state *st = seq->private;
2142         struct net *net = seq_file_net(seq);
2143
2144         ++st->num;
2145         ++st->offset;
2146
2147         sk = sk_nulls_next(sk);
2148
2149         sk_nulls_for_each_from(sk, node) {
2150                 if (sk->sk_family == afinfo->family &&
2151                     net_eq(sock_net(sk), net))
2152                         return sk;
2153         }
2154
2155         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2156         ++st->bucket;
2157         return established_get_first(seq);
2158 }
2159
2160 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2161 {
2162         struct tcp_iter_state *st = seq->private;
2163         void *rc;
2164
2165         st->bucket = 0;
2166         rc = established_get_first(seq);
2167
2168         while (rc && pos) {
2169                 rc = established_get_next(seq, rc);
2170                 --pos;
2171         }
2172         return rc;
2173 }
2174
2175 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2176 {
2177         void *rc;
2178         struct tcp_iter_state *st = seq->private;
2179
2180         st->state = TCP_SEQ_STATE_LISTENING;
2181         rc        = listening_get_idx(seq, &pos);
2182
2183         if (!rc) {
2184                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2185                 rc        = established_get_idx(seq, pos);
2186         }
2187
2188         return rc;
2189 }
2190
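/* Try to resume iteration at the bucket/offset remembered from the last
 * read, so that sequential reads of /proc/net/tcp do not rescan the
 * whole hash tables from the beginning.
 */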
2191 static void *tcp_seek_last_pos(struct seq_file *seq)
2192 {
2193         struct tcp_iter_state *st = seq->private;
2194         int bucket = st->bucket;
2195         int offset = st->offset;
2196         int orig_num = st->num;
2197         void *rc = NULL;
2198
2199         switch (st->state) {
2200         case TCP_SEQ_STATE_LISTENING:
2201                 if (st->bucket >= INET_LHTABLE_SIZE)
2202                         break;
2203                 st->state = TCP_SEQ_STATE_LISTENING;
2204                 rc = listening_get_next(seq, NULL);
2205                 while (offset-- && rc && bucket == st->bucket)
2206                         rc = listening_get_next(seq, rc);
2207                 if (rc)
2208                         break;
2209                 st->bucket = 0;
2210                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2211                 /* Fallthrough */
2212         case TCP_SEQ_STATE_ESTABLISHED:
2213                 if (st->bucket > tcp_hashinfo.ehash_mask)
2214                         break;
2215                 rc = established_get_first(seq);
2216                 while (offset-- && rc && bucket == st->bucket)
2217                         rc = established_get_next(seq, rc);
2218         }
2219
2220         st->num = orig_num;
2221
2222         return rc;
2223 }
2224
2225 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2226 {
2227         struct tcp_iter_state *st = seq->private;
2228         void *rc;
2229
2230         if (*pos && *pos == st->last_pos) {
2231                 rc = tcp_seek_last_pos(seq);
2232                 if (rc)
2233                         goto out;
2234         }
2235
2236         st->state = TCP_SEQ_STATE_LISTENING;
2237         st->num = 0;
2238         st->bucket = 0;
2239         st->offset = 0;
2240         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2241
2242 out:
2243         st->last_pos = *pos;
2244         return rc;
2245 }
2246 EXPORT_SYMBOL(tcp_seq_start);
2247
2248 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2249 {
2250         struct tcp_iter_state *st = seq->private;
2251         void *rc = NULL;
2252
2253         if (v == SEQ_START_TOKEN) {
2254                 rc = tcp_get_idx(seq, 0);
2255                 goto out;
2256         }
2257
2258         switch (st->state) {
2259         case TCP_SEQ_STATE_LISTENING:
2260                 rc = listening_get_next(seq, v);
2261                 if (!rc) {
2262                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2263                         st->bucket = 0;
2264                         st->offset = 0;
2265                         rc        = established_get_first(seq);
2266                 }
2267                 break;
2268         case TCP_SEQ_STATE_ESTABLISHED:
2269                 rc = established_get_next(seq, v);
2270                 break;
2271         }
2272 out:
2273         ++*pos;
2274         st->last_pos = *pos;
2275         return rc;
2276 }
2277 EXPORT_SYMBOL(tcp_seq_next);
2278
2279 void tcp_seq_stop(struct seq_file *seq, void *v)
2280 {
2281         struct tcp_iter_state *st = seq->private;
2282
2283         switch (st->state) {
2284         case TCP_SEQ_STATE_LISTENING:
2285                 if (v != SEQ_START_TOKEN)
2286                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2287                 break;
2288         case TCP_SEQ_STATE_ESTABLISHED:
2289                 if (v)
2290                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2291                 break;
2292         }
2293 }
2294 EXPORT_SYMBOL(tcp_seq_stop);
2295
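/* The three helpers below emit one /proc/net/tcp line per request sock,
 * full socket and timewait socket respectively, in the classic format
 * announced by the header in tcp4_seq_show().  For example, a listening
 * socket bound to 127.0.0.1:22 would show up roughly as
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000 0 0 <inode> ...
 *
 * (addresses and ports are hex; the byte order of the address follows
 * the host's endianness, shown here for little-endian).
 */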
2296 static void get_openreq4(const struct request_sock *req,
2297                          struct seq_file *f, int i)
2298 {
2299         const struct inet_request_sock *ireq = inet_rsk(req);
2300         long delta = req->rsk_timer.expires - jiffies;
2301
2302         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2303                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2304                 i,
2305                 ireq->ir_loc_addr,
2306                 ireq->ir_num,
2307                 ireq->ir_rmt_addr,
2308                 ntohs(ireq->ir_rmt_port),
2309                 TCP_SYN_RECV,
2310                 0, 0, /* could print option size, but that is af dependent. */
2311                 1,    /* timers active (only the expire timer) */
2312                 jiffies_delta_to_clock_t(delta),
2313                 req->num_timeout,
2314                 from_kuid_munged(seq_user_ns(f),
2315                                  sock_i_uid(req->rsk_listener)),
2316                 0,  /* non standard timer */
2317                 0, /* open_requests have no inode */
2318                 0,
2319                 req);
2320 }
2321
2322 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2323 {
2324         int timer_active;
2325         unsigned long timer_expires;
2326         const struct tcp_sock *tp = tcp_sk(sk);
2327         const struct inet_connection_sock *icsk = inet_csk(sk);
2328         const struct inet_sock *inet = inet_sk(sk);
2329         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2330         __be32 dest = inet->inet_daddr;
2331         __be32 src = inet->inet_rcv_saddr;
2332         __u16 destp = ntohs(inet->inet_dport);
2333         __u16 srcp = ntohs(inet->inet_sport);
2334         int rx_queue;
2335         int state;
2336
2337         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2338             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2339             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2340                 timer_active    = 1;
2341                 timer_expires   = icsk->icsk_timeout;
2342         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2343                 timer_active    = 4;
2344                 timer_expires   = icsk->icsk_timeout;
2345         } else if (timer_pending(&sk->sk_timer)) {
2346                 timer_active    = 2;
2347                 timer_expires   = sk->sk_timer.expires;
2348         } else {
2349                 timer_active    = 0;
2350                 timer_expires = jiffies;
2351         }
2352
2353         state = inet_sk_state_load(sk);
2354         if (state == TCP_LISTEN)
2355                 rx_queue = sk->sk_ack_backlog;
2356         else
2357                 /* Because we don't lock the socket,
2358                  * we might find a transient negative value.
2359                  */
2360                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2361                                       READ_ONCE(tp->copied_seq), 0);
2362
2363         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2364                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2365                 i, src, srcp, dest, destp, state,
2366                 READ_ONCE(tp->write_seq) - tp->snd_una,
2367                 rx_queue,
2368                 timer_active,
2369                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2370                 icsk->icsk_retransmits,
2371                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2372                 icsk->icsk_probes_out,
2373                 sock_i_ino(sk),
2374                 refcount_read(&sk->sk_refcnt), sk,
2375                 jiffies_to_clock_t(icsk->icsk_rto),
2376                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2377                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2378                 tp->snd_cwnd,
2379                 state == TCP_LISTEN ?
2380                     fastopenq->max_qlen :
2381                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2382 }
2383
2384 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2385                                struct seq_file *f, int i)
2386 {
2387         long delta = tw->tw_timer.expires - jiffies;
2388         __be32 dest, src;
2389         __u16 destp, srcp;
2390
2391         dest  = tw->tw_daddr;
2392         src   = tw->tw_rcv_saddr;
2393         destp = ntohs(tw->tw_dport);
2394         srcp  = ntohs(tw->tw_sport);
2395
2396         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2397                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2398                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2399                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2400                 refcount_read(&tw->tw_refcnt), tw);
2401 }
2402
2403 #define TMPSZ 150
2404
2405 static int tcp4_seq_show(struct seq_file *seq, void *v)
2406 {
2407         struct tcp_iter_state *st;
2408         struct sock *sk = v;
2409
2410         seq_setwidth(seq, TMPSZ - 1);
2411         if (v == SEQ_START_TOKEN) {
2412                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2413                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2414                            "inode");
2415                 goto out;
2416         }
2417         st = seq->private;
2418
2419         if (sk->sk_state == TCP_TIME_WAIT)
2420                 get_timewait4_sock(v, seq, st->num);
2421         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2422                 get_openreq4(v, seq, st->num);
2423         else
2424                 get_tcp4_sock(v, seq, st->num);
2425 out:
2426         seq_pad(seq, '\n');
2427         return 0;
2428 }
2429
2430 static const struct seq_operations tcp4_seq_ops = {
2431         .show           = tcp4_seq_show,
2432         .start          = tcp_seq_start,
2433         .next           = tcp_seq_next,
2434         .stop           = tcp_seq_stop,
2435 };
2436
2437 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2438         .family         = AF_INET,
2439 };
2440
2441 static int __net_init tcp4_proc_init_net(struct net *net)
2442 {
2443         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2444                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2445                 return -ENOMEM;
2446         return 0;
2447 }
2448
2449 static void __net_exit tcp4_proc_exit_net(struct net *net)
2450 {
2451         remove_proc_entry("tcp", net->proc_net);
2452 }
2453
2454 static struct pernet_operations tcp4_net_ops = {
2455         .init = tcp4_proc_init_net,
2456         .exit = tcp4_proc_exit_net,
2457 };
2458
2459 int __init tcp4_proc_init(void)
2460 {
2461         return register_pernet_subsys(&tcp4_net_ops);
2462 }
2463
2464 void tcp4_proc_exit(void)
2465 {
2466         unregister_pernet_subsys(&tcp4_net_ops);
2467 }
2468 #endif /* CONFIG_PROC_FS */
2469
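/* The proto ops the generic socket layer uses for IPv4 TCP sockets;
 * af_inet registers this as the SOCK_STREAM/IPPROTO_TCP protocol at
 * boot.
 */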
2470 struct proto tcp_prot = {
2471         .name                   = "TCP",
2472         .owner                  = THIS_MODULE,
2473         .close                  = tcp_close,
2474         .pre_connect            = tcp_v4_pre_connect,
2475         .connect                = tcp_v4_connect,
2476         .disconnect             = tcp_disconnect,
2477         .accept                 = inet_csk_accept,
2478         .ioctl                  = tcp_ioctl,
2479         .init                   = tcp_v4_init_sock,
2480         .destroy                = tcp_v4_destroy_sock,
2481         .shutdown               = tcp_shutdown,
2482         .setsockopt             = tcp_setsockopt,
2483         .getsockopt             = tcp_getsockopt,
2484         .keepalive              = tcp_set_keepalive,
2485         .recvmsg                = tcp_recvmsg,
2486         .sendmsg                = tcp_sendmsg,
2487         .sendpage               = tcp_sendpage,
2488         .backlog_rcv            = tcp_v4_do_rcv,
2489         .release_cb             = tcp_release_cb,
2490         .hash                   = inet_hash,
2491         .unhash                 = inet_unhash,
2492         .get_port               = inet_csk_get_port,
2493         .enter_memory_pressure  = tcp_enter_memory_pressure,
2494         .leave_memory_pressure  = tcp_leave_memory_pressure,
2495         .stream_memory_free     = tcp_stream_memory_free,
2496         .sockets_allocated      = &tcp_sockets_allocated,
2497         .orphan_count           = &tcp_orphan_count,
2498         .memory_allocated       = &tcp_memory_allocated,
2499         .memory_pressure        = &tcp_memory_pressure,
2500         .sysctl_mem             = sysctl_tcp_mem,
2501         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2502         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2503         .max_header             = MAX_TCP_HEADER,
2504         .obj_size               = sizeof(struct tcp_sock),
2505         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2506         .twsk_prot              = &tcp_timewait_sock_ops,
2507         .rsk_prot               = &tcp_request_sock_ops,
2508         .h.hashinfo             = &tcp_hashinfo,
2509         .no_autobind            = true,
2510 #ifdef CONFIG_COMPAT
2511         .compat_setsockopt      = compat_tcp_setsockopt,
2512         .compat_getsockopt      = compat_tcp_getsockopt,
2513 #endif
2514         .diag_destroy           = tcp_abort,
2515 };
2516 EXPORT_SYMBOL(tcp_prot);
2517
2518 static void __net_exit tcp_sk_exit(struct net *net)
2519 {
2520         int cpu;
2521
2522         if (net->ipv4.tcp_congestion_control)
2523                 module_put(net->ipv4.tcp_congestion_control->owner);
2524
2525         for_each_possible_cpu(cpu)
2526                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2527         free_percpu(net->ipv4.tcp_sk);
2528 }
2529
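/* Per-netns setup: create one control socket per possible CPU (used to
 * send RSTs and ACKs that are not associated with a user socket) and
 * initialise this namespace's TCP sysctl defaults.
 */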
2530 static int __net_init tcp_sk_init(struct net *net)
2531 {
2532         int res, cpu, cnt;
2533
2534         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2535         if (!net->ipv4.tcp_sk)
2536                 return -ENOMEM;
2537
2538         for_each_possible_cpu(cpu) {
2539                 struct sock *sk;
2540
2541                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2542                                            IPPROTO_TCP, net);
2543                 if (res)
2544                         goto fail;
2545                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2546
2547                 /* Please enforce IP_DF and IPID==0 for RST and
2548                  * ACK sent in SYN-RECV and TIME-WAIT state.
2549                  */
2550                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2551
2552                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2553         }
2554
2555         net->ipv4.sysctl_tcp_ecn = 2;
2556         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2557
2558         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2559         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2560         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2561         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2562
2563         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2564         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2565         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2566
2567         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2568         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2569         net->ipv4.sysctl_tcp_syncookies = 1;
2570         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2571         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2572         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2573         net->ipv4.sysctl_tcp_orphan_retries = 0;
2574         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2575         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2576         net->ipv4.sysctl_tcp_tw_reuse = 2;
2577
2578         cnt = tcp_hashinfo.ehash_mask + 1;
2579         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2580         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2581
2582         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2583         net->ipv4.sysctl_tcp_sack = 1;
2584         net->ipv4.sysctl_tcp_window_scaling = 1;
2585         net->ipv4.sysctl_tcp_timestamps = 1;
2586         net->ipv4.sysctl_tcp_early_retrans = 3;
2587         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2588         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2589         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2590         net->ipv4.sysctl_tcp_max_reordering = 300;
2591         net->ipv4.sysctl_tcp_dsack = 1;
2592         net->ipv4.sysctl_tcp_app_win = 31;
2593         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2594         net->ipv4.sysctl_tcp_frto = 2;
2595         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2596         /* This limits the percentage of the congestion window which we
2597          * will allow a single TSO frame to consume.  Building TSO frames
2598          * which are too large can cause TCP streams to be bursty.
2599          */
2600         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2601         /* Default TSQ limit of four TSO segments */
2602         net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2603         /* rfc5961 challenge ack rate limiting */
2604         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2605         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2606         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2607         net->ipv4.sysctl_tcp_autocorking = 1;
2608         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2609         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2610         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2611         if (net != &init_net) {
2612                 memcpy(net->ipv4.sysctl_tcp_rmem,
2613                        init_net.ipv4.sysctl_tcp_rmem,
2614                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2615                 memcpy(net->ipv4.sysctl_tcp_wmem,
2616                        init_net.ipv4.sysctl_tcp_wmem,
2617                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2618         }
2619         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2620         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2621         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2622         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2623         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2624         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2625
2626         /* Reno is always built in */
2627         if (!net_eq(net, &init_net) &&
2628             try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2629                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2630         else
2631                 net->ipv4.tcp_congestion_control = &tcp_reno;
2632
2633         return 0;
2634 fail:
2635         tcp_sk_exit(net);
2636
2637         return res;
2638 }
2639
2640 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2641 {
2642         struct net *net;
2643
2644         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2645
2646         list_for_each_entry(net, net_exit_list, exit_list)
2647                 tcp_fastopen_ctx_destroy(net);
2648 }
2649
2650 static struct pernet_operations __net_initdata tcp_sk_ops = {
2651        .init       = tcp_sk_init,
2652        .exit       = tcp_sk_exit,
2653        .exit_batch = tcp_sk_exit_batch,
2654 };
2655
2656 void __init tcp_v4_init(void)
2657 {
2658         if (register_pernet_subsys(&tcp_sk_ops))
2659                 panic("Failed to create the TCP control socket.\n");
2660 }