
TOMOYO Linux Cross Reference
Linux/net/ipv4/tcp_ipv4.c


  1 /*
  2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  3  *              operating system.  INET is implemented using the  BSD Socket
  4  *              interface as the means of communication with the user level.
  5  *
  6  *              Implementation of the Transmission Control Protocol(TCP).
  7  *
  8  *              IPv4 specific functions
  9  *
 10  *
 11  *              code split from:
 12  *              linux/ipv4/tcp.c
 13  *              linux/ipv4/tcp_input.c
 14  *              linux/ipv4/tcp_output.c
 15  *
 16  *              See tcp.c for author information
 17  *
 18  *      This program is free software; you can redistribute it and/or
 19  *      modify it under the terms of the GNU General Public License
 20  *      as published by the Free Software Foundation; either version
 21  *      2 of the License, or (at your option) any later version.
 22  */
 23 
 24 /*
 25  * Changes:
 26  *              David S. Miller :       New socket lookup architecture.
 27  *                                      This code is dedicated to John Dyson.
 28  *              David S. Miller :       Change semantics of established hash,
 29  *                                      half is devoted to TIME_WAIT sockets
 30  *                                      and the rest go in the other half.
 31  *              Andi Kleen :            Add support for syncookies and fixed
 32  *                                      some bugs: ip options weren't passed to
 33  *                                      the TCP layer, missed a check for an
 34  *                                      ACK bit.
 35  *              Andi Kleen :            Implemented fast path mtu discovery.
 36  *                                      Fixed many serious bugs in the
 37  *                                      request_sock handling and moved
 38  *                                      most of it into the af independent code.
 39  *                                      Added tail drop and some other bugfixes.
 40  *                                      Added new listen semantics.
 41  *              Mike McLagan    :       Routing by source
 42  *      Juan Jose Ciarlante:            ip_dynaddr bits
 43  *              Andi Kleen:             various fixes.
 44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 45  *                                      coma.
 46  *      Andi Kleen              :       Fix new listen.
 47  *      Andi Kleen              :       Fix accept error reporting.
 48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
 50  *                                      a single port at the same time.
 51  */
 52 
 53 #define pr_fmt(fmt) "TCP: " fmt
 54 
 55 #include <linux/bottom_half.h>
 56 #include <linux/types.h>
 57 #include <linux/fcntl.h>
 58 #include <linux/module.h>
 59 #include <linux/random.h>
 60 #include <linux/cache.h>
 61 #include <linux/jhash.h>
 62 #include <linux/init.h>
 63 #include <linux/times.h>
 64 #include <linux/slab.h>
 65 
 66 #include <net/net_namespace.h>
 67 #include <net/icmp.h>
 68 #include <net/inet_hashtables.h>
 69 #include <net/tcp.h>
 70 #include <net/transp_v6.h>
 71 #include <net/ipv6.h>
 72 #include <net/inet_common.h>
 73 #include <net/timewait_sock.h>
 74 #include <net/xfrm.h>
 75 #include <net/secure_seq.h>
 76 #include <net/busy_poll.h>
 77 
 78 #include <linux/inet.h>
 79 #include <linux/ipv6.h>
 80 #include <linux/stddef.h>
 81 #include <linux/proc_fs.h>
 82 #include <linux/seq_file.h>
 83 #include <linux/inetdevice.h>
 84 
 85 #include <crypto/hash.h>
 86 #include <linux/scatterlist.h>
 87 
 88 #include <trace/events/tcp.h>
 89 
 90 #ifdef CONFIG_TCP_MD5SIG
 91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
 92                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
 93 #endif
 94 
 95 struct inet_hashinfo tcp_hashinfo;
 96 EXPORT_SYMBOL(tcp_hashinfo);
 97 
 98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
 99 {
100         return secure_tcp_seq(ip_hdr(skb)->daddr,
101                               ip_hdr(skb)->saddr,
102                               tcp_hdr(skb)->dest,
103                               tcp_hdr(skb)->source);
104 }
105 
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110 
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114         struct tcp_sock *tp = tcp_sk(sk);
115 
116         /* With PAWS, it is safe from the viewpoint
117            of data integrity. Even without PAWS it is safe provided sequence
118            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
119 
 120            Actually, the idea is close to VJ's, only the timestamp cache is
 121            held not per host but per port pair, and the TW bucket is used as
 122            the state holder.
 123 
 124            If the TW bucket has already been destroyed we fall back to VJ's
 125            scheme and use the initial timestamp retrieved from the peer table.
 126          */
127         if (tcptw->tw_ts_recent_stamp &&
128             (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
129                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131                 if (tp->write_seq == 0)
132                         tp->write_seq = 1;
133                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
134                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135                 sock_hold(sktw);
136                 return 1;
137         }
138 
139         return 0;
140 }
141 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
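
/*
 * Usage note (illustrative, not part of the original comment above): the
 * reuse path in tcp_twsk_unique() is gated by the per-namespace knob read as
 * sock_net(sk)->ipv4.sysctl_tcp_tw_reuse, exposed to userspace as the
 * net.ipv4.tcp_tw_reuse sysctl. With it enabled, an outgoing connect() that
 * collides with a local TIME_WAIT socket may take over the four-tuple once
 * the cached timestamp is more than one second old; write_seq is then bumped
 * past tw_snd_nxt so the new connection starts outside the old window.
 */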
142 
143 /* This will initiate an outgoing connection. */
144 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
145 {
146         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
147         struct inet_sock *inet = inet_sk(sk);
148         struct tcp_sock *tp = tcp_sk(sk);
149         __be16 orig_sport, orig_dport;
150         __be32 daddr, nexthop;
151         struct flowi4 *fl4;
152         struct rtable *rt;
153         int err;
154         struct ip_options_rcu *inet_opt;
155         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
156 
157         if (addr_len < sizeof(struct sockaddr_in))
158                 return -EINVAL;
159 
160         if (usin->sin_family != AF_INET)
161                 return -EAFNOSUPPORT;
162 
163         nexthop = daddr = usin->sin_addr.s_addr;
164         inet_opt = rcu_dereference_protected(inet->inet_opt,
165                                              lockdep_sock_is_held(sk));
166         if (inet_opt && inet_opt->opt.srr) {
167                 if (!daddr)
168                         return -EINVAL;
169                 nexthop = inet_opt->opt.faddr;
170         }
171 
172         orig_sport = inet->inet_sport;
173         orig_dport = usin->sin_port;
174         fl4 = &inet->cork.fl.u.ip4;
175         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
176                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
177                               IPPROTO_TCP,
178                               orig_sport, orig_dport, sk);
179         if (IS_ERR(rt)) {
180                 err = PTR_ERR(rt);
181                 if (err == -ENETUNREACH)
182                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
183                 return err;
184         }
185 
186         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
187                 ip_rt_put(rt);
188                 return -ENETUNREACH;
189         }
190 
191         if (!inet_opt || !inet_opt->opt.srr)
192                 daddr = fl4->daddr;
193 
194         if (!inet->inet_saddr)
195                 inet->inet_saddr = fl4->saddr;
196         sk_rcv_saddr_set(sk, inet->inet_saddr);
197 
198         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
199                 /* Reset inherited state */
200                 tp->rx_opt.ts_recent       = 0;
201                 tp->rx_opt.ts_recent_stamp = 0;
202                 if (likely(!tp->repair))
203                         tp->write_seq      = 0;
204         }
205 
206         inet->inet_dport = usin->sin_port;
207         sk_daddr_set(sk, daddr);
208 
209         inet_csk(sk)->icsk_ext_hdr_len = 0;
210         if (inet_opt)
211                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212 
213         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214 
215         /* Socket identity is still unknown (sport may be zero).
 216          * However we set the state to SYN-SENT and, without releasing the
 217          * socket lock, select a source port, enter ourselves into the hash
 218          * tables and complete initialization after this.
219          */
220         tcp_set_state(sk, TCP_SYN_SENT);
221         err = inet_hash_connect(tcp_death_row, sk);
222         if (err)
223                 goto failure;
224 
225         sk_set_txhash(sk);
226 
227         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228                                inet->inet_sport, inet->inet_dport, sk);
229         if (IS_ERR(rt)) {
230                 err = PTR_ERR(rt);
231                 rt = NULL;
232                 goto failure;
233         }
234         /* OK, now commit destination to socket.  */
235         sk->sk_gso_type = SKB_GSO_TCPV4;
236         sk_setup_caps(sk, &rt->dst);
237         rt = NULL;
238 
239         if (likely(!tp->repair)) {
240                 if (!tp->write_seq)
241                         tp->write_seq = secure_tcp_seq(inet->inet_saddr,
242                                                        inet->inet_daddr,
243                                                        inet->inet_sport,
244                                                        usin->sin_port);
245                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
246                                                  inet->inet_saddr,
247                                                  inet->inet_daddr);
248         }
249 
250         inet->inet_id = tp->write_seq ^ jiffies;
251 
252         if (tcp_fastopen_defer_connect(sk, &err))
253                 return err;
254         if (err)
255                 goto failure;
256 
257         err = tcp_connect(sk);
258 
259         if (err)
260                 goto failure;
261 
262         return 0;
263 
264 failure:
265         /*
266          * This unhashes the socket and releases the local port,
267          * if necessary.
268          */
269         tcp_set_state(sk, TCP_CLOSE);
270         ip_rt_put(rt);
271         sk->sk_route_caps = 0;
272         inet->inet_dport = 0;
273         return err;
274 }
275 EXPORT_SYMBOL(tcp_v4_connect);
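
/*
 * Illustrative userspace sketch (an assumed example, error handling trimmed):
 * an ordinary AF_INET stream connect() is what ultimately reaches
 * tcp_v4_connect() above, and the sockaddr it passes is exactly what the
 * addr_len / sin_family checks at the top of that function validate.
 *
 *	#include <arpa/inet.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	int tcp_client_connect(void)
 *	{
 *		struct sockaddr_in dst = { .sin_family = AF_INET,
 *					   .sin_port   = htons(80) };
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *		if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		return fd;	// the socket went through TCP_SYN_SENT here
 *	}
 */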
276 
277 /*
278  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
279  * It can be called through tcp_release_cb() if socket was owned by user
280  * at the time tcp_v4_err() was called to handle ICMP message.
281  */
282 void tcp_v4_mtu_reduced(struct sock *sk)
283 {
284         struct inet_sock *inet = inet_sk(sk);
285         struct dst_entry *dst;
286         u32 mtu;
287 
288         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
289                 return;
290         mtu = tcp_sk(sk)->mtu_info;
291         dst = inet_csk_update_pmtu(sk, mtu);
292         if (!dst)
293                 return;
294 
 295         /* Something is about to go wrong... Remember the soft error
 296          * in case this connection will not be able to recover.
297          */
298         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
299                 sk->sk_err_soft = EMSGSIZE;
300 
301         mtu = dst_mtu(dst);
302 
303         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
304             ip_sk_accept_pmtu(sk) &&
305             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
306                 tcp_sync_mss(sk, mtu);
307 
308                 /* Resend the TCP packet because it's
309                  * clear that the old packet has been
310                  * dropped. This is the new "fast" path mtu
311                  * discovery.
312                  */
313                 tcp_simple_retransmit(sk);
314         } /* else let the usual retransmit timer handle it */
315 }
316 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
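
/*
 * Illustrative userspace sketch (an assumed example): tcp_v4_mtu_reduced()
 * only shrinks the MSS when the socket has not opted out of path MTU
 * discovery (inet->pmtudisc != IP_PMTUDISC_DONT). An application can opt a
 * socket out roughly like this:
 *
 *	#include <netinet/in.h>
 *	#include <sys/socket.h>
 *
 *	int val = IP_PMTUDISC_DONT;
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 */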
317 
318 static void do_redirect(struct sk_buff *skb, struct sock *sk)
319 {
320         struct dst_entry *dst = __sk_dst_check(sk, 0);
321 
322         if (dst)
323                 dst->ops->redirect(dst, sk, skb);
324 }
325 
326 
327 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
328 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
329 {
330         struct request_sock *req = inet_reqsk(sk);
331         struct net *net = sock_net(sk);
332 
333         /* ICMPs are not backlogged, hence we cannot get
334          * an established socket here.
335          */
336         if (seq != tcp_rsk(req)->snt_isn) {
337                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
338         } else if (abort) {
339                 /*
340                  * Still in SYN_RECV, just remove it silently.
341                  * There is no good way to pass the error to the newly
342                  * created socket, and POSIX does not want network
343                  * errors returned from accept().
344                  */
345                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
346                 tcp_listendrop(req->rsk_listener);
347         }
348         reqsk_put(req);
349 }
350 EXPORT_SYMBOL(tcp_req_err);
351 
352 /*
353  * This routine is called by the ICMP module when it gets some
354  * sort of error condition.  If err < 0 then the socket should
355  * be closed and the error returned to the user.  If err > 0
356  * it's just the icmp type << 8 | icmp code.  After adjustment
357  * header points to the first 8 bytes of the tcp header.  We need
358  * to find the appropriate port.
359  *
360  * The locking strategy used here is very "optimistic". When
361  * someone else accesses the socket the ICMP is just dropped
362  * and for some paths there is no check at all.
363  * A more general error queue to queue errors for later handling
364  * is probably better.
365  *
366  */
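
/*
 * Worked example of the err encoding described above: an ICMP "destination
 * unreachable / port unreachable" message (type 3, code 3) is reported as
 * err = (3 << 8) | 3 = 0x0303, and can be split back into its parts with:
 *
 *	int icmp_type = err >> 8;	// 3 == ICMP_DEST_UNREACH
 *	int icmp_code = err & 0xff;	// 3 == ICMP_PORT_UNREACH
 */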
367 
368 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
369 {
370         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
371         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
372         struct inet_connection_sock *icsk;
373         struct tcp_sock *tp;
374         struct inet_sock *inet;
375         const int type = icmp_hdr(icmp_skb)->type;
376         const int code = icmp_hdr(icmp_skb)->code;
377         struct sock *sk;
378         struct sk_buff *skb;
379         struct request_sock *fastopen;
380         u32 seq, snd_una;
381         s32 remaining;
382         u32 delta_us;
383         int err;
384         struct net *net = dev_net(icmp_skb->dev);
385 
386         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
387                                        th->dest, iph->saddr, ntohs(th->source),
388                                        inet_iif(icmp_skb), 0);
389         if (!sk) {
390                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
391                 return;
392         }
393         if (sk->sk_state == TCP_TIME_WAIT) {
394                 inet_twsk_put(inet_twsk(sk));
395                 return;
396         }
397         seq = ntohl(th->seq);
398         if (sk->sk_state == TCP_NEW_SYN_RECV)
399                 return tcp_req_err(sk, seq,
400                                   type == ICMP_PARAMETERPROB ||
401                                   type == ICMP_TIME_EXCEEDED ||
402                                   (type == ICMP_DEST_UNREACH &&
403                                    (code == ICMP_NET_UNREACH ||
404                                     code == ICMP_HOST_UNREACH)));
405 
406         bh_lock_sock(sk);
407         /* If too many ICMPs get dropped on busy
408          * servers this needs to be solved differently.
409          * We do take care of PMTU discovery (RFC1191) special case :
410          * we can receive locally generated ICMP messages while socket is held.
411          */
412         if (sock_owned_by_user(sk)) {
413                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
414                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
415         }
416         if (sk->sk_state == TCP_CLOSE)
417                 goto out;
418 
419         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
420                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
421                 goto out;
422         }
423 
424         icsk = inet_csk(sk);
425         tp = tcp_sk(sk);
426         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
427         fastopen = tp->fastopen_rsk;
428         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
429         if (sk->sk_state != TCP_LISTEN &&
430             !between(seq, snd_una, tp->snd_nxt)) {
431                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
432                 goto out;
433         }
434 
435         switch (type) {
436         case ICMP_REDIRECT:
437                 if (!sock_owned_by_user(sk))
438                         do_redirect(icmp_skb, sk);
439                 goto out;
440         case ICMP_SOURCE_QUENCH:
441                 /* Just silently ignore these. */
442                 goto out;
443         case ICMP_PARAMETERPROB:
444                 err = EPROTO;
445                 break;
446         case ICMP_DEST_UNREACH:
447                 if (code > NR_ICMP_UNREACH)
448                         goto out;
449 
450                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
451                         /* We are not interested in TCP_LISTEN and open_requests
 452                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
453                          * they should go through unfragmented).
454                          */
455                         if (sk->sk_state == TCP_LISTEN)
456                                 goto out;
457 
458                         tp->mtu_info = info;
459                         if (!sock_owned_by_user(sk)) {
460                                 tcp_v4_mtu_reduced(sk);
461                         } else {
462                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
463                                         sock_hold(sk);
464                         }
465                         goto out;
466                 }
467 
468                 err = icmp_err_convert[code].errno;
469                 /* check if icmp_skb allows revert of backoff
470                  * (see draft-zimmermann-tcp-lcd) */
471                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
472                         break;
473                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
474                     !icsk->icsk_backoff || fastopen)
475                         break;
476 
477                 if (sock_owned_by_user(sk))
478                         break;
479 
480                 icsk->icsk_backoff--;
481                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
482                                                TCP_TIMEOUT_INIT;
483                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
484 
485                 skb = tcp_rtx_queue_head(sk);
486                 BUG_ON(!skb);
487 
488                 tcp_mstamp_refresh(tp);
489                 delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
490                 remaining = icsk->icsk_rto -
491                             usecs_to_jiffies(delta_us);
492 
493                 if (remaining > 0) {
494                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
495                                                   remaining, TCP_RTO_MAX);
496                 } else {
497                         /* RTO revert clocked out retransmission.
498                          * Will retransmit now */
499                         tcp_retransmit_timer(sk);
500                 }
501 
502                 break;
503         case ICMP_TIME_EXCEEDED:
504                 err = EHOSTUNREACH;
505                 break;
506         default:
507                 goto out;
508         }
509 
510         switch (sk->sk_state) {
511         case TCP_SYN_SENT:
512         case TCP_SYN_RECV:
 513                 /* Only in fast or simultaneous open. If a fast open socket
 514                  * is already accepted it is treated as a connected one below.
515                  */
516                 if (fastopen && !fastopen->sk)
517                         break;
518 
519                 if (!sock_owned_by_user(sk)) {
520                         sk->sk_err = err;
521 
522                         sk->sk_error_report(sk);
523 
524                         tcp_done(sk);
525                 } else {
526                         sk->sk_err_soft = err;
527                 }
528                 goto out;
529         }
530 
531         /* If we've already connected we will keep trying
532          * until we time out, or the user gives up.
533          *
534          * rfc1122 4.2.3.9 allows to consider as hard errors
535          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
536          * but it is obsoleted by pmtu discovery).
537          *
 538          * Note that in the modern internet, where routing is unreliable
539          * and in each dark corner broken firewalls sit, sending random
 540          * errors ordered by their masters, even these two messages finally lose
541          * their original sense (even Linux sends invalid PORT_UNREACHs)
542          *
543          * Now we are in compliance with RFCs.
544          *                                                      --ANK (980905)
545          */
546 
547         inet = inet_sk(sk);
548         if (!sock_owned_by_user(sk) && inet->recverr) {
549                 sk->sk_err = err;
550                 sk->sk_error_report(sk);
551         } else  { /* Only an error on timeout */
552                 sk->sk_err_soft = err;
553         }
554 
555 out:
556         bh_unlock_sock(sk);
557         sock_put(sk);
558 }
559 
560 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
561 {
562         struct tcphdr *th = tcp_hdr(skb);
563 
564         if (skb->ip_summed == CHECKSUM_PARTIAL) {
565                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
566                 skb->csum_start = skb_transport_header(skb) - skb->head;
567                 skb->csum_offset = offsetof(struct tcphdr, check);
568         } else {
569                 th->check = tcp_v4_check(skb->len, saddr, daddr,
570                                          csum_partial(th,
571                                                       th->doff << 2,
572                                                       skb->csum));
573         }
574 }
575 
576 /* This routine computes an IPv4 TCP checksum. */
577 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
578 {
579         const struct inet_sock *inet = inet_sk(sk);
580 
581         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
582 }
583 EXPORT_SYMBOL(tcp_v4_send_check);
584 
585 /*
586  *      This routine will send an RST to the other tcp.
587  *
588  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
589  *                    for reset.
590  *      Answer: if a packet caused RST, it is not for a socket
 591  *      Answer: if a packet caused an RST, it is not for a socket
 592  *              existing in our system; if it is matched to a socket,
 593  *              it is just a duplicate segment or a bug in the other side's TCP.
 594  *              So we build the reply based only on parameters that
 595  *              arrived with the segment.
596  */
597 
598 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
599 {
600         const struct tcphdr *th = tcp_hdr(skb);
601         struct {
602                 struct tcphdr th;
603 #ifdef CONFIG_TCP_MD5SIG
604                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
605 #endif
606         } rep;
607         struct ip_reply_arg arg;
608 #ifdef CONFIG_TCP_MD5SIG
609         struct tcp_md5sig_key *key = NULL;
610         const __u8 *hash_location = NULL;
611         unsigned char newhash[16];
612         int genhash;
613         struct sock *sk1 = NULL;
614 #endif
615         struct net *net;
616 
617         /* Never send a reset in response to a reset. */
618         if (th->rst)
619                 return;
620 
 621         /* If sk is not NULL, it means we did a successful lookup and the
 622          * incoming route had to be correct. prequeue might have dropped our dst.
623          */
624         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
625                 return;
626 
627         /* Swap the send and the receive. */
628         memset(&rep, 0, sizeof(rep));
629         rep.th.dest   = th->source;
630         rep.th.source = th->dest;
631         rep.th.doff   = sizeof(struct tcphdr) / 4;
632         rep.th.rst    = 1;
633 
634         if (th->ack) {
635                 rep.th.seq = th->ack_seq;
636         } else {
637                 rep.th.ack = 1;
638                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
639                                        skb->len - (th->doff << 2));
640         }
641 
642         memset(&arg, 0, sizeof(arg));
643         arg.iov[0].iov_base = (unsigned char *)&rep;
644         arg.iov[0].iov_len  = sizeof(rep.th);
645 
646         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
647 #ifdef CONFIG_TCP_MD5SIG
648         rcu_read_lock();
649         hash_location = tcp_parse_md5sig_option(th);
650         if (sk && sk_fullsock(sk)) {
651                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
652                                         &ip_hdr(skb)->saddr, AF_INET);
653         } else if (hash_location) {
654                 /*
 655                  * The active side is lost. Try to find the listening socket
 656                  * through the source port, and then find the md5 key through
 657                  * the listening socket. We do not loosen security here:
 658                  * the incoming packet is checked with the md5 hash of the key
 659                  * we find, and no RST is generated if the md5 hash doesn't match.
660                  */
661                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
662                                              ip_hdr(skb)->saddr,
663                                              th->source, ip_hdr(skb)->daddr,
664                                              ntohs(th->source), inet_iif(skb),
665                                              tcp_v4_sdif(skb));
666                 /* don't send rst if it can't find key */
667                 if (!sk1)
668                         goto out;
669 
670                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
671                                         &ip_hdr(skb)->saddr, AF_INET);
672                 if (!key)
673                         goto out;
674 
675 
676                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
677                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
678                         goto out;
679 
680         }
681 
682         if (key) {
683                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
684                                    (TCPOPT_NOP << 16) |
685                                    (TCPOPT_MD5SIG << 8) |
686                                    TCPOLEN_MD5SIG);
687                 /* Update length and the length the header thinks exists */
688                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
689                 rep.th.doff = arg.iov[0].iov_len / 4;
690 
691                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
692                                      key, ip_hdr(skb)->saddr,
693                                      ip_hdr(skb)->daddr, &rep.th);
694         }
695 #endif
696         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
697                                       ip_hdr(skb)->saddr, /* XXX */
698                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
699         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
700         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
701 
 702         /* When the socket is gone, all binding information is lost.
 703          * Routing might fail in this case. No choice here: if we choose to force
 704          * the input interface, we will misroute in case of an asymmetric route.
705          */
706         if (sk) {
707                 arg.bound_dev_if = sk->sk_bound_dev_if;
708                 if (sk_fullsock(sk))
709                         trace_tcp_send_reset(sk, skb);
710         }
711 
712         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
713                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
714 
715         arg.tos = ip_hdr(skb)->tos;
716         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
717         local_bh_disable();
718         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
719                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
720                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
721                               &arg, arg.iov[0].iov_len);
722 
723         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
724         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
725         local_bh_enable();
726 
727 #ifdef CONFIG_TCP_MD5SIG
728 out:
729         rcu_read_unlock();
730 #endif
731 }
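
/*
 * Worked example of the seq/ack arithmetic above (RFC 793 reset generation):
 * for an unsolicited segment with no ACK bit set, SEG.SEQ = 1000, no SYN or
 * FIN, and 100 bytes of payload, the reply is built as
 *
 *	rep.th.seq     = 0;			// left at the memset() value
 *	rep.th.ack     = 1;
 *	rep.th.ack_seq = htonl(1000 + 100);	// SEG.SEQ + SEG.LEN = 1100
 *
 * whereas if the offending segment did carry an ACK, the reset simply echoes
 * it (rep.th.seq = th->ack_seq) and the ACK bit is left clear.
 */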
732 
 733 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 734    outside socket context, is certainly ugly. What can I do?
735  */
736 
737 static void tcp_v4_send_ack(const struct sock *sk,
738                             struct sk_buff *skb, u32 seq, u32 ack,
739                             u32 win, u32 tsval, u32 tsecr, int oif,
740                             struct tcp_md5sig_key *key,
741                             int reply_flags, u8 tos)
742 {
743         const struct tcphdr *th = tcp_hdr(skb);
744         struct {
745                 struct tcphdr th;
746                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
747 #ifdef CONFIG_TCP_MD5SIG
748                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
749 #endif
750                         ];
751         } rep;
752         struct net *net = sock_net(sk);
753         struct ip_reply_arg arg;
754 
755         memset(&rep.th, 0, sizeof(struct tcphdr));
756         memset(&arg, 0, sizeof(arg));
757 
758         arg.iov[0].iov_base = (unsigned char *)&rep;
759         arg.iov[0].iov_len  = sizeof(rep.th);
760         if (tsecr) {
761                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
762                                    (TCPOPT_TIMESTAMP << 8) |
763                                    TCPOLEN_TIMESTAMP);
764                 rep.opt[1] = htonl(tsval);
765                 rep.opt[2] = htonl(tsecr);
766                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
767         }
768 
769         /* Swap the send and the receive. */
770         rep.th.dest    = th->source;
771         rep.th.source  = th->dest;
772         rep.th.doff    = arg.iov[0].iov_len / 4;
773         rep.th.seq     = htonl(seq);
774         rep.th.ack_seq = htonl(ack);
775         rep.th.ack     = 1;
776         rep.th.window  = htons(win);
777 
778 #ifdef CONFIG_TCP_MD5SIG
779         if (key) {
780                 int offset = (tsecr) ? 3 : 0;
781 
782                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
783                                           (TCPOPT_NOP << 16) |
784                                           (TCPOPT_MD5SIG << 8) |
785                                           TCPOLEN_MD5SIG);
786                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
787                 rep.th.doff = arg.iov[0].iov_len/4;
788 
789                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
790                                     key, ip_hdr(skb)->saddr,
791                                     ip_hdr(skb)->daddr, &rep.th);
792         }
793 #endif
794         arg.flags = reply_flags;
795         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
796                                       ip_hdr(skb)->saddr, /* XXX */
797                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
798         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
799         if (oif)
800                 arg.bound_dev_if = oif;
801         arg.tos = tos;
802         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
803         local_bh_disable();
804         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
805                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
806                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
807                               &arg, arg.iov[0].iov_len);
808 
809         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
810         local_bh_enable();
811 }
812 
813 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
814 {
815         struct inet_timewait_sock *tw = inet_twsk(sk);
816         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
817 
818         tcp_v4_send_ack(sk, skb,
819                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
820                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
821                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
822                         tcptw->tw_ts_recent,
823                         tw->tw_bound_dev_if,
824                         tcp_twsk_md5_key(tcptw),
825                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
826                         tw->tw_tos
827                         );
828 
829         inet_twsk_put(tw);
830 }
831 
832 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
833                                   struct request_sock *req)
834 {
835         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
836          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
837          */
838         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
839                                              tcp_sk(sk)->snd_nxt;
840 
841         /* RFC 7323 2.3
842          * The window field (SEG.WND) of every outgoing segment, with the
843          * exception of <SYN> segments, MUST be right-shifted by
844          * Rcv.Wind.Shift bits:
845          */
846         tcp_v4_send_ack(sk, skb, seq,
847                         tcp_rsk(req)->rcv_nxt,
848                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
849                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
850                         req->ts_recent,
851                         0,
852                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
853                                           AF_INET),
854                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
855                         ip_hdr(skb)->tos);
856 }
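
/*
 * Worked example of the RFC 7323 shift above: with an advertised receive
 * window of 65536 bytes and Rcv.Wind.Shift (rcv_wscale) of 7, the 16-bit
 * window field carried in this ACK is
 *
 *	65536 >> 7 == 512
 *
 * and the peer scales it back up by the same factor when it processes the
 * segment.
 */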
857 
858 /*
859  *      Send a SYN-ACK after having received a SYN.
860  *      This still operates on a request_sock only, not on a big
861  *      socket.
862  */
863 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
864                               struct flowi *fl,
865                               struct request_sock *req,
866                               struct tcp_fastopen_cookie *foc,
867                               enum tcp_synack_type synack_type)
868 {
869         const struct inet_request_sock *ireq = inet_rsk(req);
870         struct flowi4 fl4;
871         int err = -1;
872         struct sk_buff *skb;
873 
874         /* First, grab a route. */
875         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
876                 return -1;
877 
878         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
879 
880         if (skb) {
881                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
882 
883                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
884                                             ireq->ir_rmt_addr,
885                                             ireq_opt_deref(ireq));
886                 err = net_xmit_eval(err);
887         }
888 
889         return err;
890 }
891 
892 /*
893  *      IPv4 request_sock destructor.
894  */
895 static void tcp_v4_reqsk_destructor(struct request_sock *req)
896 {
897         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
898 }
899 
900 #ifdef CONFIG_TCP_MD5SIG
901 /*
902  * RFC2385 MD5 checksumming requires a mapping of
903  * IP address->MD5 Key.
904  * We need to maintain these in the sk structure.
905  */
906 
907 /* Find the Key structure for an address.  */
908 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
909                                          const union tcp_md5_addr *addr,
910                                          int family)
911 {
912         const struct tcp_sock *tp = tcp_sk(sk);
913         struct tcp_md5sig_key *key;
914         const struct tcp_md5sig_info *md5sig;
915         __be32 mask;
916         struct tcp_md5sig_key *best_match = NULL;
917         bool match;
918 
919         /* caller either holds rcu_read_lock() or socket lock */
920         md5sig = rcu_dereference_check(tp->md5sig_info,
921                                        lockdep_sock_is_held(sk));
922         if (!md5sig)
923                 return NULL;
924 
925         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
926                 if (key->family != family)
927                         continue;
928 
929                 if (family == AF_INET) {
930                         mask = inet_make_mask(key->prefixlen);
931                         match = (key->addr.a4.s_addr & mask) ==
932                                 (addr->a4.s_addr & mask);
933 #if IS_ENABLED(CONFIG_IPV6)
934                 } else if (family == AF_INET6) {
935                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
936                                                   key->prefixlen);
937 #endif
938                 } else {
939                         match = false;
940                 }
941 
942                 if (match && (!best_match ||
943                               key->prefixlen > best_match->prefixlen))
944                         best_match = key;
945         }
946         return best_match;
947 }
948 EXPORT_SYMBOL(tcp_md5_do_lookup);
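
/*
 * Worked example of the best-match rule above: with one key installed for
 * 10.0.0.0 at prefixlen 8 and another for 10.1.2.0 at prefixlen 24, a lookup
 * for peer 10.1.2.3 matches both and the /24 key wins because its prefixlen
 * is larger, while a lookup for 10.9.9.9 matches only the /8 key.
 */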
949 
950 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
951                                                       const union tcp_md5_addr *addr,
952                                                       int family, u8 prefixlen)
953 {
954         const struct tcp_sock *tp = tcp_sk(sk);
955         struct tcp_md5sig_key *key;
956         unsigned int size = sizeof(struct in_addr);
957         const struct tcp_md5sig_info *md5sig;
958 
959         /* caller either holds rcu_read_lock() or socket lock */
960         md5sig = rcu_dereference_check(tp->md5sig_info,
961                                        lockdep_sock_is_held(sk));
962         if (!md5sig)
963                 return NULL;
964 #if IS_ENABLED(CONFIG_IPV6)
965         if (family == AF_INET6)
966                 size = sizeof(struct in6_addr);
967 #endif
968         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
969                 if (key->family != family)
970                         continue;
971                 if (!memcmp(&key->addr, addr, size) &&
972                     key->prefixlen == prefixlen)
973                         return key;
974         }
975         return NULL;
976 }
977 
978 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
979                                          const struct sock *addr_sk)
980 {
981         const union tcp_md5_addr *addr;
982 
983         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
984         return tcp_md5_do_lookup(sk, addr, AF_INET);
985 }
986 EXPORT_SYMBOL(tcp_v4_md5_lookup);
987 
988 /* This can be called on a newly created socket, from other files */
989 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
990                    int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
991                    gfp_t gfp)
992 {
993         /* Add Key to the list */
994         struct tcp_md5sig_key *key;
995         struct tcp_sock *tp = tcp_sk(sk);
996         struct tcp_md5sig_info *md5sig;
997 
998         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
999         if (key) {
1000                 /* Pre-existing entry - just update that one. */
1001                 memcpy(key->key, newkey, newkeylen);
1002                 key->keylen = newkeylen;
1003                 return 0;
1004         }
1005 
1006         md5sig = rcu_dereference_protected(tp->md5sig_info,
1007                                            lockdep_sock_is_held(sk));
1008         if (!md5sig) {
1009                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1010                 if (!md5sig)
1011                         return -ENOMEM;
1012 
1013                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1014                 INIT_HLIST_HEAD(&md5sig->head);
1015                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1016         }
1017 
1018         key = sock_kmalloc(sk, sizeof(*key), gfp);
1019         if (!key)
1020                 return -ENOMEM;
1021         if (!tcp_alloc_md5sig_pool()) {
1022                 sock_kfree_s(sk, key, sizeof(*key));
1023                 return -ENOMEM;
1024         }
1025 
1026         memcpy(key->key, newkey, newkeylen);
1027         key->keylen = newkeylen;
1028         key->family = family;
1029         key->prefixlen = prefixlen;
1030         memcpy(&key->addr, addr,
1031                (family == AF_INET6) ? sizeof(struct in6_addr) :
1032                                       sizeof(struct in_addr));
1033         hlist_add_head_rcu(&key->node, &md5sig->head);
1034         return 0;
1035 }
1036 EXPORT_SYMBOL(tcp_md5_do_add);
1037 
1038 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1039                    u8 prefixlen)
1040 {
1041         struct tcp_md5sig_key *key;
1042 
1043         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1044         if (!key)
1045                 return -ENOENT;
1046         hlist_del_rcu(&key->node);
1047         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1048         kfree_rcu(key, rcu);
1049         return 0;
1050 }
1051 EXPORT_SYMBOL(tcp_md5_do_del);
1052 
1053 static void tcp_clear_md5_list(struct sock *sk)
1054 {
1055         struct tcp_sock *tp = tcp_sk(sk);
1056         struct tcp_md5sig_key *key;
1057         struct hlist_node *n;
1058         struct tcp_md5sig_info *md5sig;
1059 
1060         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1061 
1062         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1063                 hlist_del_rcu(&key->node);
1064                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1065                 kfree_rcu(key, rcu);
1066         }
1067 }
1068 
1069 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1070                                  char __user *optval, int optlen)
1071 {
1072         struct tcp_md5sig cmd;
1073         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1074         u8 prefixlen = 32;
1075 
1076         if (optlen < sizeof(cmd))
1077                 return -EINVAL;
1078 
1079         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1080                 return -EFAULT;
1081 
1082         if (sin->sin_family != AF_INET)
1083                 return -EINVAL;
1084 
1085         if (optname == TCP_MD5SIG_EXT &&
1086             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1087                 prefixlen = cmd.tcpm_prefixlen;
1088                 if (prefixlen > 32)
1089                         return -EINVAL;
1090         }
1091 
1092         if (!cmd.tcpm_keylen)
1093                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1094                                       AF_INET, prefixlen);
1095 
1096         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1097                 return -EINVAL;
1098 
1099         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1100                               AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1101                               GFP_KERNEL);
1102 }
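
/*
 * Illustrative userspace sketch (an assumed example, error handling trimmed):
 * the setsockopt() handled above is how an application installs an RFC 2385
 * key for a given peer before connecting or listening:
 *
 *	#include <linux/tcp.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int install_md5_key(int fd, const struct sockaddr_in *peer,
 *				   const void *key, int keylen)
 *	{
 *		struct tcp_md5sig md5 = {};
 *
 *		memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
 *		md5.tcpm_keylen = keylen;	// must be <= TCP_MD5SIG_MAXKEYLEN
 *		memcpy(md5.tcpm_key, key, keylen);
 *		return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG,
 *				  &md5, sizeof(md5));
 *	}
 */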
1103 
1104 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1105                                    __be32 daddr, __be32 saddr,
1106                                    const struct tcphdr *th, int nbytes)
1107 {
1108         struct tcp4_pseudohdr *bp;
1109         struct scatterlist sg;
1110         struct tcphdr *_th;
1111 
1112         bp = hp->scratch;
1113         bp->saddr = saddr;
1114         bp->daddr = daddr;
1115         bp->pad = 0;
1116         bp->protocol = IPPROTO_TCP;
1117         bp->len = cpu_to_be16(nbytes);
1118 
1119         _th = (struct tcphdr *)(bp + 1);
1120         memcpy(_th, th, sizeof(*th));
1121         _th->check = 0;
1122 
1123         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1124         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1125                                 sizeof(*bp) + sizeof(*th));
1126         return crypto_ahash_update(hp->md5_req);
1127 }
1128 
1129 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1130                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1131 {
1132         struct tcp_md5sig_pool *hp;
1133         struct ahash_request *req;
1134 
1135         hp = tcp_get_md5sig_pool();
1136         if (!hp)
1137                 goto clear_hash_noput;
1138         req = hp->md5_req;
1139 
1140         if (crypto_ahash_init(req))
1141                 goto clear_hash;
1142         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1143                 goto clear_hash;
1144         if (tcp_md5_hash_key(hp, key))
1145                 goto clear_hash;
1146         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1147         if (crypto_ahash_final(req))
1148                 goto clear_hash;
1149 
1150         tcp_put_md5sig_pool();
1151         return 0;
1152 
1153 clear_hash:
1154         tcp_put_md5sig_pool();
1155 clear_hash_noput:
1156         memset(md5_hash, 0, 16);
1157         return 1;
1158 }
1159 
1160 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1161                         const struct sock *sk,
1162                         const struct sk_buff *skb)
1163 {
1164         struct tcp_md5sig_pool *hp;
1165         struct ahash_request *req;
1166         const struct tcphdr *th = tcp_hdr(skb);
1167         __be32 saddr, daddr;
1168 
1169         if (sk) { /* valid for establish/request sockets */
1170                 saddr = sk->sk_rcv_saddr;
1171                 daddr = sk->sk_daddr;
1172         } else {
1173                 const struct iphdr *iph = ip_hdr(skb);
1174                 saddr = iph->saddr;
1175                 daddr = iph->daddr;
1176         }
1177 
1178         hp = tcp_get_md5sig_pool();
1179         if (!hp)
1180                 goto clear_hash_noput;
1181         req = hp->md5_req;
1182 
1183         if (crypto_ahash_init(req))
1184                 goto clear_hash;
1185 
1186         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1187                 goto clear_hash;
1188         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1189                 goto clear_hash;
1190         if (tcp_md5_hash_key(hp, key))
1191                 goto clear_hash;
1192         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1193         if (crypto_ahash_final(req))
1194                 goto clear_hash;
1195 
1196         tcp_put_md5sig_pool();
1197         return 0;
1198 
1199 clear_hash:
1200         tcp_put_md5sig_pool();
1201 clear_hash_noput:
1202         memset(md5_hash, 0, 16);
1203         return 1;
1204 }
1205 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1206 
1207 #endif
1208 
1209 /* Called with rcu_read_lock() */
1210 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1211                                     const struct sk_buff *skb)
1212 {
1213 #ifdef CONFIG_TCP_MD5SIG
1214         /*
1215          * This gets called for each TCP segment that arrives
1216          * so we want to be efficient.
1217          * We have 3 drop cases:
1218          * o No MD5 hash and one expected.
1219          * o MD5 hash and we're not expecting one.
 1220          * o MD5 hash and it's wrong.
1221          */
1222         const __u8 *hash_location = NULL;
1223         struct tcp_md5sig_key *hash_expected;
1224         const struct iphdr *iph = ip_hdr(skb);
1225         const struct tcphdr *th = tcp_hdr(skb);
1226         int genhash;
1227         unsigned char newhash[16];
1228 
1229         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1230                                           AF_INET);
1231         hash_location = tcp_parse_md5sig_option(th);
1232 
1233         /* We've parsed the options - do we have a hash? */
1234         if (!hash_expected && !hash_location)
1235                 return false;
1236 
1237         if (hash_expected && !hash_location) {
1238                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1239                 return true;
1240         }
1241 
1242         if (!hash_expected && hash_location) {
1243                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1244                 return true;
1245         }
1246 
1247         /* Okay, so this is hash_expected and hash_location -
1248          * so we need to calculate the checksum.
1249          */
1250         genhash = tcp_v4_md5_hash_skb(newhash,
1251                                       hash_expected,
1252                                       NULL, skb);
1253 
1254         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1255                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1256                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1257                                      &iph->saddr, ntohs(th->source),
1258                                      &iph->daddr, ntohs(th->dest),
1259                                      genhash ? " tcp_v4_calc_md5_hash failed"
1260                                      : "");
1261                 return true;
1262         }
1263         return false;
1264 #endif
1265         return false;
1266 }
1267 
1268 static void tcp_v4_init_req(struct request_sock *req,
1269                             const struct sock *sk_listener,
1270                             struct sk_buff *skb)
1271 {
1272         struct inet_request_sock *ireq = inet_rsk(req);
1273         struct net *net = sock_net(sk_listener);
1274 
1275         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1276         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1277         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1278 }
1279 
1280 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1281                                           struct flowi *fl,
1282                                           const struct request_sock *req)
1283 {
1284         return inet_csk_route_req(sk, &fl->u.ip4, req);
1285 }
1286 
1287 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1288         .family         =       PF_INET,
1289         .obj_size       =       sizeof(struct tcp_request_sock),
1290         .rtx_syn_ack    =       tcp_rtx_synack,
1291         .send_ack       =       tcp_v4_reqsk_send_ack,
1292         .destructor     =       tcp_v4_reqsk_destructor,
1293         .send_reset     =       tcp_v4_send_reset,
1294         .syn_ack_timeout =      tcp_syn_ack_timeout,
1295 };
1296 
1297 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1298         .mss_clamp      =       TCP_MSS_DEFAULT,
1299 #ifdef CONFIG_TCP_MD5SIG
1300         .req_md5_lookup =       tcp_v4_md5_lookup,
1301         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1302 #endif
1303         .init_req       =       tcp_v4_init_req,
1304 #ifdef CONFIG_SYN_COOKIES
1305         .cookie_init_seq =      cookie_v4_init_sequence,
1306 #endif
1307         .route_req      =       tcp_v4_route_req,
1308         .init_seq       =       tcp_v4_init_seq,
1309         .init_ts_off    =       tcp_v4_init_ts_off,
1310         .send_synack    =       tcp_v4_send_synack,
1311 };
1312 
1313 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1314 {
 1315         /* Never answer SYNs sent to broadcast or multicast */
1316         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1317                 goto drop;
1318 
1319         return tcp_conn_request(&tcp_request_sock_ops,
1320                                 &tcp_request_sock_ipv4_ops, sk, skb);
1321 
1322 drop:
1323         tcp_listendrop(sk);
1324         return 0;
1325 }
1326 EXPORT_SYMBOL(tcp_v4_conn_request);
1327 
1328 
1329 /*
1330  * The three way handshake has completed - we got a valid synack -
1331  * now create the new socket.
1332  */
1333 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1334                                   struct request_sock *req,
1335                                   struct dst_entry *dst,
1336                                   struct request_sock *req_unhash,
1337                                   bool *own_req)
1338 {
1339         struct inet_request_sock *ireq;
1340         struct inet_sock *newinet;
1341         struct tcp_sock *newtp;
1342         struct sock *newsk;
1343 #ifdef CONFIG_TCP_MD5SIG
1344         struct tcp_md5sig_key *key;
1345 #endif
1346         struct ip_options_rcu *inet_opt;
1347 
1348         if (sk_acceptq_is_full(sk))
1349                 goto exit_overflow;
1350 
1351         newsk = tcp_create_openreq_child(sk, req, skb);
1352         if (!newsk)
1353                 goto exit_nonewsk;
1354 
1355         newsk->sk_gso_type = SKB_GSO_TCPV4;
1356         inet_sk_rx_dst_set(newsk, skb);
1357 
1358         newtp                 = tcp_sk(newsk);
1359         newinet               = inet_sk(newsk);
1360         ireq                  = inet_rsk(req);
1361         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1362         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1363         newsk->sk_bound_dev_if = ireq->ir_iif;
1364         newinet->inet_saddr   = ireq->ir_loc_addr;
1365         inet_opt              = rcu_dereference(ireq->ireq_opt);
1366         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1367         newinet->mc_index     = inet_iif(skb);
1368         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1369         newinet->rcv_tos      = ip_hdr(skb)->tos;
1370         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1371         if (inet_opt)
1372                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1373         newinet->inet_id = newtp->write_seq ^ jiffies;
1374 
1375         if (!dst) {
1376                 dst = inet_csk_route_child_sock(sk, newsk, req);
1377                 if (!dst)
1378                         goto put_and_exit;
1379         } else {
1380                 /* syncookie case: see end of cookie_v4_check() */
1381         }
1382         sk_setup_caps(newsk, dst);
1383 
1384         tcp_ca_openreq_child(newsk, dst);
1385 
1386         tcp_sync_mss(newsk, dst_mtu(dst));
1387         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1388 
1389         tcp_initialize_rcv_mss(newsk);
1390 
1391 #ifdef CONFIG_TCP_MD5SIG
1392         /* Copy over the MD5 key from the original socket */
1393         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1394                                 AF_INET);
1395         if (key) {
1396                 /*
1397                  * We're using one, so create a matching key
1398                  * on the newsk structure. If we fail to get
1399                  * memory, then we end up not copying the key
1400                  * across. Shucks.
1401                  */
1402                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1403                                AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1404                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1405         }
1406 #endif
1407 
1408         if (__inet_inherit_port(sk, newsk) < 0)
1409                 goto put_and_exit;
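        /* Hash the child into the established table.  *own_req tells the
         * caller whether this child actually took ownership of the request;
         * if it did not (another socket for the same 4-tuple beat us to it),
         * inet_opt is cleared here and the duplicate child is expected to be
         * discarded by the caller.
         */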
1410         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1411         if (likely(*own_req)) {
1412                 tcp_move_syn(newtp, req);
1413                 ireq->ireq_opt = NULL;
1414         } else {
1415                 newinet->inet_opt = NULL;
1416         }
1417         return newsk;
1418 
1419 exit_overflow:
1420         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1421 exit_nonewsk:
1422         dst_release(dst);
1423 exit:
1424         tcp_listendrop(sk);
1425         return NULL;
1426 put_and_exit:
1427         newinet->inet_opt = NULL;
1428         inet_csk_prepare_forced_close(newsk);
1429         tcp_done(newsk);
1430         goto exit;
1431 }
1432 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1433 
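/* With CONFIG_SYN_COOKIES, a non-SYN segment that reaches a listener may be
 * the ACK completing a handshake for which no request sock was kept;
 * cookie_v4_check() validates the cookie and, if it is good, builds the
 * child socket.  Without syncookies the listener is returned unchanged.
 */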
1434 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1435 {
1436 #ifdef CONFIG_SYN_COOKIES
1437         const struct tcphdr *th = tcp_hdr(skb);
1438 
1439         if (!th->syn)
1440                 sk = cookie_v4_check(sk, skb);
1441 #endif
1442         return sk;
1443 }
1444 
1445 /* The socket must have its spinlock held when we get
1446  * here, unless it is a TCP_LISTEN socket.
1447  *
1448  * We have a potential double-lock case here, so even when
1449  * doing backlog processing we use the BH locking scheme.
1450  * This is because we cannot sleep with the original spinlock
1451  * held.
1452  */
1453 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1454 {
1455         struct sock *rsk;
1456 
1457         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1458                 struct dst_entry *dst = sk->sk_rx_dst;
1459 
1460                 sock_rps_save_rxhash(sk, skb);
1461                 sk_mark_napi_id(sk, skb);
1462                 if (dst) {
1463                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1464                             !dst->ops->check(dst, 0)) {
1465                                 dst_release(dst);
1466                                 sk->sk_rx_dst = NULL;
1467                         }
1468                 }
1469                 tcp_rcv_established(sk, skb, tcp_hdr(skb));
1470                 return 0;
1471         }
1472 
1473         if (tcp_checksum_complete(skb))
1474                 goto csum_err;
1475 
1476         if (sk->sk_state == TCP_LISTEN) {
1477                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1478 
1479                 if (!nsk)
1480                         goto discard;
1481                 if (nsk != sk) {
1482                         if (tcp_child_process(sk, nsk, skb)) {
1483                                 rsk = nsk;
1484                                 goto reset;
1485                         }
1486                         return 0;
1487                 }
1488         } else
1489                 sock_rps_save_rxhash(sk, skb);
1490 
1491         if (tcp_rcv_state_process(sk, skb)) {
1492                 rsk = sk;
1493                 goto reset;
1494         }
1495         return 0;
1496 
1497 reset:
1498         tcp_v4_send_reset(rsk, skb);
1499 discard:
1500         kfree_skb(skb);
1501         /* Be careful here. If this function gets more complicated and
1502          * gcc suffers from register pressure on the x86, sk (in %ebx)
1503          * might be destroyed here. This current version compiles correctly,
1504          * but you have been warned.
1505          */
1506         return 0;
1507 
1508 csum_err:
1509         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1510         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1511         goto discard;
1512 }
1513 EXPORT_SYMBOL(tcp_v4_do_rcv);
1514 
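/* Early demux: called from the IP receive path, before routing, to look up
 * an already established socket for this segment.  When one is found, the
 * dst cached on the socket (sk->sk_rx_dst, validated against the incoming
 * interface) can be attached to the skb, saving a full route lookup.
 */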
1515 int tcp_v4_early_demux(struct sk_buff *skb)
1516 {
1517         const struct iphdr *iph;
1518         const struct tcphdr *th;
1519         struct sock *sk;
1520 
1521         if (skb->pkt_type != PACKET_HOST)
1522                 return 0;
1523 
1524         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1525                 return 0;
1526 
1527         iph = ip_hdr(skb);
1528         th = tcp_hdr(skb);
1529 
1530         if (th->doff < sizeof(struct tcphdr) / 4)
1531                 return 0;
1532 
1533         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1534                                        iph->saddr, th->source,
1535                                        iph->daddr, ntohs(th->dest),
1536                                        skb->skb_iif, inet_sdif(skb));
1537         if (sk) {
1538                 skb->sk = sk;
1539                 skb->destructor = sock_edemux;
1540                 if (sk_fullsock(sk)) {
1541                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1542 
1543                         if (dst)
1544                                 dst = dst_check(dst, 0);
1545                         if (dst &&
1546                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1547                                 skb_dst_set_noref(skb, dst);
1548                 }
1549         }
1550         return 0;
1551 }
1552 
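/* Queue a segment onto the backlog of a socket currently owned by a process
 * context.  The limit below is roughly rcvbuf + sndbuf + 64KB; as a purely
 * illustrative example, with rcvbuf and sndbuf both around 128KB the backlog
 * could hold about 320KB worth of skb truesize before segments are dropped
 * and LINUX_MIB_TCPBACKLOGDROP is bumped.
 */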
1553 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1554 {
1555         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1556 
1557         /* Only socket owner can try to collapse/prune rx queues
1558          * to reduce memory overhead, so add a little headroom here.
1559          * Only a few socket backlogs are likely to be non-empty at the same time.
1560          */
1561         limit += 64*1024;
1562 
1563         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1564          * we can fix skb->truesize to its real value to avoid future drops.
1565          * This is valid because skb is not yet charged to the socket.
1566          * It has been noticed that pure SACK packets were sometimes dropped
1567          * (if cooked by drivers without copybreak feature).
1568          */
1569         skb_condense(skb);
1570 
1571         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1572                 bh_unlock_sock(sk);
1573                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1574                 return true;
1575         }
1576         return false;
1577 }
1578 EXPORT_SYMBOL(tcp_add_backlog);
1579 
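/* Run the socket's BPF filter on the segment.  sk_filter_trim_cap() may trim
 * the skb, but never below th->doff * 4, so the TCP header itself is always
 * preserved; "eaten" is the number of payload bytes the filter cut off, and
 * end_seq is reduced accordingly so sequence accounting stays consistent.
 */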
1580 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1581 {
1582         struct tcphdr *th = (struct tcphdr *)skb->data;
1583         unsigned int eaten = skb->len;
1584         int err;
1585 
1586         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1587         if (!err) {
1588                 eaten -= skb->len;
1589                 TCP_SKB_CB(skb)->end_seq -= eaten;
1590         }
1591         return err;
1592 }
1593 EXPORT_SYMBOL(tcp_filter);
1594 
1595 static void tcp_v4_restore_cb(struct sk_buff *skb)
1596 {
1597         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1598                 sizeof(struct inet_skb_parm));
1599 }
1600 
1601 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1602                            const struct tcphdr *th)
1603 {
1604         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1605          * barrier() makes sure the compiler won't play aliasing games.
1606          */
1607         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1608                 sizeof(struct inet_skb_parm));
1609         barrier();
1610 
1611         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1612         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1613                                     skb->len - th->doff * 4);
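        /* SYN and FIN each consume one unit of sequence space, so e.g. a
         * bare SYN gives end_seq == seq + 1, while a segment carrying 100
         * bytes of data plus FIN gives end_seq == seq + 101.
         */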
1614         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1615         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1616         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1617         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1618         TCP_SKB_CB(skb)->sacked  = 0;
1619         TCP_SKB_CB(skb)->has_rxtstamp =
1620                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1621 }
1622 
1623 /*
1624  *      From tcp_input.c
1625  */
1626 
1627 int tcp_v4_rcv(struct sk_buff *skb)
1628 {
1629         struct net *net = dev_net(skb->dev);
1630         int sdif = inet_sdif(skb);
1631         const struct iphdr *iph;
1632         const struct tcphdr *th;
1633         bool refcounted;
1634         struct sock *sk;
1635         int ret;
1636 
1637         if (skb->pkt_type != PACKET_HOST)
1638                 goto discard_it;
1639 
1640         /* Count it even if it's bad */
1641         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1642 
1643         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1644                 goto discard_it;
1645 
1646         th = (const struct tcphdr *)skb->data;
1647 
1648         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1649                 goto bad_packet;
1650         if (!pskb_may_pull(skb, th->doff * 4))
1651                 goto discard_it;
1652 
1653         /* An explanation is required here, I think.
1654          * Packet length and doff are validated by header prediction,
1655          * provided the case of th->doff == 0 is eliminated.
1656          * So, we defer the checks. */
1657 
1658         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1659                 goto csum_error;
1660 
1661         th = (const struct tcphdr *)skb->data;
1662         iph = ip_hdr(skb);
1663 lookup:
1664         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1665                                th->dest, sdif, &refcounted);
1666         if (!sk)
1667                 goto no_tcp_socket;
1668 
1669 process:
1670         if (sk->sk_state == TCP_TIME_WAIT)
1671                 goto do_time_wait;
1672 
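        /* TCP_NEW_SYN_RECV: the lookup hit a request sock.  Re-check MD5 and
         * the socket filter against the listener, then let tcp_check_req()
         * decide: NULL means drop, the listener itself means fall through to
         * normal listener processing, and any other socket is a freshly
         * created child that tcp_child_process() will feed this segment to.
         */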
1673         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1674                 struct request_sock *req = inet_reqsk(sk);
1675                 struct sock *nsk;
1676 
1677                 sk = req->rsk_listener;
1678                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1679                         sk_drops_add(sk, skb);
1680                         reqsk_put(req);
1681                         goto discard_it;
1682                 }
1683                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1684                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1685                         goto lookup;
1686                 }
1687                 /* We own a reference on the listener, increase it again
1688                  * as we might lose it too soon.
1689                  */
1690                 sock_hold(sk);
1691                 refcounted = true;
1692                 nsk = NULL;
1693                 if (!tcp_filter(sk, skb)) {
1694                         th = (const struct tcphdr *)skb->data;
1695                         iph = ip_hdr(skb);
1696                         tcp_v4_fill_cb(skb, iph, th);
1697                         nsk = tcp_check_req(sk, skb, req, false);
1698                 }
1699                 if (!nsk) {
1700                         reqsk_put(req);
1701                         goto discard_and_relse;
1702                 }
1703                 if (nsk == sk) {
1704                         reqsk_put(req);
1705                         tcp_v4_restore_cb(skb);
1706                 } else if (tcp_child_process(sk, nsk, skb)) {
1707                         tcp_v4_send_reset(nsk, skb);
1708                         goto discard_and_relse;
1709                 } else {
1710                         sock_put(sk);
1711                         return 0;
1712                 }
1713         }
1714         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1715                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1716                 goto discard_and_relse;
1717         }
1718 
1719         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1720                 goto discard_and_relse;
1721 
1722         if (tcp_v4_inbound_md5_hash(sk, skb))
1723                 goto discard_and_relse;
1724 
1725         nf_reset(skb);
1726 
1727         if (tcp_filter(sk, skb))
1728                 goto discard_and_relse;
1729         th = (const struct tcphdr *)skb->data;
1730         iph = ip_hdr(skb);
1731         tcp_v4_fill_cb(skb, iph, th);
1732 
1733         skb->dev = NULL;
1734 
1735         if (sk->sk_state == TCP_LISTEN) {
1736                 ret = tcp_v4_do_rcv(sk, skb);
1737                 goto put_and_return;
1738         }
1739 
1740         sk_incoming_cpu_update(sk);
1741 
1742         bh_lock_sock_nested(sk);
1743         tcp_segs_in(tcp_sk(sk), skb);
1744         ret = 0;
1745         if (!sock_owned_by_user(sk)) {
1746                 ret = tcp_v4_do_rcv(sk, skb);
1747         } else if (tcp_add_backlog(sk, skb)) {
1748                 goto discard_and_relse;
1749         }
1750         bh_unlock_sock(sk);
1751 
1752 put_and_return:
1753         if (refcounted)
1754                 sock_put(sk);
1755 
1756         return ret;
1757 
1758 no_tcp_socket:
1759         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1760                 goto discard_it;
1761 
1762         tcp_v4_fill_cb(skb, iph, th);
1763 
1764         if (tcp_checksum_complete(skb)) {
1765 csum_error:
1766                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1767 bad_packet:
1768                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1769         } else {
1770                 tcp_v4_send_reset(NULL, skb);
1771         }
1772 
1773 discard_it:
1774         /* Discard frame. */
1775         kfree_skb(skb);
1776         return 0;
1777 
1778 discard_and_relse:
1779         sk_drops_add(sk, skb);
1780         if (refcounted)
1781                 sock_put(sk);
1782         goto discard_it;
1783 
1784 do_time_wait:
1785         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1786                 inet_twsk_put(inet_twsk(sk));
1787                 goto discard_it;
1788         }
1789 
1790         tcp_v4_fill_cb(skb, iph, th);
1791 
1792         if (tcp_checksum_complete(skb)) {
1793                 inet_twsk_put(inet_twsk(sk));
1794                 goto csum_error;
1795         }
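        /* tcp_timewait_state_process() classifies the segment: TCP_TW_SYN
         * means a new SYN may legitimately reuse this time-wait 4-tuple, so
         * we look for a listener and, if one exists, recycle the time-wait
         * socket and restart processing; TCP_TW_ACK re-sends the last ACK,
         * TCP_TW_RST answers with a reset, TCP_TW_SUCCESS just drops.
         */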
1796         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1797         case TCP_TW_SYN: {
1798                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1799                                                         &tcp_hashinfo, skb,
1800                                                         __tcp_hdrlen(th),
1801                                                         iph->saddr, th->source,
1802                                                         iph->daddr, th->dest,
1803                                                         inet_iif(skb),
1804                                                         sdif);
1805                 if (sk2) {
1806                         inet_twsk_deschedule_put(inet_twsk(sk));
1807                         sk = sk2;
1808                         tcp_v4_restore_cb(skb);
1809                         refcounted = false;
1810                         goto process;
1811                 }
1812         }
1813                 /* to ACK */
1814                 /* fall through */
1815         case TCP_TW_ACK:
1816                 tcp_v4_timewait_ack(sk, skb);
1817                 break;
1818         case TCP_TW_RST:
1819                 tcp_v4_send_reset(sk, skb);
1820                 inet_twsk_deschedule_put(inet_twsk(sk));
1821                 goto discard_it;
1822         case TCP_TW_SUCCESS:;
1823         }
1824         goto discard_it;
1825 }
1826 
1827 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1828         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1829         .twsk_unique    = tcp_twsk_unique,
1830         .twsk_destructor= tcp_twsk_destructor,
1831 };
1832 
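/* Cache the input route of this skb on the socket, together with the
 * interface it arrived on, so that tcp_v4_do_rcv() and early demux can
 * reuse it for subsequent segments instead of re-routing each packet.
 */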
1833 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1834 {
1835         struct dst_entry *dst = skb_dst(skb);
1836 
1837         if (dst && dst_hold_safe(dst)) {
1838                 sk->sk_rx_dst = dst;
1839                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1840         }
1841 }
1842 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1843 
1844 const struct inet_connection_sock_af_ops ipv4_specific = {
1845         .queue_xmit        = ip_queue_xmit,
1846         .send_check        = tcp_v4_send_check,
1847         .rebuild_header    = inet_sk_rebuild_header,
1848         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1849         .conn_request      = tcp_v4_conn_request,
1850         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1851         .net_header_len    = sizeof(struct iphdr),
1852         .setsockopt        = ip_setsockopt,
1853         .getsockopt        = ip_getsockopt,
1854         .addr2sockaddr     = inet_csk_addr2sockaddr,
1855         .sockaddr_len      = sizeof(struct sockaddr_in),
1856 #ifdef CONFIG_COMPAT
1857         .compat_setsockopt = compat_ip_setsockopt,
1858         .compat_getsockopt = compat_ip_getsockopt,
1859 #endif
1860         .mtu_reduced       = tcp_v4_mtu_reduced,
1861 };
1862 EXPORT_SYMBOL(ipv4_specific);
1863 
1864 #ifdef CONFIG_TCP_MD5SIG
1865 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1866         .md5_lookup             = tcp_v4_md5_lookup,
1867         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1868         .md5_parse              = tcp_v4_parse_md5_keys,
1869 };
1870 #endif
1871 
1872 /* NOTE: A lot of fields are set to zero explicitly by the call to
1873  *       sk_alloc(), so they need not be initialized here.
1874  */
1875 static int tcp_v4_init_sock(struct sock *sk)
1876 {
1877         struct inet_connection_sock *icsk = inet_csk(sk);
1878 
1879         tcp_init_sock(sk);
1880 
1881         icsk->icsk_af_ops = &ipv4_specific;
1882 
1883 #ifdef CONFIG_TCP_MD5SIG
1884         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1885 #endif
1886 
1887         return 0;
1888 }
1889 
1890 void tcp_v4_destroy_sock(struct sock *sk)
1891 {
1892         struct tcp_sock *tp = tcp_sk(sk);
1893 
1894         trace_tcp_destroy_sock(sk);
1895 
1896         tcp_clear_xmit_timers(sk);
1897 
1898         tcp_cleanup_congestion_control(sk);
1899 
1900         tcp_cleanup_ulp(sk);
1901 
1902         /* Clean up the write buffer. */
1903         tcp_write_queue_purge(sk);
1904 
1905         /* Check if we want to disable active TFO */
1906         tcp_fastopen_active_disable_ofo_check(sk);
1907 
1908         /* Cleans up our, hopefully empty, out_of_order_queue. */
1909         skb_rbtree_purge(&tp->out_of_order_queue);
1910 
1911 #ifdef CONFIG_TCP_MD5SIG
1912         /* Clean up the MD5 key list, if any */
1913         if (tp->md5sig_info) {
1914                 tcp_clear_md5_list(sk);
1915                 kfree_rcu(tp->md5sig_info, rcu);
1916                 tp->md5sig_info = NULL;
1917         }
1918 #endif
1919 
1920         /* Clean up a referenced TCP bind bucket. */
1921         if (inet_csk(sk)->icsk_bind_hash)
1922                 inet_put_port(sk);
1923 
1924         BUG_ON(tp->fastopen_rsk);
1925 
1926         /* If socket is aborted during connect operation */
1927         tcp_free_fastopen_req(tp);
1928         tcp_fastopen_destroy_cipher(sk);
1929         tcp_saved_syn_free(tp);
1930 
1931         sk_sockets_allocated_dec(sk);
1932 }
1933 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1934 
1935 #ifdef CONFIG_PROC_FS
1936 /* Proc filesystem TCP sock list dumping. */
1937 
1938 /*
1939  * Get the next listener socket following cur.  If cur is NULL, get the first socket
1940  * starting from bucket given in st->bucket; when st->bucket is zero the
1941  * very first socket in the hash table is returned.
1942  */
1943 static void *listening_get_next(struct seq_file *seq, void *cur)
1944 {
1945         struct tcp_iter_state *st = seq->private;
1946         struct net *net = seq_file_net(seq);
1947         struct inet_listen_hashbucket *ilb;
1948         struct sock *sk = cur;
1949 
1950         if (!sk) {
1951 get_head:
1952                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1953                 spin_lock(&ilb->lock);
1954                 sk = sk_head(&ilb->head);
1955                 st->offset = 0;
1956                 goto get_sk;
1957         }
1958         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1959         ++st->num;
1960         ++st->offset;
1961 
1962         sk = sk_next(sk);
1963 get_sk:
1964         sk_for_each_from(sk) {
1965                 if (!net_eq(sock_net(sk), net))
1966                         continue;
1967                 if (sk->sk_family == st->family)
1968                         return sk;
1969         }
1970         spin_unlock(&ilb->lock);
1971         st->offset = 0;
1972         if (++st->bucket < INET_LHTABLE_SIZE)
1973                 goto get_head;
1974         return NULL;
1975 }
1976 
1977 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1978 {
1979         struct tcp_iter_state *st = seq->private;
1980         void *rc;
1981 
1982         st->bucket = 0;
1983         st->offset = 0;
1984         rc = listening_get_next(seq, NULL);
1985 
1986         while (rc && *pos) {
1987                 rc = listening_get_next(seq, rc);
1988                 --*pos;
1989         }
1990         return rc;
1991 }
1992 
1993 static inline bool empty_bucket(const struct tcp_iter_state *st)
1994 {
1995         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1996 }
1997 
1998 /*
1999  * Get first established socket starting from bucket given in st->bucket.
2000  * If st->bucket is zero, the very first socket in the hash is returned.
2001  */
2002 static void *established_get_first(struct seq_file *seq)
2003 {
2004         struct tcp_iter_state *st = seq->private;
2005         struct net *net = seq_file_net(seq);
2006         void *rc = NULL;
2007 
2008         st->offset = 0;
2009         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2010                 struct sock *sk;
2011                 struct hlist_nulls_node *node;
2012                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2013 
2014                 /* Lockless fast path for the common case of empty buckets */
2015                 if (empty_bucket(st))
2016                         continue;
2017 
2018                 spin_lock_bh(lock);
2019                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2020                         if (sk->sk_family != st->family ||
2021                             !net_eq(sock_net(sk), net)) {
2022                                 continue;
2023                         }
2024                         rc = sk;
2025                         goto out;
2026                 }
2027                 spin_unlock_bh(lock);
2028         }
2029 out:
2030         return rc;
2031 }
2032 
2033 static void *established_get_next(struct seq_file *seq, void *cur)
2034 {
2035         struct sock *sk = cur;
2036         struct hlist_nulls_node *node;
2037         struct tcp_iter_state *st = seq->private;
2038         struct net *net = seq_file_net(seq);
2039 
2040         ++st->num;
2041         ++st->offset;
2042 
2043         sk = sk_nulls_next(sk);
2044 
2045         sk_nulls_for_each_from(sk, node) {
2046                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2047                         return sk;
2048         }
2049 
2050         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2051         ++st->bucket;
2052         return established_get_first(seq);
2053 }
2054 
2055 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2056 {
2057         struct tcp_iter_state *st = seq->private;
2058         void *rc;
2059 
2060         st->bucket = 0;
2061         rc = established_get_first(seq);
2062 
2063         while (rc && pos) {
2064                 rc = established_get_next(seq, rc);
2065                 --pos;
2066         }
2067         return rc;
2068 }
2069 
2070 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2071 {
2072         void *rc;
2073         struct tcp_iter_state *st = seq->private;
2074 
2075         st->state = TCP_SEQ_STATE_LISTENING;
2076         rc        = listening_get_idx(seq, &pos);
2077 
2078         if (!rc) {
2079                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2080                 rc        = established_get_idx(seq, pos);
2081         }
2082 
2083         return rc;
2084 }
2085 
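/*
 * A /proc/net/tcp dump may be split over several read() calls.  When the
 * next read resumes at the position where the previous one stopped
 * (st->last_pos), re-walk the remembered bucket and offset instead of
 * scanning the whole table again; entries may of course have come and gone
 * in between, so this is best effort.
 */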
2086 static void *tcp_seek_last_pos(struct seq_file *seq)
2087 {
2088         struct tcp_iter_state *st = seq->private;
2089         int offset = st->offset;
2090         int orig_num = st->num;
2091         void *rc = NULL;
2092 
2093         switch (st->state) {
2094         case TCP_SEQ_STATE_LISTENING:
2095                 if (st->bucket >= INET_LHTABLE_SIZE)
2096                         break;
2097                 st->state = TCP_SEQ_STATE_LISTENING;
2098                 rc = listening_get_next(seq, NULL);
2099                 while (offset-- && rc)
2100                         rc = listening_get_next(seq, rc);
2101                 if (rc)
2102                         break;
2103                 st->bucket = 0;
2104                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2105                 /* Fallthrough */
2106         case TCP_SEQ_STATE_ESTABLISHED:
2107                 if (st->bucket > tcp_hashinfo.ehash_mask)
2108                         break;
2109                 rc = established_get_first(seq);
2110                 while (offset-- && rc)
2111                         rc = established_get_next(seq, rc);
2112         }
2113 
2114         st->num = orig_num;
2115 
2116         return rc;
2117 }
2118 
2119 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2120 {
2121         struct tcp_iter_state *st = seq->private;
2122         void *rc;
2123 
2124         if (*pos && *pos == st->last_pos) {
2125                 rc = tcp_seek_last_pos(seq);
2126                 if (rc)
2127                         goto out;
2128         }
2129 
2130         st->state = TCP_SEQ_STATE_LISTENING;
2131         st->num = 0;
2132         st->bucket = 0;
2133         st->offset = 0;
2134         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2135 
2136 out:
2137         st->last_pos = *pos;
2138         return rc;
2139 }
2140 
2141 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2142 {
2143         struct tcp_iter_state *st = seq->private;
2144         void *rc = NULL;
2145 
2146         if (v == SEQ_START_TOKEN) {
2147                 rc = tcp_get_idx(seq, 0);
2148                 goto out;
2149         }
2150 
2151         switch (st->state) {
2152         case TCP_SEQ_STATE_LISTENING:
2153                 rc = listening_get_next(seq, v);
2154                 if (!rc) {
2155                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2156                         st->bucket = 0;
2157                         st->offset = 0;
2158                         rc        = established_get_first(seq);
2159                 }
2160                 break;
2161         case TCP_SEQ_STATE_ESTABLISHED:
2162                 rc = established_get_next(seq, v);
2163                 break;
2164         }
2165 out:
2166         ++*pos;
2167         st->last_pos = *pos;
2168         return rc;
2169 }
2170 
2171 static void tcp_seq_stop(struct seq_file *seq, void *v)
2172 {
2173         struct tcp_iter_state *st = seq->private;
2174 
2175         switch (st->state) {
2176         case TCP_SEQ_STATE_LISTENING:
2177                 if (v != SEQ_START_TOKEN)
2178                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2179                 break;
2180         case TCP_SEQ_STATE_ESTABLISHED:
2181                 if (v)
2182                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2183                 break;
2184         }
2185 }
2186 
2187 int tcp_seq_open(struct inode *inode, struct file *file)
2188 {
2189         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2190         struct tcp_iter_state *s;
2191         int err;
2192 
2193         err = seq_open_net(inode, file, &afinfo->seq_ops,
2194                           sizeof(struct tcp_iter_state));
2195         if (err < 0)
2196                 return err;
2197 
2198         s = ((struct seq_file *)file->private_data)->private;
2199         s->family               = afinfo->family;
2200         s->last_pos             = 0;
2201         return 0;
2202 }
2203 EXPORT_SYMBOL(tcp_seq_open);
2204 
2205 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2206 {
2207         int rc = 0;
2208         struct proc_dir_entry *p;
2209 
2210         afinfo->seq_ops.start           = tcp_seq_start;
2211         afinfo->seq_ops.next            = tcp_seq_next;
2212         afinfo->seq_ops.stop            = tcp_seq_stop;
2213 
2214         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2215                              afinfo->seq_fops, afinfo);
2216         if (!p)
2217                 rc = -ENOMEM;
2218         return rc;
2219 }
2220 EXPORT_SYMBOL(tcp_proc_register);
2221 
2222 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2223 {
2224         remove_proc_entry(afinfo->name, net->proc_net);
2225 }
2226 EXPORT_SYMBOL(tcp_proc_unregister);
2227 
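/* The three helpers below each format one row of /proc/net/tcp, in the
 * column layout printed by tcp4_seq_show(): one for a pending request sock,
 * one for a full socket and one for a time-wait socket.  Addresses and
 * ports are printed in hexadecimal.
 */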
2228 static void get_openreq4(const struct request_sock *req,
2229                          struct seq_file *f, int i)
2230 {
2231         const struct inet_request_sock *ireq = inet_rsk(req);
2232         long delta = req->rsk_timer.expires - jiffies;
2233 
2234         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2235                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2236                 i,
2237                 ireq->ir_loc_addr,
2238                 ireq->ir_num,
2239                 ireq->ir_rmt_addr,
2240                 ntohs(ireq->ir_rmt_port),
2241                 TCP_SYN_RECV,
2242                 0, 0, /* could print option size, but that is af dependent. */
2243                 1,    /* timers active (only the expire timer) */
2244                 jiffies_delta_to_clock_t(delta),
2245                 req->num_timeout,
2246                 from_kuid_munged(seq_user_ns(f),
2247                                  sock_i_uid(req->rsk_listener)),
2248                 0,  /* non standard timer */
2249                 0, /* open_requests have no inode */
2250                 0,
2251                 req);
2252 }
2253 
2254 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2255 {
2256         int timer_active;
2257         unsigned long timer_expires;
2258         const struct tcp_sock *tp = tcp_sk(sk);
2259         const struct inet_connection_sock *icsk = inet_csk(sk);
2260         const struct inet_sock *inet = inet_sk(sk);
2261         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2262         __be32 dest = inet->inet_daddr;
2263         __be32 src = inet->inet_rcv_saddr;
2264         __u16 destp = ntohs(inet->inet_dport);
2265         __u16 srcp = ntohs(inet->inet_sport);
2266         int rx_queue;
2267         int state;
2268 
2269         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2270             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2271             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2272                 timer_active    = 1;
2273                 timer_expires   = icsk->icsk_timeout;
2274         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2275                 timer_active    = 4;
2276                 timer_expires   = icsk->icsk_timeout;
2277         } else if (timer_pending(&sk->sk_timer)) {
2278                 timer_active    = 2;
2279                 timer_expires   = sk->sk_timer.expires;
2280         } else {
2281                 timer_active    = 0;
2282                 timer_expires = jiffies;
2283         }
2284 
2285         state = sk_state_load(sk);
2286         if (state == TCP_LISTEN)
2287                 rx_queue = sk->sk_ack_backlog;
2288         else
2289                 /* Because we don't lock the socket,
2290                  * we might find a transient negative value.
2291                  */
2292                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2293 
2294         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2295                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2296                 i, src, srcp, dest, destp, state,
2297                 tp->write_seq - tp->snd_una,
2298                 rx_queue,
2299                 timer_active,
2300                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2301                 icsk->icsk_retransmits,
2302                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2303                 icsk->icsk_probes_out,
2304                 sock_i_ino(sk),
2305                 refcount_read(&sk->sk_refcnt), sk,
2306                 jiffies_to_clock_t(icsk->icsk_rto),
2307                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2308                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2309                 tp->snd_cwnd,
2310                 state == TCP_LISTEN ?
2311                     fastopenq->max_qlen :
2312                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2313 }
2314 
2315 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2316                                struct seq_file *f, int i)
2317 {
2318         long delta = tw->tw_timer.expires - jiffies;
2319         __be32 dest, src;
2320         __u16 destp, srcp;
2321 
2322         dest  = tw->tw_daddr;
2323         src   = tw->tw_rcv_saddr;
2324         destp = ntohs(tw->tw_dport);
2325         srcp  = ntohs(tw->tw_sport);
2326 
2327         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2328                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2329                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2330                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2331                 refcount_read(&tw->tw_refcnt), tw);
2332 }
2333 
2334 #define TMPSZ 150
2335 
2336 static int tcp4_seq_show(struct seq_file *seq, void *v)
2337 {
2338         struct tcp_iter_state *st;
2339         struct sock *sk = v;
2340 
2341         seq_setwidth(seq, TMPSZ - 1);
2342         if (v == SEQ_START_TOKEN) {
2343                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2344                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2345                            "inode");
2346                 goto out;
2347         }
2348         st = seq->private;
2349 
2350         if (sk->sk_state == TCP_TIME_WAIT)
2351                 get_timewait4_sock(v, seq, st->num);
2352         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2353                 get_openreq4(v, seq, st->num);
2354         else
2355                 get_tcp4_sock(v, seq, st->num);
2356 out:
2357         seq_pad(seq, '\n');
2358         return 0;
2359 }
2360 
2361 static const struct file_operations tcp_afinfo_seq_fops = {
2362         .owner   = THIS_MODULE,
2363         .open    = tcp_seq_open,
2364         .read    = seq_read,
2365         .llseek  = seq_lseek,
2366         .release = seq_release_net
2367 };
2368 
2369 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2370         .name           = "tcp",
2371         .family         = AF_INET,
2372         .seq_fops       = &tcp_afinfo_seq_fops,
2373         .seq_ops        = {
2374                 .show           = tcp4_seq_show,
2375         },
2376 };
2377 
2378 static int __net_init tcp4_proc_init_net(struct net *net)
2379 {
2380         return tcp_proc_register(net, &tcp4_seq_afinfo);
2381 }
2382 
2383 static void __net_exit tcp4_proc_exit_net(struct net *net)
2384 {
2385         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2386 }
2387 
2388 static struct pernet_operations tcp4_net_ops = {
2389         .init = tcp4_proc_init_net,
2390         .exit = tcp4_proc_exit_net,
2391 };
2392 
2393 int __init tcp4_proc_init(void)
2394 {
2395         return register_pernet_subsys(&tcp4_net_ops);
2396 }
2397 
2398 void tcp4_proc_exit(void)
2399 {
2400         unregister_pernet_subsys(&tcp4_net_ops);
2401 }
2402 #endif /* CONFIG_PROC_FS */
2403 
2404 struct proto tcp_prot = {
2405         .name                   = "TCP",
2406         .owner                  = THIS_MODULE,
2407         .close                  = tcp_close,
2408         .connect                = tcp_v4_connect,
2409         .disconnect             = tcp_disconnect,
2410         .accept                 = inet_csk_accept,
2411         .ioctl                  = tcp_ioctl,
2412         .init                   = tcp_v4_init_sock,
2413         .destroy                = tcp_v4_destroy_sock,
2414         .shutdown               = tcp_shutdown,
2415         .setsockopt             = tcp_setsockopt,
2416         .getsockopt             = tcp_getsockopt,
2417         .keepalive              = tcp_set_keepalive,
2418         .recvmsg                = tcp_recvmsg,
2419         .sendmsg                = tcp_sendmsg,
2420         .sendpage               = tcp_sendpage,
2421         .backlog_rcv            = tcp_v4_do_rcv,
2422         .release_cb             = tcp_release_cb,
2423         .hash                   = inet_hash,
2424         .unhash                 = inet_unhash,
2425         .get_port               = inet_csk_get_port,
2426         .enter_memory_pressure  = tcp_enter_memory_pressure,
2427         .leave_memory_pressure  = tcp_leave_memory_pressure,
2428         .stream_memory_free     = tcp_stream_memory_free,
2429         .sockets_allocated      = &tcp_sockets_allocated,
2430         .orphan_count           = &tcp_orphan_count,
2431         .memory_allocated       = &tcp_memory_allocated,
2432         .memory_pressure        = &tcp_memory_pressure,
2433         .sysctl_mem             = sysctl_tcp_mem,
2434         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2435         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2436         .max_header             = MAX_TCP_HEADER,
2437         .obj_size               = sizeof(struct tcp_sock),
2438         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2439         .twsk_prot              = &tcp_timewait_sock_ops,
2440         .rsk_prot               = &tcp_request_sock_ops,
2441         .h.hashinfo             = &tcp_hashinfo,
2442         .no_autobind            = true,
2443 #ifdef CONFIG_COMPAT
2444         .compat_setsockopt      = compat_tcp_setsockopt,
2445         .compat_getsockopt      = compat_tcp_getsockopt,
2446 #endif
2447         .diag_destroy           = tcp_abort,
2448 };
2449 EXPORT_SYMBOL(tcp_prot);
2450 
2451 static void __net_exit tcp_sk_exit(struct net *net)
2452 {
2453         int cpu;
2454 
2455         module_put(net->ipv4.tcp_congestion_control->owner);
2456 
2457         for_each_possible_cpu(cpu)
2458                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2459         free_percpu(net->ipv4.tcp_sk);
2460 }
2461 
2462 static int __net_init tcp_sk_init(struct net *net)
2463 {
2464         int res, cpu, cnt;
2465 
2466         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2467         if (!net->ipv4.tcp_sk)
2468                 return -ENOMEM;
2469 
2470         for_each_possible_cpu(cpu) {
2471                 struct sock *sk;
2472 
2473                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2474                                            IPPROTO_TCP, net);
2475                 if (res)
2476                         goto fail;
2477                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2478                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2479         }
2480 
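        /* 2 == enable ECN when requested by incoming connections, but do not
         * request ECN on outgoing connections (see
         * Documentation/networking/ip-sysctl.txt).
         */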
2481         net->ipv4.sysctl_tcp_ecn = 2;
2482         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2483 
2484         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2485         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2486         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2487 
2488         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2489         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2490         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2491 
2492         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2493         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2494         net->ipv4.sysctl_tcp_syncookies = 1;
2495         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2496         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2497         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2498         net->ipv4.sysctl_tcp_orphan_retries = 0;
2499         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2500         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2501         net->ipv4.sysctl_tcp_tw_reuse = 0;
2502 
2503         cnt = tcp_hashinfo.ehash_mask + 1;
2504         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2505         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2506 
2507         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2508         net->ipv4.sysctl_tcp_sack = 1;
2509         net->ipv4.sysctl_tcp_window_scaling = 1;
2510         net->ipv4.sysctl_tcp_timestamps = 1;
2511         net->ipv4.sysctl_tcp_early_retrans = 3;
2512         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2513         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2514         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2515         net->ipv4.sysctl_tcp_max_reordering = 300;
2516         net->ipv4.sysctl_tcp_dsack = 1;
2517         net->ipv4.sysctl_tcp_app_win = 31;
2518         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2519         net->ipv4.sysctl_tcp_frto = 2;
2520         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2521         /* This limits the percentage of the congestion window which we
2522          * will allow a single TSO frame to consume.  Building TSO frames
2523          * which are too large can cause TCP streams to be bursty.
2524          */
2525         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2526         /* Default TSQ limit of four TSO segments */
2527         net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2528         /* rfc5961 challenge ack rate limiting */
2529         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2530         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2531         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2532         net->ipv4.sysctl_tcp_autocorking = 1;
2533         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2534         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2535         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2536         if (net != &init_net) {
2537                 memcpy(net->ipv4.sysctl_tcp_rmem,
2538                        init_net.ipv4.sysctl_tcp_rmem,
2539                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2540                 memcpy(net->ipv4.sysctl_tcp_wmem,
2541                        init_net.ipv4.sysctl_tcp_wmem,
2542                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2543         }
2544         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2545         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2546         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2547         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2548 
2549         /* Reno is always built in */
2550         if (!net_eq(net, &init_net) &&
2551             try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2552                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2553         else
2554                 net->ipv4.tcp_congestion_control = &tcp_reno;
2555 
2556         return 0;
2557 fail:
2558         tcp_sk_exit(net);
2559 
2560         return res;
2561 }
2562 
2563 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2564 {
2565         struct net *net;
2566 
2567         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2568 
2569         list_for_each_entry(net, net_exit_list, exit_list)
2570                 tcp_fastopen_ctx_destroy(net);
2571 }
2572 
2573 static struct pernet_operations __net_initdata tcp_sk_ops = {
2574        .init       = tcp_sk_init,
2575        .exit       = tcp_sk_exit,
2576        .exit_batch = tcp_sk_exit_batch,
2577 };
2578 
2579 void __init tcp_v4_init(void)
2580 {
2581         if (register_pernet_subsys(&tcp_sk_ops))
2582                 panic("Failed to create the TCP control socket.\n");
2583 }
2584 
