Linux/net/ipv4/tcp_ipv4.c

  1 /*
  2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  3  *              operating system.  INET is implemented using the  BSD Socket
  4  *              interface as the means of communication with the user level.
  5  *
  6  *              Implementation of the Transmission Control Protocol(TCP).
  7  *
  8  *              IPv4 specific functions
  9  *
 10  *
 11  *              code split from:
 12  *              linux/ipv4/tcp.c
 13  *              linux/ipv4/tcp_input.c
 14  *              linux/ipv4/tcp_output.c
 15  *
 16  *              See tcp.c for author information
 17  *
 18  *      This program is free software; you can redistribute it and/or
 19  *      modify it under the terms of the GNU General Public License
 20  *      as published by the Free Software Foundation; either version
 21  *      2 of the License, or (at your option) any later version.
 22  */
 23 
 24 /*
 25  * Changes:
 26  *              David S. Miller :       New socket lookup architecture.
 27  *                                      This code is dedicated to John Dyson.
 28  *              David S. Miller :       Change semantics of established hash,
 29  *                                      half is devoted to TIME_WAIT sockets
 30  *                                      and the rest go in the other half.
 31  *              Andi Kleen :            Add support for syncookies and fixed
 32  *                                      some bugs: ip options weren't passed to
 33  *                                      the TCP layer, missed a check for an
 34  *                                      ACK bit.
 35  *              Andi Kleen :            Implemented fast path mtu discovery.
 36  *                                      Fixed many serious bugs in the
 37  *                                      request_sock handling and moved
 38  *                                      most of it into the af independent code.
 39  *                                      Added tail drop and some other bugfixes.
 40  *                                      Added new listen semantics.
 41  *              Mike McLagan    :       Routing by source
 42  *      Juan Jose Ciarlante:            ip_dynaddr bits
 43  *              Andi Kleen:             various fixes.
 44  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
 45  *                                      year-long coma.
 46  *      Andi Kleen              :       Fix new listen.
 47  *      Andi Kleen              :       Fix accept error reporting.
 48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
 50  *                                      to a single port at the same time.
 51  */
 52 
 53 #define pr_fmt(fmt) "TCP: " fmt
 54 
 55 #include <linux/bottom_half.h>
 56 #include <linux/types.h>
 57 #include <linux/fcntl.h>
 58 #include <linux/module.h>
 59 #include <linux/random.h>
 60 #include <linux/cache.h>
 61 #include <linux/jhash.h>
 62 #include <linux/init.h>
 63 #include <linux/times.h>
 64 #include <linux/slab.h>
 65 
 66 #include <net/net_namespace.h>
 67 #include <net/icmp.h>
 68 #include <net/inet_hashtables.h>
 69 #include <net/tcp.h>
 70 #include <net/transp_v6.h>
 71 #include <net/ipv6.h>
 72 #include <net/inet_common.h>
 73 #include <net/timewait_sock.h>
 74 #include <net/xfrm.h>
 75 #include <net/secure_seq.h>
 76 #include <net/busy_poll.h>
 77 
 78 #include <linux/inet.h>
 79 #include <linux/ipv6.h>
 80 #include <linux/stddef.h>
 81 #include <linux/proc_fs.h>
 82 #include <linux/seq_file.h>
 83 #include <linux/inetdevice.h>
 84 
 85 #include <crypto/hash.h>
 86 #include <linux/scatterlist.h>
 87 
 88 #include <trace/events/tcp.h>
 89 
 90 #ifdef CONFIG_TCP_MD5SIG
 91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
 92                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
 93 #endif
 94 
 95 struct inet_hashinfo tcp_hashinfo;
 96 EXPORT_SYMBOL(tcp_hashinfo);
 97 
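/* Pick the initial sequence number for an outgoing connection: a keyed hash
 * of the address/port 4-tuple plus a clock component (in the spirit of
 * RFC 6528), so ISNs are hard for an off-path attacker to predict.
 */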
 98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
 99 {
100         return secure_tcp_seq(ip_hdr(skb)->daddr,
101                               ip_hdr(skb)->saddr,
102                               tcp_hdr(skb)->dest,
103                               tcp_hdr(skb)->source);
104 }
105 
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110 
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113         const struct inet_timewait_sock *tw = inet_twsk(sktw);
114         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115         struct tcp_sock *tp = tcp_sk(sk);
116         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
117 
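        /* net.ipv4.tcp_tw_reuse: 0 - never reuse TIME-WAIT sockets for new
         * outgoing connections, 1 - reuse when it is safe from the protocol
         * point of view, 2 - reuse only for connections over loopback, which
         * is what the check below enforces.
         */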
118         if (reuse == 2) {
119                 /* Still does not detect *everything* that goes through
120                  * lo, since we require a loopback src or dst address
121                  * or direct binding to 'lo' interface.
122                  */
123                 bool loopback = false;
124                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
125                         loopback = true;
126 #if IS_ENABLED(CONFIG_IPV6)
127                 if (tw->tw_family == AF_INET6) {
128                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129                             (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
130                              (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
131                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132                             (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
133                              (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
134                                 loopback = true;
135                 } else
136 #endif
137                 {
138                         if (ipv4_is_loopback(tw->tw_daddr) ||
139                             ipv4_is_loopback(tw->tw_rcv_saddr))
140                                 loopback = true;
141                 }
142                 if (!loopback)
143                         reuse = 0;
144         }
145 
146         /* With PAWS, it is safe from the viewpoint
147            of data integrity. Even without PAWS it is safe provided sequence
148            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
149 
150            Actually, the idea is close to VJ's: only the timestamp cache is
151            held not per host but per port pair, and the TW bucket is used as
152            the state holder.
153 
154            If the TW bucket has already been destroyed we fall back to VJ's
155            scheme and use the initial timestamp retrieved from the peer table.
156          */
157         if (tcptw->tw_ts_recent_stamp &&
158             (!twp || (reuse && time_after32(ktime_get_seconds(),
159                                             tcptw->tw_ts_recent_stamp)))) {
160                 /* In case of repair and re-using TIME-WAIT sockets we still
161                  * want to be sure that it is safe as above but honor the
162                  * sequence numbers and time stamps set as part of the repair
163                  * process.
164                  *
165                  * Without this check re-using a TIME-WAIT socket with TCP
166                  * repair would accumulate a -1 on the repair assigned
167                  * sequence number. The first time it is reused the sequence
168                  * is -1, the second time -2, etc. This fixes that issue
169                  * without appearing to create any others.
170                  */
171                 if (likely(!tp->repair)) {
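                        /* Start past everything the old connection could have
                         * sent: its snd_nxt plus a maximal unscaled window
                         * (65535) and a small margin, so stray segments from
                         * the old incarnation cannot be mistaken for new data.
                         */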
172                         tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
173                         if (tp->write_seq == 0)
174                                 tp->write_seq = 1;
175                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
176                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
177                 }
178                 sock_hold(sktw);
179                 return 1;
180         }
181 
182         return 0;
183 }
184 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
185 
186 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
187                               int addr_len)
188 {
189         /* This check is replicated from tcp_v4_connect() and intended to
190          * prevent the BPF program called below from accessing bytes that are
191          * outside the bounds specified by the user in addr_len.
192          */
193         if (addr_len < sizeof(struct sockaddr_in))
194                 return -EINVAL;
195 
196         sock_owned_by_me(sk);
197 
198         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
199 }
200 
201 /* This will initiate an outgoing connection. */
202 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
203 {
204         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
205         struct inet_sock *inet = inet_sk(sk);
206         struct tcp_sock *tp = tcp_sk(sk);
207         __be16 orig_sport, orig_dport;
208         __be32 daddr, nexthop;
209         struct flowi4 *fl4;
210         struct rtable *rt;
211         int err;
212         struct ip_options_rcu *inet_opt;
213         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
214 
215         if (addr_len < sizeof(struct sockaddr_in))
216                 return -EINVAL;
217 
218         if (usin->sin_family != AF_INET)
219                 return -EAFNOSUPPORT;
220 
221         nexthop = daddr = usin->sin_addr.s_addr;
222         inet_opt = rcu_dereference_protected(inet->inet_opt,
223                                              lockdep_sock_is_held(sk));
224         if (inet_opt && inet_opt->opt.srr) {
225                 if (!daddr)
226                         return -EINVAL;
227                 nexthop = inet_opt->opt.faddr;
228         }
229 
230         orig_sport = inet->inet_sport;
231         orig_dport = usin->sin_port;
232         fl4 = &inet->cork.fl.u.ip4;
233         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
235                               IPPROTO_TCP,
236                               orig_sport, orig_dport, sk);
237         if (IS_ERR(rt)) {
238                 err = PTR_ERR(rt);
239                 if (err == -ENETUNREACH)
240                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
241                 return err;
242         }
243 
244         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
245                 ip_rt_put(rt);
246                 return -ENETUNREACH;
247         }
248 
249         if (!inet_opt || !inet_opt->opt.srr)
250                 daddr = fl4->daddr;
251 
252         if (!inet->inet_saddr)
253                 inet->inet_saddr = fl4->saddr;
254         sk_rcv_saddr_set(sk, inet->inet_saddr);
255 
256         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
257                 /* Reset inherited state */
258                 tp->rx_opt.ts_recent       = 0;
259                 tp->rx_opt.ts_recent_stamp = 0;
260                 if (likely(!tp->repair))
261                         tp->write_seq      = 0;
262         }
263 
264         inet->inet_dport = usin->sin_port;
265         sk_daddr_set(sk, daddr);
266 
267         inet_csk(sk)->icsk_ext_hdr_len = 0;
268         if (inet_opt)
269                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
270 
271         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
272 
273         /* Socket identity is still unknown (sport may be zero).
274          * However we set the state to SYN-SENT and, without releasing the
275          * socket lock, select a source port, enter ourselves into the hash
276          * tables and complete initialization after this.
277          */
278         tcp_set_state(sk, TCP_SYN_SENT);
279         err = inet_hash_connect(tcp_death_row, sk);
280         if (err)
281                 goto failure;
282 
283         sk_set_txhash(sk);
284 
285         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
286                                inet->inet_sport, inet->inet_dport, sk);
287         if (IS_ERR(rt)) {
288                 err = PTR_ERR(rt);
289                 rt = NULL;
290                 goto failure;
291         }
292         /* OK, now commit destination to socket.  */
293         sk->sk_gso_type = SKB_GSO_TCPV4;
294         sk_setup_caps(sk, &rt->dst);
295         rt = NULL;
296 
297         if (likely(!tp->repair)) {
298                 if (!tp->write_seq)
299                         tp->write_seq = secure_tcp_seq(inet->inet_saddr,
300                                                        inet->inet_daddr,
301                                                        inet->inet_sport,
302                                                        usin->sin_port);
303                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
304                                                  inet->inet_saddr,
305                                                  inet->inet_daddr);
306         }
307 
308         inet->inet_id = tp->write_seq ^ jiffies;
309 
310         if (tcp_fastopen_defer_connect(sk, &err))
311                 return err;
312         if (err)
313                 goto failure;
314 
315         err = tcp_connect(sk);
316 
317         if (err)
318                 goto failure;
319 
320         return 0;
321 
322 failure:
323         /*
324          * This unhashes the socket and releases the local port,
325          * if necessary.
326          */
327         tcp_set_state(sk, TCP_CLOSE);
328         ip_rt_put(rt);
329         sk->sk_route_caps = 0;
330         inet->inet_dport = 0;
331         return err;
332 }
333 EXPORT_SYMBOL(tcp_v4_connect);
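
/* A minimal user-space sketch (illustrative only, not part of this file):
 * for an AF_INET SOCK_STREAM socket, the connect(2) call below reaches
 * tcp_v4_connect() via __sys_connect() and inet_stream_connect().
 */
#if 0
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static int tcp_client_connect(const char *ip, unsigned short port)
{
        struct sockaddr_in dst;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;

        memset(&dst, 0, sizeof(dst));
        dst.sin_family = AF_INET;
        dst.sin_port   = htons(port);
        inet_pton(AF_INET, ip, &dst.sin_addr);

        /* kernel path: __sys_connect() -> inet_stream_connect() -> tcp_v4_connect() */
        if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
                close(fd);
                return -1;
        }
        return fd;
}
#endif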
334 
335 /*
336  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
337  * It can be called through tcp_release_cb() if the socket was owned by the
338  * user at the time tcp_v4_err() was called to handle the ICMP message.
339  */
340 void tcp_v4_mtu_reduced(struct sock *sk)
341 {
342         struct inet_sock *inet = inet_sk(sk);
343         struct dst_entry *dst;
344         u32 mtu;
345 
346         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
347                 return;
348         mtu = tcp_sk(sk)->mtu_info;
349         dst = inet_csk_update_pmtu(sk, mtu);
350         if (!dst)
351                 return;
352 
353         /* Something is about to go wrong... Remember the soft error
354          * in case this connection is not able to recover.
355          */
356         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
357                 sk->sk_err_soft = EMSGSIZE;
358 
359         mtu = dst_mtu(dst);
360 
361         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
362             ip_sk_accept_pmtu(sk) &&
363             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
364                 tcp_sync_mss(sk, mtu);
365 
366                 /* Resend the TCP packet because it's
367                  * clear that the old packet has been
368                  * dropped. This is the new "fast" path mtu
369                  * discovery.
370                  */
371                 tcp_simple_retransmit(sk);
372         } /* else let the usual retransmit timer handle it */
373 }
374 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
375 
376 static void do_redirect(struct sk_buff *skb, struct sock *sk)
377 {
378         struct dst_entry *dst = __sk_dst_check(sk, 0);
379 
380         if (dst)
381                 dst->ops->redirect(dst, sk, skb);
382 }
383 
384 
385 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
386 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
387 {
388         struct request_sock *req = inet_reqsk(sk);
389         struct net *net = sock_net(sk);
390 
391         /* ICMPs are not backlogged, hence we cannot get
392          * an established socket here.
393          */
394         if (seq != tcp_rsk(req)->snt_isn) {
395                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
396         } else if (abort) {
397                 /*
398                  * Still in SYN_RECV, just remove it silently.
399                  * There is no good way to pass the error to the newly
400                  * created socket, and POSIX does not want network
401                  * errors returned from accept().
402                  */
403                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
404                 tcp_listendrop(req->rsk_listener);
405         }
406         reqsk_put(req);
407 }
408 EXPORT_SYMBOL(tcp_req_err);
409 
410 /*
411  * This routine is called by the ICMP module when it gets some
412  * sort of error condition.  If err < 0 then the socket should
413  * be closed and the error returned to the user.  If err > 0
414  * it's just the icmp type << 8 | icmp code.  After adjustment
415  * header points to the first 8 bytes of the tcp header.  We need
416  * to find the appropriate port.
417  *
418  * The locking strategy used here is very "optimistic". When
419  * someone else accesses the socket the ICMP is just dropped
420  * and for some paths there is no check at all.
421  * A more general error queue to queue errors for later handling
422  * is probably better.
423  *
424  */
425 
426 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
427 {
428         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
429         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
430         struct inet_connection_sock *icsk;
431         struct tcp_sock *tp;
432         struct inet_sock *inet;
433         const int type = icmp_hdr(icmp_skb)->type;
434         const int code = icmp_hdr(icmp_skb)->code;
435         struct sock *sk;
436         struct sk_buff *skb;
437         struct request_sock *fastopen;
438         u32 seq, snd_una;
439         s32 remaining;
440         u32 delta_us;
441         int err;
442         struct net *net = dev_net(icmp_skb->dev);
443 
444         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
445                                        th->dest, iph->saddr, ntohs(th->source),
446                                        inet_iif(icmp_skb), 0);
447         if (!sk) {
448                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
449                 return -ENOENT;
450         }
451         if (sk->sk_state == TCP_TIME_WAIT) {
452                 inet_twsk_put(inet_twsk(sk));
453                 return 0;
454         }
455         seq = ntohl(th->seq);
456         if (sk->sk_state == TCP_NEW_SYN_RECV) {
457                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
458                                      type == ICMP_TIME_EXCEEDED ||
459                                      (type == ICMP_DEST_UNREACH &&
460                                       (code == ICMP_NET_UNREACH ||
461                                        code == ICMP_HOST_UNREACH)));
462                 return 0;
463         }
464 
465         bh_lock_sock(sk);
466         /* If too many ICMPs get dropped on busy
467          * servers this needs to be solved differently.
468          * We do take care of the PMTU discovery (RFC1191) special case:
469          * we can receive locally generated ICMP messages while the socket is held.
470          */
471         if (sock_owned_by_user(sk)) {
472                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
473                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
474         }
475         if (sk->sk_state == TCP_CLOSE)
476                 goto out;
477 
478         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
479                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
480                 goto out;
481         }
482 
483         icsk = inet_csk(sk);
484         tp = tcp_sk(sk);
485         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
486         fastopen = tp->fastopen_rsk;
487         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
488         if (sk->sk_state != TCP_LISTEN &&
489             !between(seq, snd_una, tp->snd_nxt)) {
490                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
491                 goto out;
492         }
493 
494         switch (type) {
495         case ICMP_REDIRECT:
496                 if (!sock_owned_by_user(sk))
497                         do_redirect(icmp_skb, sk);
498                 goto out;
499         case ICMP_SOURCE_QUENCH:
500                 /* Just silently ignore these. */
501                 goto out;
502         case ICMP_PARAMETERPROB:
503                 err = EPROTO;
504                 break;
505         case ICMP_DEST_UNREACH:
506                 if (code > NR_ICMP_UNREACH)
507                         goto out;
508 
509                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
510                         /* We are not interested in TCP_LISTEN and open_requests
511                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
512                          * they should go through unfragmented).
513                          */
514                         if (sk->sk_state == TCP_LISTEN)
515                                 goto out;
516 
517                         tp->mtu_info = info;
518                         if (!sock_owned_by_user(sk)) {
519                                 tcp_v4_mtu_reduced(sk);
520                         } else {
521                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
522                                         sock_hold(sk);
523                         }
524                         goto out;
525                 }
526 
527                 err = icmp_err_convert[code].errno;
528                 /* check if icmp_skb allows revert of backoff
529                  * (see draft-zimmermann-tcp-lcd) */
530                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
531                         break;
532                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
533                     !icsk->icsk_backoff || fastopen)
534                         break;
535 
536                 if (sock_owned_by_user(sk))
537                         break;
538 
539                 skb = tcp_rtx_queue_head(sk);
540                 if (WARN_ON_ONCE(!skb))
541                         break;
542 
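                /* Per draft-zimmermann-tcp-lcd: the unreachable ICMP suggests
                 * the retransmission was lost to a connectivity disruption
                 * rather than congestion, so undo one step of exponential
                 * backoff and rearm the retransmit timer with what remains.
                 */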
543                 icsk->icsk_backoff--;
544                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
545                                                TCP_TIMEOUT_INIT;
546                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
547 
548 
549                 tcp_mstamp_refresh(tp);
550                 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
551                 remaining = icsk->icsk_rto -
552                             usecs_to_jiffies(delta_us);
553 
554                 if (remaining > 0) {
555                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
556                                                   remaining, TCP_RTO_MAX);
557                 } else {
558                         /* The RTO revert clocked out the retransmission;
559                          * retransmit now. */
560                         tcp_retransmit_timer(sk);
561                 }
562 
563                 break;
564         case ICMP_TIME_EXCEEDED:
565                 err = EHOSTUNREACH;
566                 break;
567         default:
568                 goto out;
569         }
570 
571         switch (sk->sk_state) {
572         case TCP_SYN_SENT:
573         case TCP_SYN_RECV:
574                 /* Only in fast or simultaneous open. If a fast open socket
575                  * is already accepted it is treated as a connected one below.
576                  */
577                 if (fastopen && !fastopen->sk)
578                         break;
579 
580                 if (!sock_owned_by_user(sk)) {
581                         sk->sk_err = err;
582 
583                         sk->sk_error_report(sk);
584 
585                         tcp_done(sk);
586                 } else {
587                         sk->sk_err_soft = err;
588                 }
589                 goto out;
590         }
591 
592         /* If we've already connected we will keep trying
593          * until we time out, or the user gives up.
594          *
595          * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
596          * to be considered hard errors (well, FRAG_FAILED too,
597          * but it is obsoleted by pmtu discovery).
598          *
599          * Note that in the modern internet, where routing is unreliable
600          * and broken firewalls sit in every dark corner sending random
601          * errors ordered by their masters, even these two messages finally
602          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
603          *
604          * Now we are in compliance with RFCs.
605          *                                                      --ANK (980905)
606          */
607 
608         inet = inet_sk(sk);
609         if (!sock_owned_by_user(sk) && inet->recverr) {
610                 sk->sk_err = err;
611                 sk->sk_error_report(sk);
612         } else  { /* Only an error on timeout */
613                 sk->sk_err_soft = err;
614         }
615 
616 out:
617         bh_unlock_sock(sk);
618         sock_put(sk);
619         return 0;
620 }
621 
622 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
623 {
624         struct tcphdr *th = tcp_hdr(skb);
625 
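        /* Only the pseudo-header sum is stored in th->check here;
         * csum_start/csum_offset tell the NIC (or skb_checksum_help() as a
         * software fallback) where to fold in the rest of the checksum later
         * (CHECKSUM_PARTIAL).
         */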
626         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
627         skb->csum_start = skb_transport_header(skb) - skb->head;
628         skb->csum_offset = offsetof(struct tcphdr, check);
629 }
630 
631 /* This routine computes an IPv4 TCP checksum. */
632 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
633 {
634         const struct inet_sock *inet = inet_sk(sk);
635 
636         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
637 }
638 EXPORT_SYMBOL(tcp_v4_send_check);
639 
640 /*
641  *      This routine will send an RST to the other tcp.
642  *
643  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
644  *                    for the reset.
645  *      Answer: if a packet caused the RST, it is not for a socket
646  *              existing in our system; if it is matched to a socket,
647  *              it is just a duplicate segment or a bug in the other side's
648  *              TCP. So we build the reply based only on parameters that
649  *              arrived with the segment.
650  *      Exception: precedence violation. We do not implement it in any case.
651  */
652 
653 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
654 {
655         const struct tcphdr *th = tcp_hdr(skb);
656         struct {
657                 struct tcphdr th;
658 #ifdef CONFIG_TCP_MD5SIG
659                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
660 #endif
661         } rep;
662         struct ip_reply_arg arg;
663 #ifdef CONFIG_TCP_MD5SIG
664         struct tcp_md5sig_key *key = NULL;
665         const __u8 *hash_location = NULL;
666         unsigned char newhash[16];
667         int genhash;
668         struct sock *sk1 = NULL;
669 #endif
670         struct net *net;
671         struct sock *ctl_sk;
672 
673         /* Never send a reset in response to a reset. */
674         if (th->rst)
675                 return;
676 
677         /* If sk is not NULL, it means we did a successful lookup and the
678          * incoming route had to be correct. prequeue might have dropped our dst.
679          */
680         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
681                 return;
682 
683         /* Swap the send and the receive. */
684         memset(&rep, 0, sizeof(rep));
685         rep.th.dest   = th->source;
686         rep.th.source = th->dest;
687         rep.th.doff   = sizeof(struct tcphdr) / 4;
688         rep.th.rst    = 1;
689 
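        /* RFC 793 reset generation: if the offending segment carried an ACK,
         * the RST takes its sequence number from that ACK and needs no ACK of
         * its own; otherwise the RST has SEQ=0 and acknowledges everything the
         * segment occupied (SYN and FIN each count as one sequence number).
         */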
690         if (th->ack) {
691                 rep.th.seq = th->ack_seq;
692         } else {
693                 rep.th.ack = 1;
694                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
695                                        skb->len - (th->doff << 2));
696         }
697 
698         memset(&arg, 0, sizeof(arg));
699         arg.iov[0].iov_base = (unsigned char *)&rep;
700         arg.iov[0].iov_len  = sizeof(rep.th);
701 
702         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
703 #ifdef CONFIG_TCP_MD5SIG
704         rcu_read_lock();
705         hash_location = tcp_parse_md5sig_option(th);
706         if (sk && sk_fullsock(sk)) {
707                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
708                                         &ip_hdr(skb)->saddr, AF_INET);
709         } else if (hash_location) {
710                 /*
711                  * The active side is lost. Try to find the listening socket
712                  * through the source port, and then find the md5 key through
713                  * the listening socket. We do not lose security here:
714                  * the incoming packet is checked against the md5 hash of the
715                  * key we find, and no RST is generated if the hash doesn't match.
716                  */
717                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
718                                              ip_hdr(skb)->saddr,
719                                              th->source, ip_hdr(skb)->daddr,
720                                              ntohs(th->source), inet_iif(skb),
721                                              tcp_v4_sdif(skb));
722                 /* don't send rst if it can't find key */
723                 if (!sk1)
724                         goto out;
725 
726                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
727                                         &ip_hdr(skb)->saddr, AF_INET);
728                 if (!key)
729                         goto out;
730 
731 
732                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
733                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
734                         goto out;
735 
736         }
737 
738         if (key) {
739                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
740                                    (TCPOPT_NOP << 16) |
741                                    (TCPOPT_MD5SIG << 8) |
742                                    TCPOLEN_MD5SIG);
743                 /* Update length and the length the header thinks exists */
744                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
745                 rep.th.doff = arg.iov[0].iov_len / 4;
746 
747                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
748                                      key, ip_hdr(skb)->saddr,
749                                      ip_hdr(skb)->daddr, &rep.th);
750         }
751 #endif
752         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
753                                       ip_hdr(skb)->saddr, /* XXX */
754                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
755         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
756         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
757 
758         /* When the socket is gone, all binding information is lost and
759          * routing might fail in this case. No choice here: if we choose to
760          * force the input interface, we will misroute in case of an asymmetric route.
761          */
762         if (sk) {
763                 arg.bound_dev_if = sk->sk_bound_dev_if;
764                 if (sk_fullsock(sk))
765                         trace_tcp_send_reset(sk, skb);
766         }
767 
768         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
769                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
770 
771         arg.tos = ip_hdr(skb)->tos;
772         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
773         local_bh_disable();
774         ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
775         if (sk)
776                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
777                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
778         ip_send_unicast_reply(ctl_sk,
779                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
780                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
781                               &arg, arg.iov[0].iov_len);
782 
783         ctl_sk->sk_mark = 0;
784         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
785         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
786         local_bh_enable();
787 
788 #ifdef CONFIG_TCP_MD5SIG
789 out:
790         rcu_read_unlock();
791 #endif
792 }
793 
794 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states outside
795    socket context, is certainly ugly. What can I do?
796  */
797 
798 static void tcp_v4_send_ack(const struct sock *sk,
799                             struct sk_buff *skb, u32 seq, u32 ack,
800                             u32 win, u32 tsval, u32 tsecr, int oif,
801                             struct tcp_md5sig_key *key,
802                             int reply_flags, u8 tos)
803 {
804         const struct tcphdr *th = tcp_hdr(skb);
805         struct {
806                 struct tcphdr th;
807                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
808 #ifdef CONFIG_TCP_MD5SIG
809                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
810 #endif
811                         ];
812         } rep;
813         struct net *net = sock_net(sk);
814         struct ip_reply_arg arg;
815         struct sock *ctl_sk;
816 
817         memset(&rep.th, 0, sizeof(struct tcphdr));
818         memset(&arg, 0, sizeof(arg));
819 
820         arg.iov[0].iov_base = (unsigned char *)&rep;
821         arg.iov[0].iov_len  = sizeof(rep.th);
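        /* When echoing a timestamp, lay the option out as NOP, NOP, TIMESTAMP
         * so the 10-byte option stays 32-bit aligned.
         */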
822         if (tsecr) {
823                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
824                                    (TCPOPT_TIMESTAMP << 8) |
825                                    TCPOLEN_TIMESTAMP);
826                 rep.opt[1] = htonl(tsval);
827                 rep.opt[2] = htonl(tsecr);
828                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
829         }
830 
831         /* Swap the send and the receive. */
832         rep.th.dest    = th->source;
833         rep.th.source  = th->dest;
834         rep.th.doff    = arg.iov[0].iov_len / 4;
835         rep.th.seq     = htonl(seq);
836         rep.th.ack_seq = htonl(ack);
837         rep.th.ack     = 1;
838         rep.th.window  = htons(win);
839 
840 #ifdef CONFIG_TCP_MD5SIG
841         if (key) {
842                 int offset = (tsecr) ? 3 : 0;
843 
844                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
845                                           (TCPOPT_NOP << 16) |
846                                           (TCPOPT_MD5SIG << 8) |
847                                           TCPOLEN_MD5SIG);
848                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
849                 rep.th.doff = arg.iov[0].iov_len/4;
850 
851                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
852                                     key, ip_hdr(skb)->saddr,
853                                     ip_hdr(skb)->daddr, &rep.th);
854         }
855 #endif
856         arg.flags = reply_flags;
857         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
858                                       ip_hdr(skb)->saddr, /* XXX */
859                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
860         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
861         if (oif)
862                 arg.bound_dev_if = oif;
863         arg.tos = tos;
864         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
865         local_bh_disable();
866         ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
867         if (sk)
868                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
869                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
870         ip_send_unicast_reply(ctl_sk,
871                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
872                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
873                               &arg, arg.iov[0].iov_len);
874 
875         ctl_sk->sk_mark = 0;
876         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
877         local_bh_enable();
878 }
879 
880 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
881 {
882         struct inet_timewait_sock *tw = inet_twsk(sk);
883         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
884 
885         tcp_v4_send_ack(sk, skb,
886                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
887                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
888                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
889                         tcptw->tw_ts_recent,
890                         tw->tw_bound_dev_if,
891                         tcp_twsk_md5_key(tcptw),
892                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
893                         tw->tw_tos
894                         );
895 
896         inet_twsk_put(tw);
897 }
898 
899 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
900                                   struct request_sock *req)
901 {
902         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
903          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
904          */
905         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
906                                              tcp_sk(sk)->snd_nxt;
907 
908         /* RFC 7323 2.3
909          * The window field (SEG.WND) of every outgoing segment, with the
910          * exception of <SYN> segments, MUST be right-shifted by
911          * Rcv.Wind.Shift bits:
912          */
913         tcp_v4_send_ack(sk, skb, seq,
914                         tcp_rsk(req)->rcv_nxt,
915                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
916                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
917                         req->ts_recent,
918                         0,
919                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
920                                           AF_INET),
921                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
922                         ip_hdr(skb)->tos);
923 }
924 
925 /*
926  *      Send a SYN-ACK after having received a SYN.
927  *      This still operates on a request_sock only, not on a big
928  *      socket.
929  */
930 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
931                               struct flowi *fl,
932                               struct request_sock *req,
933                               struct tcp_fastopen_cookie *foc,
934                               enum tcp_synack_type synack_type)
935 {
936         const struct inet_request_sock *ireq = inet_rsk(req);
937         struct flowi4 fl4;
938         int err = -1;
939         struct sk_buff *skb;
940 
941         /* First, grab a route. */
942         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
943                 return -1;
944 
945         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
946 
947         if (skb) {
948                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
949 
950                 rcu_read_lock();
951                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
952                                             ireq->ir_rmt_addr,
953                                             rcu_dereference(ireq->ireq_opt));
954                 rcu_read_unlock();
955                 err = net_xmit_eval(err);
956         }
957 
958         return err;
959 }
960 
961 /*
962  *      IPv4 request_sock destructor.
963  */
964 static void tcp_v4_reqsk_destructor(struct request_sock *req)
965 {
966         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
967 }
968 
969 #ifdef CONFIG_TCP_MD5SIG
970 /*
971  * RFC2385 MD5 checksumming requires a mapping of
972  * IP address->MD5 Key.
973  * We need to maintain these in the sk structure.
974  */
975 
976 struct static_key tcp_md5_needed __read_mostly;
977 EXPORT_SYMBOL(tcp_md5_needed);
978 
979 /* Find the Key structure for an address.  */
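/* Longest-prefix match, e.g. with keys configured for 10.0.0.0/8 and
 * 10.1.2.3/32, a lookup for peer 10.1.2.3 returns the /32 key while
 * 10.9.9.9 falls back to the /8 key.
 */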
980 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
981                                            const union tcp_md5_addr *addr,
982                                            int family)
983 {
984         const struct tcp_sock *tp = tcp_sk(sk);
985         struct tcp_md5sig_key *key;
986         const struct tcp_md5sig_info *md5sig;
987         __be32 mask;
988         struct tcp_md5sig_key *best_match = NULL;
989         bool match;
990 
991         /* caller either holds rcu_read_lock() or socket lock */
992         md5sig = rcu_dereference_check(tp->md5sig_info,
993                                        lockdep_sock_is_held(sk));
994         if (!md5sig)
995                 return NULL;
996 
997         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
998                 if (key->family != family)
999                         continue;
1000 
1001                 if (family == AF_INET) {
1002                         mask = inet_make_mask(key->prefixlen);
1003                         match = (key->addr.a4.s_addr & mask) ==
1004                                 (addr->a4.s_addr & mask);
1005 #if IS_ENABLED(CONFIG_IPV6)
1006                 } else if (family == AF_INET6) {
1007                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1008                                                   key->prefixlen);
1009 #endif
1010                 } else {
1011                         match = false;
1012                 }
1013 
1014                 if (match && (!best_match ||
1015                               key->prefixlen > best_match->prefixlen))
1016                         best_match = key;
1017         }
1018         return best_match;
1019 }
1020 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1021 
1022 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1023                                                       const union tcp_md5_addr *addr,
1024                                                       int family, u8 prefixlen)
1025 {
1026         const struct tcp_sock *tp = tcp_sk(sk);
1027         struct tcp_md5sig_key *key;
1028         unsigned int size = sizeof(struct in_addr);
1029         const struct tcp_md5sig_info *md5sig;
1030 
1031         /* caller either holds rcu_read_lock() or socket lock */
1032         md5sig = rcu_dereference_check(tp->md5sig_info,
1033                                        lockdep_sock_is_held(sk));
1034         if (!md5sig)
1035                 return NULL;
1036 #if IS_ENABLED(CONFIG_IPV6)
1037         if (family == AF_INET6)
1038                 size = sizeof(struct in6_addr);
1039 #endif
1040         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1041                 if (key->family != family)
1042                         continue;
1043                 if (!memcmp(&key->addr, addr, size) &&
1044                     key->prefixlen == prefixlen)
1045                         return key;
1046         }
1047         return NULL;
1048 }
1049 
1050 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1051                                          const struct sock *addr_sk)
1052 {
1053         const union tcp_md5_addr *addr;
1054 
1055         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1056         return tcp_md5_do_lookup(sk, addr, AF_INET);
1057 }
1058 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1059 
1060 /* This can be called on a newly created socket, from other files */
1061 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1062                    int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1063                    gfp_t gfp)
1064 {
1065         /* Add Key to the list */
1066         struct tcp_md5sig_key *key;
1067         struct tcp_sock *tp = tcp_sk(sk);
1068         struct tcp_md5sig_info *md5sig;
1069 
1070         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1071         if (key) {
1072                 /* Pre-existing entry - just update that one. */
1073                 memcpy(key->key, newkey, newkeylen);
1074                 key->keylen = newkeylen;
1075                 return 0;
1076         }
1077 
1078         md5sig = rcu_dereference_protected(tp->md5sig_info,
1079                                            lockdep_sock_is_held(sk));
1080         if (!md5sig) {
1081                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1082                 if (!md5sig)
1083                         return -ENOMEM;
1084 
1085                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1086                 INIT_HLIST_HEAD(&md5sig->head);
1087                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1088         }
1089 
1090         key = sock_kmalloc(sk, sizeof(*key), gfp);
1091         if (!key)
1092                 return -ENOMEM;
1093         if (!tcp_alloc_md5sig_pool()) {
1094                 sock_kfree_s(sk, key, sizeof(*key));
1095                 return -ENOMEM;
1096         }
1097 
1098         memcpy(key->key, newkey, newkeylen);
1099         key->keylen = newkeylen;
1100         key->family = family;
1101         key->prefixlen = prefixlen;
1102         memcpy(&key->addr, addr,
1103                (family == AF_INET6) ? sizeof(struct in6_addr) :
1104                                       sizeof(struct in_addr));
1105         hlist_add_head_rcu(&key->node, &md5sig->head);
1106         return 0;
1107 }
1108 EXPORT_SYMBOL(tcp_md5_do_add);
1109 
1110 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1111                    u8 prefixlen)
1112 {
1113         struct tcp_md5sig_key *key;
1114 
1115         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1116         if (!key)
1117                 return -ENOENT;
1118         hlist_del_rcu(&key->node);
1119         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1120         kfree_rcu(key, rcu);
1121         return 0;
1122 }
1123 EXPORT_SYMBOL(tcp_md5_do_del);
1124 
1125 static void tcp_clear_md5_list(struct sock *sk)
1126 {
1127         struct tcp_sock *tp = tcp_sk(sk);
1128         struct tcp_md5sig_key *key;
1129         struct hlist_node *n;
1130         struct tcp_md5sig_info *md5sig;
1131 
1132         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1133 
1134         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1135                 hlist_del_rcu(&key->node);
1136                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1137                 kfree_rcu(key, rcu);
1138         }
1139 }
1140 
1141 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1142                                  char __user *optval, int optlen)
1143 {
1144         struct tcp_md5sig cmd;
1145         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1146         u8 prefixlen = 32;
1147 
1148         if (optlen < sizeof(cmd))
1149                 return -EINVAL;
1150 
1151         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1152                 return -EFAULT;
1153 
1154         if (sin->sin_family != AF_INET)
1155                 return -EINVAL;
1156 
1157         if (optname == TCP_MD5SIG_EXT &&
1158             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1159                 prefixlen = cmd.tcpm_prefixlen;
1160                 if (prefixlen > 32)
1161                         return -EINVAL;
1162         }
1163 
1164         if (!cmd.tcpm_keylen)
1165                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1166                                       AF_INET, prefixlen);
1167 
1168         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1169                 return -EINVAL;
1170 
1171         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1172                               AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1173                               GFP_KERNEL);
1174 }
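
/* A minimal user-space sketch (illustrative only, not part of this file):
 * installing a TCP-MD5 key for a peer with setsockopt(TCP_MD5SIG), which
 * lands in tcp_v4_parse_md5_keys() above.  Uses struct tcp_md5sig from the
 * UAPI header <linux/tcp.h>.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>          /* struct tcp_md5sig, TCP_MD5SIG */

static int set_tcp_md5_key(int fd, const struct sockaddr_in *peer,
                           const void *key, unsigned int keylen)
{
        struct tcp_md5sig md5;

        if (keylen > TCP_MD5SIG_MAXKEYLEN)
                return -1;

        memset(&md5, 0, sizeof(md5));
        memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
        md5.tcpm_keylen = keylen;
        memcpy(md5.tcpm_key, key, keylen);

        /* a zero tcpm_keylen would instead delete the key (tcp_md5_do_del()) */
        return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif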
1175 
1176 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1177                                    __be32 daddr, __be32 saddr,
1178                                    const struct tcphdr *th, int nbytes)
1179 {
1180         struct tcp4_pseudohdr *bp;
1181         struct scatterlist sg;
1182         struct tcphdr *_th;
1183 
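        /* RFC 2385: the MD5 digest covers a pseudo-header (saddr, daddr, zero,
         * protocol, segment length), the TCP header with its checksum zeroed,
         * the payload and finally the key.  This helper feeds the first two
         * pieces into the hash; callers add the data and the key.
         */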
1184         bp = hp->scratch;
1185         bp->saddr = saddr;
1186         bp->daddr = daddr;
1187         bp->pad = 0;
1188         bp->protocol = IPPROTO_TCP;
1189         bp->len = cpu_to_be16(nbytes);
1190 
1191         _th = (struct tcphdr *)(bp + 1);
1192         memcpy(_th, th, sizeof(*th));
1193         _th->check = 0;
1194 
1195         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1196         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1197                                 sizeof(*bp) + sizeof(*th));
1198         return crypto_ahash_update(hp->md5_req);
1199 }
1200 
1201 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1202                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1203 {
1204         struct tcp_md5sig_pool *hp;
1205         struct ahash_request *req;
1206 
1207         hp = tcp_get_md5sig_pool();
1208         if (!hp)
1209                 goto clear_hash_noput;
1210         req = hp->md5_req;
1211 
1212         if (crypto_ahash_init(req))
1213                 goto clear_hash;
1214         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1215                 goto clear_hash;
1216         if (tcp_md5_hash_key(hp, key))
1217                 goto clear_hash;
1218         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1219         if (crypto_ahash_final(req))
1220                 goto clear_hash;
1221 
1222         tcp_put_md5sig_pool();
1223         return 0;
1224 
1225 clear_hash:
1226         tcp_put_md5sig_pool();
1227 clear_hash_noput:
1228         memset(md5_hash, 0, 16);
1229         return 1;
1230 }
1231 
1232 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1233                         const struct sock *sk,
1234                         const struct sk_buff *skb)
1235 {
1236         struct tcp_md5sig_pool *hp;
1237         struct ahash_request *req;
1238         const struct tcphdr *th = tcp_hdr(skb);
1239         __be32 saddr, daddr;
1240 
1241         if (sk) { /* valid for establish/request sockets */
1242                 saddr = sk->sk_rcv_saddr;
1243                 daddr = sk->sk_daddr;
1244         } else {
1245                 const struct iphdr *iph = ip_hdr(skb);
1246                 saddr = iph->saddr;
1247                 daddr = iph->daddr;
1248         }
1249 
1250         hp = tcp_get_md5sig_pool();
1251         if (!hp)
1252                 goto clear_hash_noput;
1253         req = hp->md5_req;
1254 
1255         if (crypto_ahash_init(req))
1256                 goto clear_hash;
1257 
1258         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1259                 goto clear_hash;
1260         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1261                 goto clear_hash;
1262         if (tcp_md5_hash_key(hp, key))
1263                 goto clear_hash;
1264         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1265         if (crypto_ahash_final(req))
1266                 goto clear_hash;
1267 
1268         tcp_put_md5sig_pool();
1269         return 0;
1270 
1271 clear_hash:
1272         tcp_put_md5sig_pool();
1273 clear_hash_noput:
1274         memset(md5_hash, 0, 16);
1275         return 1;
1276 }
1277 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1278 
1279 #endif
1280 
1281 /* Called with rcu_read_lock() */
1282 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1283                                     const struct sk_buff *skb)
1284 {
1285 #ifdef CONFIG_TCP_MD5SIG
1286         /*
1287          * This gets called for each TCP segment that arrives
1288          * so we want to be efficient.
1289          * We have 3 drop cases:
1290          * o No MD5 hash and one expected.
1291          * o MD5 hash and we're not expecting one.
1292          * o MD5 hash and it's wrong.
1293          */
1294         const __u8 *hash_location = NULL;
1295         struct tcp_md5sig_key *hash_expected;
1296         const struct iphdr *iph = ip_hdr(skb);
1297         const struct tcphdr *th = tcp_hdr(skb);
1298         int genhash;
1299         unsigned char newhash[16];
1300 
1301         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1302                                           AF_INET);
1303         hash_location = tcp_parse_md5sig_option(th);
1304 
1305         /* We've parsed the options - do we have a hash? */
1306         if (!hash_expected && !hash_location)
1307                 return false;
1308 
1309         if (hash_expected && !hash_location) {
1310                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1311                 return true;
1312         }
1313 
1314         if (!hash_expected && hash_location) {
1315                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1316                 return true;
1317         }
1318 
1319         /* Okay, so this is hash_expected and hash_location -
1320          * so we need to calculate the checksum.
1321          */
1322         genhash = tcp_v4_md5_hash_skb(newhash,
1323                                       hash_expected,
1324                                       NULL, skb);
1325 
1326         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1327                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1328                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1329                                      &iph->saddr, ntohs(th->source),
1330                                      &iph->daddr, ntohs(th->dest),
1331                                      genhash ? " tcp_v4_calc_md5_hash failed"
1332                                      : "");
1333                 return true;
1334         }
1335         return false;
1336 #endif
1337         return false;
1338 }
1339 
1340 static void tcp_v4_init_req(struct request_sock *req,
1341                             const struct sock *sk_listener,
1342                             struct sk_buff *skb)
1343 {
1344         struct inet_request_sock *ireq = inet_rsk(req);
1345         struct net *net = sock_net(sk_listener);
1346 
1347         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1348         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1349         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1350 }
1351 
1352 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1353                                           struct flowi *fl,
1354                                           const struct request_sock *req)
1355 {
1356         return inet_csk_route_req(sk, &fl->u.ip4, req);
1357 }
1358 
1359 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1360         .family         =       PF_INET,
1361         .obj_size       =       sizeof(struct tcp_request_sock),
1362         .rtx_syn_ack    =       tcp_rtx_synack,
1363         .send_ack       =       tcp_v4_reqsk_send_ack,
1364         .destructor     =       tcp_v4_reqsk_destructor,
1365         .send_reset     =       tcp_v4_send_reset,
1366         .syn_ack_timeout =      tcp_syn_ack_timeout,
1367 };
1368 
1369 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1370         .mss_clamp      =       TCP_MSS_DEFAULT,
1371 #ifdef CONFIG_TCP_MD5SIG
1372         .req_md5_lookup =       tcp_v4_md5_lookup,
1373         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1374 #endif
1375         .init_req       =       tcp_v4_init_req,
1376 #ifdef CONFIG_SYN_COOKIES
1377         .cookie_init_seq =      cookie_v4_init_sequence,
1378 #endif
1379         .route_req      =       tcp_v4_route_req,
1380         .init_seq       =       tcp_v4_init_seq,
1381         .init_ts_off    =       tcp_v4_init_ts_off,
1382         .send_synack    =       tcp_v4_send_synack,
1383 };
1384 
1385 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1386 {
1387         /* Never answer to SYNs sent to broadcast or multicast */
1388         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1389                 goto drop;
1390 
1391         return tcp_conn_request(&tcp_request_sock_ops,
1392                                 &tcp_request_sock_ipv4_ops, sk, skb);
1393 
1394 drop:
1395         tcp_listendrop(sk);
1396         return 0;
1397 }
1398 EXPORT_SYMBOL(tcp_v4_conn_request);
1399 
1400 
1401 /*
1402  * The three way handshake has completed - we got a valid synack -
1403  * now create the new socket.
1404  */
1405 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1406                                   struct request_sock *req,
1407                                   struct dst_entry *dst,
1408                                   struct request_sock *req_unhash,
1409                                   bool *own_req)
1410 {
1411         struct inet_request_sock *ireq;
1412         struct inet_sock *newinet;
1413         struct tcp_sock *newtp;
1414         struct sock *newsk;
1415 #ifdef CONFIG_TCP_MD5SIG
1416         struct tcp_md5sig_key *key;
1417 #endif
1418         struct ip_options_rcu *inet_opt;
1419 
1420         if (sk_acceptq_is_full(sk))
1421                 goto exit_overflow;
1422 
1423         newsk = tcp_create_openreq_child(sk, req, skb);
1424         if (!newsk)
1425                 goto exit_nonewsk;
1426 
1427         newsk->sk_gso_type = SKB_GSO_TCPV4;
1428         inet_sk_rx_dst_set(newsk, skb);
1429 
1430         newtp                 = tcp_sk(newsk);
1431         newinet               = inet_sk(newsk);
1432         ireq                  = inet_rsk(req);
1433         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1434         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1435         newsk->sk_bound_dev_if = ireq->ir_iif;
1436         newinet->inet_saddr   = ireq->ir_loc_addr;
1437         inet_opt              = rcu_dereference(ireq->ireq_opt);
1438         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1439         newinet->mc_index     = inet_iif(skb);
1440         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1441         newinet->rcv_tos      = ip_hdr(skb)->tos;
1442         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1443         if (inet_opt)
1444                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1445         newinet->inet_id = newtp->write_seq ^ jiffies;
1446 
1447         if (!dst) {
1448                 dst = inet_csk_route_child_sock(sk, newsk, req);
1449                 if (!dst)
1450                         goto put_and_exit;
1451         } else {
1452                 /* syncookie case : see end of cookie_v4_check() */
1453         }
1454         sk_setup_caps(newsk, dst);
1455 
1456         tcp_ca_openreq_child(newsk, dst);
1457 
1458         tcp_sync_mss(newsk, dst_mtu(dst));
1459         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1460 
1461         tcp_initialize_rcv_mss(newsk);
1462 
1463 #ifdef CONFIG_TCP_MD5SIG
1464         /* Copy over the MD5 key from the original socket */
1465         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1466                                 AF_INET);
1467         if (key) {
1468                 /*
1469                  * We're using one, so create a matching key
1470                  * on the newsk structure. If we fail to get
1471                  * memory, then we end up not copying the key
1472                  * across. Shucks.
1473                  */
1474                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1475                                AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1476                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1477         }
1478 #endif
1479 
1480         if (__inet_inherit_port(sk, newsk) < 0)
1481                 goto put_and_exit;
1482         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1483         if (likely(*own_req)) {
1484                 tcp_move_syn(newtp, req);
1485                 ireq->ireq_opt = NULL;
1486         } else {
1487                 newinet->inet_opt = NULL;
1488         }
1489         return newsk;
1490 
1491 exit_overflow:
1492         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1493 exit_nonewsk:
1494         dst_release(dst);
1495 exit:
1496         tcp_listendrop(sk);
1497         return NULL;
1498 put_and_exit:
1499         newinet->inet_opt = NULL;
1500         inet_csk_prepare_forced_close(newsk);
1501         tcp_done(newsk);
1502         goto exit;
1503 }
1504 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1505 
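/*
 * Illustrative userspace sketch (not part of this file): the accept-queue
 * bound checked by sk_acceptq_is_full() above comes from the backlog value
 * passed to listen(); once it is exceeded, tcp_v4_syn_recv_sock() takes the
 * exit_overflow path and LINUX_MIB_LISTENOVERFLOWS is bumped.  Port 8080 and
 * the backlog of 16 are arbitrary example values.
 */
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

static int make_listener(void)
{
        struct sockaddr_in addr;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;

        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_addr.s_addr = htonl(INADDR_ANY);
        addr.sin_port = htons(8080);

        /* roughly 16 established-but-unaccepted connections may queue up */
        if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
            listen(fd, 16) < 0) {
                close(fd);
                return -1;
        }
        return fd;
}
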
1506 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1507 {
1508 #ifdef CONFIG_SYN_COOKIES
1509         const struct tcphdr *th = tcp_hdr(skb);
1510 
1511         if (!th->syn)
1512                 sk = cookie_v4_check(sk, skb);
1513 #endif
1514         return sk;
1515 }
1516 
1517 /* The socket must have its spinlock held when we get
1518  * here, unless it is a TCP_LISTEN socket.
1519  *
1520  * We have a potential double-lock case here, so even when
1521  * doing backlog processing we use the BH locking scheme.
1522  * This is because we cannot sleep with the original spinlock
1523  * held.
1524  */
1525 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1526 {
1527         struct sock *rsk;
1528 
1529         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1530                 struct dst_entry *dst = sk->sk_rx_dst;
1531 
1532                 sock_rps_save_rxhash(sk, skb);
1533                 sk_mark_napi_id(sk, skb);
1534                 if (dst) {
1535                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1536                             !dst->ops->check(dst, 0)) {
1537                                 dst_release(dst);
1538                                 sk->sk_rx_dst = NULL;
1539                         }
1540                 }
1541                 tcp_rcv_established(sk, skb);
1542                 return 0;
1543         }
1544 
1545         if (tcp_checksum_complete(skb))
1546                 goto csum_err;
1547 
1548         if (sk->sk_state == TCP_LISTEN) {
1549                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1550 
1551                 if (!nsk)
1552                         goto discard;
1553                 if (nsk != sk) {
1554                         if (tcp_child_process(sk, nsk, skb)) {
1555                                 rsk = nsk;
1556                                 goto reset;
1557                         }
1558                         return 0;
1559                 }
1560         } else
1561                 sock_rps_save_rxhash(sk, skb);
1562 
1563         if (tcp_rcv_state_process(sk, skb)) {
1564                 rsk = sk;
1565                 goto reset;
1566         }
1567         return 0;
1568 
1569 reset:
1570         tcp_v4_send_reset(rsk, skb);
1571 discard:
1572         kfree_skb(skb);
1573         /* Be careful here. If this function gets more complicated and
1574          * gcc suffers from register pressure on the x86, sk (in %ebx)
1575          * might be destroyed here. This current version compiles correctly,
1576          * but you have been warned.
1577          */
1578         return 0;
1579 
1580 csum_err:
1581         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1582         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1583         goto discard;
1584 }
1585 EXPORT_SYMBOL(tcp_v4_do_rcv);
1586 
1587 int tcp_v4_early_demux(struct sk_buff *skb)
1588 {
1589         const struct iphdr *iph;
1590         const struct tcphdr *th;
1591         struct sock *sk;
1592 
1593         if (skb->pkt_type != PACKET_HOST)
1594                 return 0;
1595 
1596         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1597                 return 0;
1598 
1599         iph = ip_hdr(skb);
1600         th = tcp_hdr(skb);
1601 
1602         if (th->doff < sizeof(struct tcphdr) / 4)
1603                 return 0;
1604 
1605         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1606                                        iph->saddr, th->source,
1607                                        iph->daddr, ntohs(th->dest),
1608                                        skb->skb_iif, inet_sdif(skb));
1609         if (sk) {
1610                 skb->sk = sk;
1611                 skb->destructor = sock_edemux;
1612                 if (sk_fullsock(sk)) {
1613                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1614 
1615                         if (dst)
1616                                 dst = dst_check(dst, 0);
1617                         if (dst &&
1618                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1619                                 skb_dst_set_noref(skb, dst);
1620                 }
1621         }
1622         return 0;
1623 }
1624 
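/*
 * Illustrative userspace sketch (not part of this file): tcp_v4_early_demux()
 * above only runs when early demux is enabled; the knob is assumed to live at
 * /proc/sys/net/ipv4/tcp_early_demux (i.e. the net.ipv4.tcp_early_demux sysctl).
 */
#include <fcntl.h>
#include <unistd.h>

static int set_tcp_early_demux(int enable)
{
        int fd = open("/proc/sys/net/ipv4/tcp_early_demux", O_WRONLY);
        ssize_t ret;

        if (fd < 0)
                return -1;
        ret = write(fd, enable ? "1\n" : "0\n", 2);
        close(fd);
        return ret == 2 ? 0 : -1;
}
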
1625 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1626 {
1627         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1628         struct skb_shared_info *shinfo;
1629         const struct tcphdr *th;
1630         struct tcphdr *thtail;
1631         struct sk_buff *tail;
1632         unsigned int hdrlen;
1633         bool fragstolen;
1634         u32 gso_segs;
1635         int delta;
1636 
1637         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1638          * we can fix skb->truesize to its real value to avoid future drops.
1639          * This is valid because skb is not yet charged to the socket.
1640          * It has been noticed that pure SACK packets were sometimes dropped
1641          * (if cooked by drivers without the copybreak feature).
1642          */
1643         skb_condense(skb);
1644 
1645         skb_dst_drop(skb);
1646 
1647         if (unlikely(tcp_checksum_complete(skb))) {
1648                 bh_unlock_sock(sk);
1649                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1650                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1651                 return true;
1652         }
1653 
1654         /* Attempt coalescing to last skb in backlog, even if we are
1655          * above the limits.
1656          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1657          */
1658         th = (const struct tcphdr *)skb->data;
1659         hdrlen = th->doff * 4;
1660         shinfo = skb_shinfo(skb);
1661 
1662         if (!shinfo->gso_size)
1663                 shinfo->gso_size = skb->len - hdrlen;
1664 
1665         if (!shinfo->gso_segs)
1666                 shinfo->gso_segs = 1;
1667 
1668         tail = sk->sk_backlog.tail;
1669         if (!tail)
1670                 goto no_coalesce;
1671         thtail = (struct tcphdr *)tail->data;
1672 
1673         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1674             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1675             ((TCP_SKB_CB(tail)->tcp_flags |
1676               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1677             !((TCP_SKB_CB(tail)->tcp_flags &
1678               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1679             ((TCP_SKB_CB(tail)->tcp_flags ^
1680               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1681 #ifdef CONFIG_TLS_DEVICE
1682             tail->decrypted != skb->decrypted ||
1683 #endif
1684             thtail->doff != th->doff ||
1685             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1686                 goto no_coalesce;
1687 
1688         __skb_pull(skb, hdrlen);
1689         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1690                 thtail->window = th->window;
1691 
1692                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1693 
1694                 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1695                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1696 
1697                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1698                  * thtail->fin, so that the fast path in tcp_rcv_established()
1699                  * is not entered if we append a packet with a FIN.
1700                  * SYN, RST, URG are not present.
1701                  * ACK is set on both packets.
1702                  * PSH : we do not really care in TCP stack,
1703                  *       at least for 'GRO' packets.
1704                  */
1705                 thtail->fin |= th->fin;
1706                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1707 
1708                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1709                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1710                         tail->tstamp = skb->tstamp;
1711                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1712                 }
1713 
1714                 /* Not as strict as GRO. We only need to carry mss max value */
1715                 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1716                                                  skb_shinfo(tail)->gso_size);
1717 
1718                 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1719                 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1720 
1721                 sk->sk_backlog.len += delta;
1722                 __NET_INC_STATS(sock_net(sk),
1723                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1724                 kfree_skb_partial(skb, fragstolen);
1725                 return false;
1726         }
1727         __skb_push(skb, hdrlen);
1728 
1729 no_coalesce:
1730         /* Only the socket owner can try to collapse/prune rx queues
1731          * to reduce memory overhead, so add a little headroom here.
1732          * Few socket backlogs are likely to be non-empty at the same time.
1733          */
1734         limit += 64*1024;
1735 
1736         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1737                 bh_unlock_sock(sk);
1738                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1739                 return true;
1740         }
1741         return false;
1742 }
1743 EXPORT_SYMBOL(tcp_add_backlog);
1744 
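/*
 * Illustrative userspace sketch (not part of this file): the backlog limit
 * used by tcp_add_backlog() above is sk_rcvbuf + sk_sndbuf (plus a 64KB
 * headroom), so the buffer sizes requested here bound how much can be queued
 * while the socket is owned by the process.  256KB is an arbitrary example;
 * the kernel doubles the requested value and clamps it against the
 * rmem/wmem sysctl limits.
 */
#include <sys/socket.h>

static int size_socket_buffers(int fd)
{
        int bytes = 256 * 1024;

        if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bytes, sizeof(bytes)) < 0)
                return -1;
        return setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &bytes, sizeof(bytes));
}
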
1745 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1746 {
1747         struct tcphdr *th = (struct tcphdr *)skb->data;
1748 
1749         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1750 }
1751 EXPORT_SYMBOL(tcp_filter);
1752 
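/*
 * Illustrative userspace sketch (not part of this file): tcp_filter() above
 * runs whatever socket filter is attached, via sk_filter_trim_cap(), with the
 * trim capped at the TCP header length.  A minimal classic-BPF "accept all"
 * program attached with SO_ATTACH_FILTER looks like this.
 */
#include <linux/filter.h>
#include <sys/socket.h>

static int attach_accept_all_filter(int fd)
{
        /* BPF_RET|BPF_K with k = 0xffffffff: keep the whole packet. */
        struct sock_filter insns[] = {
                { BPF_RET | BPF_K, 0, 0, 0xffffffff },
        };
        struct sock_fprog prog = {
                .len    = 1,
                .filter = insns,
        };

        return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
}
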
1753 static void tcp_v4_restore_cb(struct sk_buff *skb)
1754 {
1755         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1756                 sizeof(struct inet_skb_parm));
1757 }
1758 
1759 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1760                            const struct tcphdr *th)
1761 {
1762         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1763          * barrier() makes sure the compiler won't play fool^Waliasing games.
1764          */
1765         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1766                 sizeof(struct inet_skb_parm));
1767         barrier();
1768 
1769         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1770         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1771                                     skb->len - th->doff * 4);
1772         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1773         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1774         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1775         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1776         TCP_SKB_CB(skb)->sacked  = 0;
1777         TCP_SKB_CB(skb)->has_rxtstamp =
1778                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1779 }
1780 
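/*
 * Illustrative sketch (not part of this file) of the sequence-space
 * arithmetic in tcp_v4_fill_cb() above: SYN and FIN each consume one
 * sequence number on top of the payload bytes, so e.g. seq = 1000 with
 * 100 bytes of payload and FIN set gives end_seq = 1101.  The helper
 * name is hypothetical.
 */
static inline u32 tcp_end_seq_example(u32 seq, unsigned int payload_len,
                                      int syn, int fin)
{
        return seq + syn + fin + payload_len;
}
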
1781 /*
1782  *      From tcp_input.c
1783  */
1784 
1785 int tcp_v4_rcv(struct sk_buff *skb)
1786 {
1787         struct net *net = dev_net(skb->dev);
1788         int sdif = inet_sdif(skb);
1789         const struct iphdr *iph;
1790         const struct tcphdr *th;
1791         bool refcounted;
1792         struct sock *sk;
1793         int ret;
1794 
1795         if (skb->pkt_type != PACKET_HOST)
1796                 goto discard_it;
1797 
1798         /* Count it even if it's bad */
1799         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1800 
1801         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1802                 goto discard_it;
1803 
1804         th = (const struct tcphdr *)skb->data;
1805 
1806         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1807                 goto bad_packet;
1808         if (!pskb_may_pull(skb, th->doff * 4))
1809                 goto discard_it;
1810 
1811         /* An explanation is required here, I think.
1812          * Packet length and doff are validated by header prediction,
1813          * provided the case of th->doff == 0 is eliminated.
1814          * So, we defer the checks. */
1815 
1816         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1817                 goto csum_error;
1818 
1819         th = (const struct tcphdr *)skb->data;
1820         iph = ip_hdr(skb);
1821 lookup:
1822         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1823                                th->dest, sdif, &refcounted);
1824         if (!sk)
1825                 goto no_tcp_socket;
1826 
1827 process:
1828         if (sk->sk_state == TCP_TIME_WAIT)
1829                 goto do_time_wait;
1830 
1831         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1832                 struct request_sock *req = inet_reqsk(sk);
1833                 bool req_stolen = false;
1834                 struct sock *nsk;
1835 
1836                 sk = req->rsk_listener;
1837                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1838                         sk_drops_add(sk, skb);
1839                         reqsk_put(req);
1840                         goto discard_it;
1841                 }
1842                 if (tcp_checksum_complete(skb)) {
1843                         reqsk_put(req);
1844                         goto csum_error;
1845                 }
1846                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1847                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1848                         goto lookup;
1849                 }
1850                 /* We own a reference on the listener, increase it again
1851                  * as we might lose it too soon.
1852                  */
1853                 sock_hold(sk);
1854                 refcounted = true;
1855                 nsk = NULL;
1856                 if (!tcp_filter(sk, skb)) {
1857                         th = (const struct tcphdr *)skb->data;
1858                         iph = ip_hdr(skb);
1859                         tcp_v4_fill_cb(skb, iph, th);
1860                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1861                 }
1862                 if (!nsk) {
1863                         reqsk_put(req);
1864                         if (req_stolen) {
1865                                 /* Another cpu got exclusive access to req
1866                                  * and created a full blown socket.
1867                                  * Try to feed this packet to this socket
1868                                  * instead of discarding it.
1869                                  */
1870                                 tcp_v4_restore_cb(skb);
1871                                 sock_put(sk);
1872                                 goto lookup;
1873                         }
1874                         goto discard_and_relse;
1875                 }
1876                 if (nsk == sk) {
1877                         reqsk_put(req);
1878                         tcp_v4_restore_cb(skb);
1879                 } else if (tcp_child_process(sk, nsk, skb)) {
1880                         tcp_v4_send_reset(nsk, skb);
1881                         goto discard_and_relse;
1882                 } else {
1883                         sock_put(sk);
1884                         return 0;
1885                 }
1886         }
1887         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1888                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1889                 goto discard_and_relse;
1890         }
1891 
1892         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1893                 goto discard_and_relse;
1894 
1895         if (tcp_v4_inbound_md5_hash(sk, skb))
1896                 goto discard_and_relse;
1897 
1898         nf_reset(skb);
1899 
1900         if (tcp_filter(sk, skb))
1901                 goto discard_and_relse;
1902         th = (const struct tcphdr *)skb->data;
1903         iph = ip_hdr(skb);
1904         tcp_v4_fill_cb(skb, iph, th);
1905 
1906         skb->dev = NULL;
1907 
1908         if (sk->sk_state == TCP_LISTEN) {
1909                 ret = tcp_v4_do_rcv(sk, skb);
1910                 goto put_and_return;
1911         }
1912 
1913         sk_incoming_cpu_update(sk);
1914 
1915         bh_lock_sock_nested(sk);
1916         tcp_segs_in(tcp_sk(sk), skb);
1917         ret = 0;
1918         if (!sock_owned_by_user(sk)) {
1919                 ret = tcp_v4_do_rcv(sk, skb);
1920         } else if (tcp_add_backlog(sk, skb)) {
1921                 goto discard_and_relse;
1922         }
1923         bh_unlock_sock(sk);
1924 
1925 put_and_return:
1926         if (refcounted)
1927                 sock_put(sk);
1928 
1929         return ret;
1930 
1931 no_tcp_socket:
1932         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1933                 goto discard_it;
1934 
1935         tcp_v4_fill_cb(skb, iph, th);
1936 
1937         if (tcp_checksum_complete(skb)) {
1938 csum_error:
1939                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1940 bad_packet:
1941                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1942         } else {
1943                 tcp_v4_send_reset(NULL, skb);
1944         }
1945 
1946 discard_it:
1947         /* Discard frame. */
1948         kfree_skb(skb);
1949         return 0;
1950 
1951 discard_and_relse:
1952         sk_drops_add(sk, skb);
1953         if (refcounted)
1954                 sock_put(sk);
1955         goto discard_it;
1956 
1957 do_time_wait:
1958         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1959                 inet_twsk_put(inet_twsk(sk));
1960                 goto discard_it;
1961         }
1962 
1963         tcp_v4_fill_cb(skb, iph, th);
1964 
1965         if (tcp_checksum_complete(skb)) {
1966                 inet_twsk_put(inet_twsk(sk));
1967                 goto csum_error;
1968         }
1969         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1970         case TCP_TW_SYN: {
1971                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1972                                                         &tcp_hashinfo, skb,
1973                                                         __tcp_hdrlen(th),
1974                                                         iph->saddr, th->source,
1975                                                         iph->daddr, th->dest,
1976                                                         inet_iif(skb),
1977                                                         sdif);
1978                 if (sk2) {
1979                         inet_twsk_deschedule_put(inet_twsk(sk));
1980                         sk = sk2;
1981                         tcp_v4_restore_cb(skb);
1982                         refcounted = false;
1983                         goto process;
1984                 }
1985         }
1986                 /* to ACK */
1987                 /* fall through */
1988         case TCP_TW_ACK:
1989                 tcp_v4_timewait_ack(sk, skb);
1990                 break;
1991         case TCP_TW_RST:
1992                 tcp_v4_send_reset(sk, skb);
1993                 inet_twsk_deschedule_put(inet_twsk(sk));
1994                 goto discard_it;
1995         case TCP_TW_SUCCESS:;
1996         }
1997         goto discard_it;
1998 }
1999 
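/*
 * Illustrative userspace sketch (not part of this file): the min_ttl check in
 * tcp_v4_rcv() above (LINUX_MIB_TCPMINTTLDROP) is armed with the IP_MINTTL
 * socket option, e.g. for RFC 5082-style protection of a single-hop session.
 */
#include <netinet/in.h>
#include <sys/socket.h>

static int require_min_ttl(int fd)
{
        int min_ttl = 255;      /* accept only segments from direct neighbours */

        return setsockopt(fd, IPPROTO_IP, IP_MINTTL, &min_ttl, sizeof(min_ttl));
}
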
2000 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2001         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2002         .twsk_unique    = tcp_twsk_unique,
2003         .twsk_destructor= tcp_twsk_destructor,
2004 };
2005 
2006 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2007 {
2008         struct dst_entry *dst = skb_dst(skb);
2009 
2010         if (dst && dst_hold_safe(dst)) {
2011                 sk->sk_rx_dst = dst;
2012                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2013         }
2014 }
2015 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2016 
2017 const struct inet_connection_sock_af_ops ipv4_specific = {
2018         .queue_xmit        = ip_queue_xmit,
2019         .send_check        = tcp_v4_send_check,
2020         .rebuild_header    = inet_sk_rebuild_header,
2021         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2022         .conn_request      = tcp_v4_conn_request,
2023         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2024         .net_header_len    = sizeof(struct iphdr),
2025         .setsockopt        = ip_setsockopt,
2026         .getsockopt        = ip_getsockopt,
2027         .addr2sockaddr     = inet_csk_addr2sockaddr,
2028         .sockaddr_len      = sizeof(struct sockaddr_in),
2029 #ifdef CONFIG_COMPAT
2030         .compat_setsockopt = compat_ip_setsockopt,
2031         .compat_getsockopt = compat_ip_getsockopt,
2032 #endif
2033         .mtu_reduced       = tcp_v4_mtu_reduced,
2034 };
2035 EXPORT_SYMBOL(ipv4_specific);
2036 
2037 #ifdef CONFIG_TCP_MD5SIG
2038 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2039         .md5_lookup             = tcp_v4_md5_lookup,
2040         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2041         .md5_parse              = tcp_v4_parse_md5_keys,
2042 };
2043 #endif
2044 
2045 /* NOTE: A lot of things are set to zero explicitly by the call to
2046  *       sk_alloc(), so they need not be done here.
2047  */
2048 static int tcp_v4_init_sock(struct sock *sk)
2049 {
2050         struct inet_connection_sock *icsk = inet_csk(sk);
2051 
2052         tcp_init_sock(sk);
2053 
2054         icsk->icsk_af_ops = &ipv4_specific;
2055 
2056 #ifdef CONFIG_TCP_MD5SIG
2057         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2058 #endif
2059 
2060         return 0;
2061 }
2062 
2063 void tcp_v4_destroy_sock(struct sock *sk)
2064 {
2065         struct tcp_sock *tp = tcp_sk(sk);
2066 
2067         trace_tcp_destroy_sock(sk);
2068 
2069         tcp_clear_xmit_timers(sk);
2070 
2071         tcp_cleanup_congestion_control(sk);
2072 
2073         tcp_cleanup_ulp(sk);
2074 
2075         /* Clean up the write buffer. */
2076         tcp_write_queue_purge(sk);
2077 
2078         /* Check if we want to disable active TFO */
2079         tcp_fastopen_active_disable_ofo_check(sk);
2080 
2081         /* Cleans up our, hopefully empty, out_of_order_queue. */
2082         skb_rbtree_purge(&tp->out_of_order_queue);
2083 
2084 #ifdef CONFIG_TCP_MD5SIG
2085         /* Clean up the MD5 key list, if any */
2086         if (tp->md5sig_info) {
2087                 tcp_clear_md5_list(sk);
2088                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2089                 tp->md5sig_info = NULL;
2090         }
2091 #endif
2092 
2093         /* Clean up a referenced TCP bind bucket. */
2094         if (inet_csk(sk)->icsk_bind_hash)
2095                 inet_put_port(sk);
2096 
2097         BUG_ON(tp->fastopen_rsk);
2098 
2099         /* If socket is aborted during connect operation */
2100         tcp_free_fastopen_req(tp);
2101         tcp_fastopen_destroy_cipher(sk);
2102         tcp_saved_syn_free(tp);
2103 
2104         sk_sockets_allocated_dec(sk);
2105 }
2106 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2107 
2108 #ifdef CONFIG_PROC_FS
2109 /* Proc filesystem TCP sock list dumping. */
2110 
2111 /*
2112  * Get the next listener socket following cur.  If cur is NULL, get the first socket
2113  * starting from bucket given in st->bucket; when st->bucket is zero the
2114  * very first socket in the hash table is returned.
2115  */
2116 static void *listening_get_next(struct seq_file *seq, void *cur)
2117 {
2118         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2119         struct tcp_iter_state *st = seq->private;
2120         struct net *net = seq_file_net(seq);
2121         struct inet_listen_hashbucket *ilb;
2122         struct sock *sk = cur;
2123 
2124         if (!sk) {
2125 get_head:
2126                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2127                 spin_lock(&ilb->lock);
2128                 sk = sk_head(&ilb->head);
2129                 st->offset = 0;
2130                 goto get_sk;
2131         }
2132         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2133         ++st->num;
2134         ++st->offset;
2135 
2136         sk = sk_next(sk);
2137 get_sk:
2138         sk_for_each_from(sk) {
2139                 if (!net_eq(sock_net(sk), net))
2140                         continue;
2141                 if (sk->sk_family == afinfo->family)
2142                         return sk;
2143         }
2144         spin_unlock(&ilb->lock);
2145         st->offset = 0;
2146         if (++st->bucket < INET_LHTABLE_SIZE)
2147                 goto get_head;
2148         return NULL;
2149 }
2150 
2151 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2152 {
2153         struct tcp_iter_state *st = seq->private;
2154         void *rc;
2155 
2156         st->bucket = 0;
2157         st->offset = 0;
2158         rc = listening_get_next(seq, NULL);
2159 
2160         while (rc && *pos) {
2161                 rc = listening_get_next(seq, rc);
2162                 --*pos;
2163         }
2164         return rc;
2165 }
2166 
2167 static inline bool empty_bucket(const struct tcp_iter_state *st)
2168 {
2169         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2170 }
2171 
2172 /*
2173  * Get first established socket starting from bucket given in st->bucket.
2174  * If st->bucket is zero, the very first socket in the hash is returned.
2175  */
2176 static void *established_get_first(struct seq_file *seq)
2177 {
2178         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2179         struct tcp_iter_state *st = seq->private;
2180         struct net *net = seq_file_net(seq);
2181         void *rc = NULL;
2182 
2183         st->offset = 0;
2184         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2185                 struct sock *sk;
2186                 struct hlist_nulls_node *node;
2187                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2188 
2189                 /* Lockless fast path for the common case of empty buckets */
2190                 if (empty_bucket(st))
2191                         continue;
2192 
2193                 spin_lock_bh(lock);
2194                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2195                         if (sk->sk_family != afinfo->family ||
2196                             !net_eq(sock_net(sk), net)) {
2197                                 continue;
2198                         }
2199                         rc = sk;
2200                         goto out;
2201                 }
2202                 spin_unlock_bh(lock);
2203         }
2204 out:
2205         return rc;
2206 }
2207 
2208 static void *established_get_next(struct seq_file *seq, void *cur)
2209 {
2210         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2211         struct sock *sk = cur;
2212         struct hlist_nulls_node *node;
2213         struct tcp_iter_state *st = seq->private;
2214         struct net *net = seq_file_net(seq);
2215 
2216         ++st->num;
2217         ++st->offset;
2218 
2219         sk = sk_nulls_next(sk);
2220 
2221         sk_nulls_for_each_from(sk, node) {
2222                 if (sk->sk_family == afinfo->family &&
2223                     net_eq(sock_net(sk), net))
2224                         return sk;
2225         }
2226 
2227         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2228         ++st->bucket;
2229         return established_get_first(seq);
2230 }
2231 
2232 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2233 {
2234         struct tcp_iter_state *st = seq->private;
2235         void *rc;
2236 
2237         st->bucket = 0;
2238         rc = established_get_first(seq);
2239 
2240         while (rc && pos) {
2241                 rc = established_get_next(seq, rc);
2242                 --pos;
2243         }
2244         return rc;
2245 }
2246 
2247 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2248 {
2249         void *rc;
2250         struct tcp_iter_state *st = seq->private;
2251 
2252         st->state = TCP_SEQ_STATE_LISTENING;
2253         rc        = listening_get_idx(seq, &pos);
2254 
2255         if (!rc) {
2256                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2257                 rc        = established_get_idx(seq, pos);
2258         }
2259 
2260         return rc;
2261 }
2262 
2263 static void *tcp_seek_last_pos(struct seq_file *seq)
2264 {
2265         struct tcp_iter_state *st = seq->private;
2266         int offset = st->offset;
2267         int orig_num = st->num;
2268         void *rc = NULL;
2269 
2270         switch (st->state) {
2271         case TCP_SEQ_STATE_LISTENING:
2272                 if (st->bucket >= INET_LHTABLE_SIZE)
2273                         break;
2274                 st->state = TCP_SEQ_STATE_LISTENING;
2275                 rc = listening_get_next(seq, NULL);
2276                 while (offset-- && rc)
2277                         rc = listening_get_next(seq, rc);
2278                 if (rc)
2279                         break;
2280                 st->bucket = 0;
2281                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2282                 /* Fallthrough */
2283         case TCP_SEQ_STATE_ESTABLISHED:
2284                 if (st->bucket > tcp_hashinfo.ehash_mask)
2285                         break;
2286                 rc = established_get_first(seq);
2287                 while (offset-- && rc)
2288                         rc = established_get_next(seq, rc);
2289         }
2290 
2291         st->num = orig_num;
2292 
2293         return rc;
2294 }
2295 
2296 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2297 {
2298         struct tcp_iter_state *st = seq->private;
2299         void *rc;
2300 
2301         if (*pos && *pos == st->last_pos) {
2302                 rc = tcp_seek_last_pos(seq);
2303                 if (rc)
2304                         goto out;
2305         }
2306 
2307         st->state = TCP_SEQ_STATE_LISTENING;
2308         st->num = 0;
2309         st->bucket = 0;
2310         st->offset = 0;
2311         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2312 
2313 out:
2314         st->last_pos = *pos;
2315         return rc;
2316 }
2317 EXPORT_SYMBOL(tcp_seq_start);
2318 
2319 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2320 {
2321         struct tcp_iter_state *st = seq->private;
2322         void *rc = NULL;
2323 
2324         if (v == SEQ_START_TOKEN) {
2325                 rc = tcp_get_idx(seq, 0);
2326                 goto out;
2327         }
2328 
2329         switch (st->state) {
2330         case TCP_SEQ_STATE_LISTENING:
2331                 rc = listening_get_next(seq, v);
2332                 if (!rc) {
2333                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2334                         st->bucket = 0;
2335                         st->offset = 0;
2336                         rc        = established_get_first(seq);
2337                 }
2338                 break;
2339         case TCP_SEQ_STATE_ESTABLISHED:
2340                 rc = established_get_next(seq, v);
2341                 break;
2342         }
2343 out:
2344         ++*pos;
2345         st->last_pos = *pos;
2346         return rc;
2347 }
2348 EXPORT_SYMBOL(tcp_seq_next);
2349 
2350 void tcp_seq_stop(struct seq_file *seq, void *v)
2351 {
2352         struct tcp_iter_state *st = seq->private;
2353 
2354         switch (st->state) {
2355         case TCP_SEQ_STATE_LISTENING:
2356                 if (v != SEQ_START_TOKEN)
2357                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2358                 break;
2359         case TCP_SEQ_STATE_ESTABLISHED:
2360                 if (v)
2361                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2362                 break;
2363         }
2364 }
2365 EXPORT_SYMBOL(tcp_seq_stop);
2366 
2367 static void get_openreq4(const struct request_sock *req,
2368                          struct seq_file *f, int i)
2369 {
2370         const struct inet_request_sock *ireq = inet_rsk(req);
2371         long delta = req->rsk_timer.expires - jiffies;
2372 
2373         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2374                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2375                 i,
2376                 ireq->ir_loc_addr,
2377                 ireq->ir_num,
2378                 ireq->ir_rmt_addr,
2379                 ntohs(ireq->ir_rmt_port),
2380                 TCP_SYN_RECV,
2381                 0, 0, /* could print option size, but that is af dependent. */
2382                 1,    /* timers active (only the expire timer) */
2383                 jiffies_delta_to_clock_t(delta),
2384                 req->num_timeout,
2385                 from_kuid_munged(seq_user_ns(f),
2386                                  sock_i_uid(req->rsk_listener)),
2387                 0,  /* non standard timer */
2388                 0, /* open_requests have no inode */
2389                 0,
2390                 req);
2391 }
2392 
2393 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2394 {
2395         int timer_active;
2396         unsigned long timer_expires;
2397         const struct tcp_sock *tp = tcp_sk(sk);
2398         const struct inet_connection_sock *icsk = inet_csk(sk);
2399         const struct inet_sock *inet = inet_sk(sk);
2400         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2401         __be32 dest = inet->inet_daddr;
2402         __be32 src = inet->inet_rcv_saddr;
2403         __u16 destp = ntohs(inet->inet_dport);
2404         __u16 srcp = ntohs(inet->inet_sport);
2405         int rx_queue;
2406         int state;
2407 
2408         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2409             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2410             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2411                 timer_active    = 1;
2412                 timer_expires   = icsk->icsk_timeout;
2413         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2414                 timer_active    = 4;
2415                 timer_expires   = icsk->icsk_timeout;
2416         } else if (timer_pending(&sk->sk_timer)) {
2417                 timer_active    = 2;
2418                 timer_expires   = sk->sk_timer.expires;
2419         } else {
2420                 timer_active    = 0;
2421                 timer_expires = jiffies;
2422         }
2423 
2424         state = inet_sk_state_load(sk);
2425         if (state == TCP_LISTEN)
2426                 rx_queue = sk->sk_ack_backlog;
2427         else
2428                 /* Because we don't lock the socket,
2429                  * we might find a transient negative value.
2430                  */
2431                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2432 
2433         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2434                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2435                 i, src, srcp, dest, destp, state,
2436                 tp->write_seq - tp->snd_una,
2437                 rx_queue,
2438                 timer_active,
2439                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2440                 icsk->icsk_retransmits,
2441                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2442                 icsk->icsk_probes_out,
2443                 sock_i_ino(sk),
2444                 refcount_read(&sk->sk_refcnt), sk,
2445                 jiffies_to_clock_t(icsk->icsk_rto),
2446                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2447                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2448                 tp->snd_cwnd,
2449                 state == TCP_LISTEN ?
2450                     fastopenq->max_qlen :
2451                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2452 }
2453 
2454 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2455                                struct seq_file *f, int i)
2456 {
2457         long delta = tw->tw_timer.expires - jiffies;
2458         __be32 dest, src;
2459         __u16 destp, srcp;
2460 
2461         dest  = tw->tw_daddr;
2462         src   = tw->tw_rcv_saddr;
2463         destp = ntohs(tw->tw_dport);
2464         srcp  = ntohs(tw->tw_sport);
2465 
2466         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2467                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2468                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2469                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2470                 refcount_read(&tw->tw_refcnt), tw);
2471 }
2472 
2473 #define TMPSZ 150
2474 
2475 static int tcp4_seq_show(struct seq_file *seq, void *v)
2476 {
2477         struct tcp_iter_state *st;
2478         struct sock *sk = v;
2479 
2480         seq_setwidth(seq, TMPSZ - 1);
2481         if (v == SEQ_START_TOKEN) {
2482                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2483                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2484                            "inode");
2485                 goto out;
2486         }
2487         st = seq->private;
2488 
2489         if (sk->sk_state == TCP_TIME_WAIT)
2490                 get_timewait4_sock(v, seq, st->num);
2491         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2492                 get_openreq4(v, seq, st->num);
2493         else
2494                 get_tcp4_sock(v, seq, st->num);
2495 out:
2496         seq_pad(seq, '\n');
2497         return 0;
2498 }
2499 
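/*
 * Illustrative userspace sketch (not part of this file): the records emitted
 * by tcp4_seq_show()/get_tcp4_sock() above are what a reader of /proc/net/tcp
 * sees; this only parses the hex address:port columns and the state field.
 */
#include <stdio.h>

static void dump_tcp4_sockets(void)
{
        char line[256];
        FILE *f = fopen("/proc/net/tcp", "r");

        if (!f)
                return;
        /* skip the header line printed for SEQ_START_TOKEN */
        if (!fgets(line, sizeof(line), f)) {
                fclose(f);
                return;
        }
        while (fgets(line, sizeof(line), f)) {
                unsigned int sl, lip, lport, rip, rport, state;

                if (sscanf(line, "%u: %x:%x %x:%x %x",
                           &sl, &lip, &lport, &rip, &rport, &state) == 6)
                        printf("%08X:%04X -> %08X:%04X st %02X\n",
                               lip, lport, rip, rport, state);
        }
        fclose(f);
}
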
2500 static const struct seq_operations tcp4_seq_ops = {
2501         .show           = tcp4_seq_show,
2502         .start          = tcp_seq_start,
2503         .next           = tcp_seq_next,
2504         .stop           = tcp_seq_stop,
2505 };
2506 
2507 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2508         .family         = AF_INET,
2509 };
2510 
2511 static int __net_init tcp4_proc_init_net(struct net *net)
2512 {
2513         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2514                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2515                 return -ENOMEM;
2516         return 0;
2517 }
2518 
2519 static void __net_exit tcp4_proc_exit_net(struct net *net)
2520 {
2521         remove_proc_entry("tcp", net->proc_net);
2522 }
2523 
2524 static struct pernet_operations tcp4_net_ops = {
2525         .init = tcp4_proc_init_net,
2526         .exit = tcp4_proc_exit_net,
2527 };
2528 
2529 int __init tcp4_proc_init(void)
2530 {
2531         return register_pernet_subsys(&tcp4_net_ops);
2532 }
2533 
2534 void tcp4_proc_exit(void)
2535 {
2536         unregister_pernet_subsys(&tcp4_net_ops);
2537 }
2538 #endif /* CONFIG_PROC_FS */
2539 
2540 struct proto tcp_prot = {
2541         .name                   = "TCP",
2542         .owner                  = THIS_MODULE,
2543         .close                  = tcp_close,
2544         .pre_connect            = tcp_v4_pre_connect,
2545         .connect                = tcp_v4_connect,
2546         .disconnect             = tcp_disconnect,
2547         .accept                 = inet_csk_accept,
2548         .ioctl                  = tcp_ioctl,
2549         .init                   = tcp_v4_init_sock,
2550         .destroy                = tcp_v4_destroy_sock,
2551         .shutdown               = tcp_shutdown,
2552         .setsockopt             = tcp_setsockopt,
2553         .getsockopt             = tcp_getsockopt,
2554         .keepalive              = tcp_set_keepalive,
2555         .recvmsg                = tcp_recvmsg,
2556         .sendmsg                = tcp_sendmsg,
2557         .sendpage               = tcp_sendpage,
2558         .backlog_rcv            = tcp_v4_do_rcv,
2559         .release_cb             = tcp_release_cb,
2560         .hash                   = inet_hash,
2561         .unhash                 = inet_unhash,
2562         .get_port               = inet_csk_get_port,
2563         .enter_memory_pressure  = tcp_enter_memory_pressure,
2564         .leave_memory_pressure  = tcp_leave_memory_pressure,
2565         .stream_memory_free     = tcp_stream_memory_free,
2566         .sockets_allocated      = &tcp_sockets_allocated,
2567         .orphan_count           = &tcp_orphan_count,
2568         .memory_allocated       = &tcp_memory_allocated,
2569         .memory_pressure        = &tcp_memory_pressure,
2570         .sysctl_mem             = sysctl_tcp_mem,
2571         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2572         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2573         .max_header             = MAX_TCP_HEADER,
2574         .obj_size               = sizeof(struct tcp_sock),
2575         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2576         .twsk_prot              = &tcp_timewait_sock_ops,
2577         .rsk_prot               = &tcp_request_sock_ops,
2578         .h.hashinfo             = &tcp_hashinfo,
2579         .no_autobind            = true,
2580 #ifdef CONFIG_COMPAT
2581         .compat_setsockopt      = compat_tcp_setsockopt,
2582         .compat_getsockopt      = compat_tcp_getsockopt,
2583 #endif
2584         .diag_destroy           = tcp_abort,
2585 };
2586 EXPORT_SYMBOL(tcp_prot);
2587 
2588 static void __net_exit tcp_sk_exit(struct net *net)
2589 {
2590         int cpu;
2591 
2592         if (net->ipv4.tcp_congestion_control)
2593                 module_put(net->ipv4.tcp_congestion_control->owner);
2594 
2595         for_each_possible_cpu(cpu)
2596                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2597         free_percpu(net->ipv4.tcp_sk);
2598 }
2599 
2600 static int __net_init tcp_sk_init(struct net *net)
2601 {
2602         int res, cpu, cnt;
2603 
2604         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2605         if (!net->ipv4.tcp_sk)
2606                 return -ENOMEM;
2607 
2608         for_each_possible_cpu(cpu) {
2609                 struct sock *sk;
2610 
2611                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2612                                            IPPROTO_TCP, net);
2613                 if (res)
2614                         goto fail;
2615                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2616 
2617                 /* Please enforce IP_DF and IPID==0 for RST and
2618                  * ACK sent in SYN-RECV and TIME-WAIT state.
2619                  */
2620                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2621 
2622                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2623         }
2624 
2625         net->ipv4.sysctl_tcp_ecn = 2;
2626         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2627 
2628         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2629         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2630         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2631 
2632         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2633         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2634         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2635 
2636         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2637         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2638         net->ipv4.sysctl_tcp_syncookies = 1;
2639         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2640         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2641         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2642         net->ipv4.sysctl_tcp_orphan_retries = 0;
2643         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2644         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2645         net->ipv4.sysctl_tcp_tw_reuse = 2;
2646 
2647         cnt = tcp_hashinfo.ehash_mask + 1;
2648         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2649         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2650 
2651         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2652         net->ipv4.sysctl_tcp_sack = 1;
2653         net->ipv4.sysctl_tcp_window_scaling = 1;
2654         net->ipv4.sysctl_tcp_timestamps = 1;
2655         net->ipv4.sysctl_tcp_early_retrans = 3;
2656         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2657         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2658         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2659         net->ipv4.sysctl_tcp_max_reordering = 300;
2660         net->ipv4.sysctl_tcp_dsack = 1;
2661         net->ipv4.sysctl_tcp_app_win = 31;
2662         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2663         net->ipv4.sysctl_tcp_frto = 2;
2664         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2665         /* This limits the percentage of the congestion window which we
2666          * will allow a single TSO frame to consume.  Building TSO frames
2667          * which are too large can cause TCP streams to be bursty.
2668          */
2669         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2670         /* Default TSQ limit of 16 TSO segments */
2671         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2672         /* rfc5961 challenge ack rate limiting */
2673         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2674         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2675         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2676         net->ipv4.sysctl_tcp_autocorking = 1;
2677         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2678         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2679         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2680         if (net != &init_net) {
2681                 memcpy(net->ipv4.sysctl_tcp_rmem,
2682                        init_net.ipv4.sysctl_tcp_rmem,
2683                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2684                 memcpy(net->ipv4.sysctl_tcp_wmem,
2685                        init_net.ipv4.sysctl_tcp_wmem,
2686                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2687         }
2688         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2689         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2690         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2691         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2692         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2693         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2694 
2695         /* Reno is always built in */
2696         if (!net_eq(net, &init_net) &&
2697             try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2698                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2699         else
2700                 net->ipv4.tcp_congestion_control = &tcp_reno;
2701 
2702         return 0;
2703 fail:
2704         tcp_sk_exit(net);
2705 
2706         return res;
2707 }
2708 
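/*
 * Illustrative userspace sketch (not part of this file): the per-netns
 * defaults installed by tcp_sk_init() above surface under
 * /proc/sys/net/ipv4/; overriding one of them (tcp_fin_timeout here) is a
 * plain write to that file.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int set_tcp_fin_timeout(int seconds)
{
        char buf[16];
        int fd = open("/proc/sys/net/ipv4/tcp_fin_timeout", O_WRONLY);
        int len;
        ssize_t ret;

        if (fd < 0)
                return -1;
        len = snprintf(buf, sizeof(buf), "%d\n", seconds);
        ret = write(fd, buf, len);
        close(fd);
        return ret == len ? 0 : -1;
}
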
2709 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2710 {
2711         struct net *net;
2712 
2713         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2714 
2715         list_for_each_entry(net, net_exit_list, exit_list)
2716                 tcp_fastopen_ctx_destroy(net);
2717 }
2718 
2719 static struct pernet_operations __net_initdata tcp_sk_ops = {
2720        .init       = tcp_sk_init,
2721        .exit       = tcp_sk_exit,
2722        .exit_batch = tcp_sk_exit_batch,
2723 };
2724 
2725 void __init tcp_v4_init(void)
2726 {
2727         if (register_pernet_subsys(&tcp_sk_ops))
2728                 panic("Failed to create the TCP control socket.\n");
2729 }
2730 
