TOMOYO Linux Cross Reference
Linux/net/ipv4/tcp_ipv4.c

  1 // SPDX-License-Identifier: GPL-2.0-or-later
  2 /*
  3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  4  *              operating system.  INET is implemented using the  BSD Socket
  5  *              interface as the means of communication with the user level.
  6  *
  7  *              Implementation of the Transmission Control Protocol(TCP).
  8  *
  9  *              IPv4 specific functions
 10  *
 11  *              code split from:
 12  *              linux/ipv4/tcp.c
 13  *              linux/ipv4/tcp_input.c
 14  *              linux/ipv4/tcp_output.c
 15  *
 16  *              See tcp.c for author information
 17  */
 18 
 19 /*
 20  * Changes:
 21  *              David S. Miller :       New socket lookup architecture.
 22  *                                      This code is dedicated to John Dyson.
 23  *              David S. Miller :       Change semantics of established hash,
 24  *                                      half is devoted to TIME_WAIT sockets
 25  *                                      and the rest go in the other half.
 26  *              Andi Kleen :            Add support for syncookies and fixed
 27  *                                      some bugs: ip options weren't passed to
 28  *                                      the TCP layer, missed a check for an
 29  *                                      ACK bit.
 30  *              Andi Kleen :            Implemented fast path mtu discovery.
 31  *                                      Fixed many serious bugs in the
 32  *                                      request_sock handling and moved
 33  *                                      most of it into the af independent code.
 34  *                                      Added tail drop and some other bugfixes.
 35  *                                      Added new listen semantics.
 36  *              Mike McLagan    :       Routing by source
 37  *      Juan Jose Ciarlante:            ip_dynaddr bits
 38  *              Andi Kleen:             various fixes.
 39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 40  *                                      coma.
 41  *      Andi Kleen              :       Fix new listen.
 42  *      Andi Kleen              :       Fix accept error reporting.
 43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  44  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
 45  *                                      a single port at the same time.
 46  */
 47 
 48 #define pr_fmt(fmt) "TCP: " fmt
 49 
 50 #include <linux/bottom_half.h>
 51 #include <linux/types.h>
 52 #include <linux/fcntl.h>
 53 #include <linux/module.h>
 54 #include <linux/random.h>
 55 #include <linux/cache.h>
 56 #include <linux/jhash.h>
 57 #include <linux/init.h>
 58 #include <linux/times.h>
 59 #include <linux/slab.h>
 60 
 61 #include <net/net_namespace.h>
 62 #include <net/icmp.h>
 63 #include <net/inet_hashtables.h>
 64 #include <net/tcp.h>
 65 #include <net/transp_v6.h>
 66 #include <net/ipv6.h>
 67 #include <net/inet_common.h>
 68 #include <net/timewait_sock.h>
 69 #include <net/xfrm.h>
 70 #include <net/secure_seq.h>
 71 #include <net/busy_poll.h>
 72 
 73 #include <linux/inet.h>
 74 #include <linux/ipv6.h>
 75 #include <linux/stddef.h>
 76 #include <linux/proc_fs.h>
 77 #include <linux/seq_file.h>
 78 #include <linux/inetdevice.h>
 79 #include <linux/btf_ids.h>
 80 
 81 #include <crypto/hash.h>
 82 #include <linux/scatterlist.h>
 83 
 84 #include <trace/events/tcp.h>
 85 
 86 #ifdef CONFIG_TCP_MD5SIG
 87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
 88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
 89 #endif
 90 
 91 struct inet_hashinfo tcp_hashinfo;
 92 EXPORT_SYMBOL(tcp_hashinfo);
 93 
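/* Derive the initial sequence number and the per-connection timestamp offset
 * from the addresses and ports carried in the received SYN, using the keyed
 * hashes implemented in net/core/secure_seq.c.
 */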
 94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
 95 {
 96         return secure_tcp_seq(ip_hdr(skb)->daddr,
 97                               ip_hdr(skb)->saddr,
 98                               tcp_hdr(skb)->dest,
 99                               tcp_hdr(skb)->source);
100 }
101 
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106 
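/* Decide whether a TIME-WAIT socket that collides with a new connection's
 * 4-tuple may be reused for it.  Called from the connect() path when the
 * chosen port/hash slot is occupied (see __inet_check_established(), via
 * twsk_unique()).  The policy comes from the net.ipv4.tcp_tw_reuse sysctl:
 * 0 never reuses, 1 allows reuse when the timestamp check below permits it,
 * and 2 additionally restricts reuse to loopback traffic, which is checked
 * first.
 */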
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109         const struct inet_timewait_sock *tw = inet_twsk(sktw);
110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111         struct tcp_sock *tp = tcp_sk(sk);
112         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113 
114         if (reuse == 2) {
115                 /* Still does not detect *everything* that goes through
116                  * lo, since we require a loopback src or dst address
117                  * or direct binding to 'lo' interface.
118                  */
119                 bool loopback = false;
120                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121                         loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123                 if (tw->tw_family == AF_INET6) {
124                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128                                 loopback = true;
129                 } else
130 #endif
131                 {
132                         if (ipv4_is_loopback(tw->tw_daddr) ||
133                             ipv4_is_loopback(tw->tw_rcv_saddr))
134                                 loopback = true;
135                 }
136                 if (!loopback)
137                         reuse = 0;
138         }
139 
140         /* With PAWS, it is safe from the viewpoint
141            of data integrity. Even without PAWS it is safe provided sequence
142            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143 
 144            Actually, the idea is close to VJ's, only the timestamp cache is
 145            held not per host but per port pair, and the TW bucket is used as
 146            the state holder.
 147 
 148            If the TW bucket has already been destroyed we fall back to VJ's
 149            scheme and use the initial timestamp retrieved from the peer table.
150          */
151         if (tcptw->tw_ts_recent_stamp &&
152             (!twp || (reuse && time_after32(ktime_get_seconds(),
153                                             tcptw->tw_ts_recent_stamp)))) {
154                 /* In case of repair and re-using TIME-WAIT sockets we still
155                  * want to be sure that it is safe as above but honor the
156                  * sequence numbers and time stamps set as part of the repair
157                  * process.
158                  *
159                  * Without this check re-using a TIME-WAIT socket with TCP
160                  * repair would accumulate a -1 on the repair assigned
161                  * sequence number. The first time it is reused the sequence
162                  * is -1, the second time -2, etc. This fixes that issue
163                  * without appearing to create any others.
164                  */
165                 if (likely(!tp->repair)) {
166                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167 
168                         if (!seq)
169                                 seq = 1;
170                         WRITE_ONCE(tp->write_seq, seq);
171                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
172                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173                 }
174                 sock_hold(sktw);
175                 return 1;
176         }
177 
178         return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181 
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183                               int addr_len)
184 {
185         /* This check is replicated from tcp_v4_connect() and intended to
 186          * prevent the BPF program called below from accessing bytes that are
 187          * outside the bounds specified by the user in addr_len.
188          */
189         if (addr_len < sizeof(struct sockaddr_in))
190                 return -EINVAL;
191 
192         sock_owned_by_me(sk);
193 
194         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196 
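/* Illustrative userspace sketch (not part of the original file): the function
 * below is what a connect() on an AF_INET stream socket reaches through
 * inet_stream_connect():
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */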
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201         struct inet_sock *inet = inet_sk(sk);
202         struct tcp_sock *tp = tcp_sk(sk);
203         __be16 orig_sport, orig_dport;
204         __be32 daddr, nexthop;
205         struct flowi4 *fl4;
206         struct rtable *rt;
207         int err;
208         struct ip_options_rcu *inet_opt;
209         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210 
211         if (addr_len < sizeof(struct sockaddr_in))
212                 return -EINVAL;
213 
214         if (usin->sin_family != AF_INET)
215                 return -EAFNOSUPPORT;
216 
217         nexthop = daddr = usin->sin_addr.s_addr;
218         inet_opt = rcu_dereference_protected(inet->inet_opt,
219                                              lockdep_sock_is_held(sk));
220         if (inet_opt && inet_opt->opt.srr) {
221                 if (!daddr)
222                         return -EINVAL;
223                 nexthop = inet_opt->opt.faddr;
224         }
225 
226         orig_sport = inet->inet_sport;
227         orig_dport = usin->sin_port;
228         fl4 = &inet->cork.fl.u.ip4;
229         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231                               IPPROTO_TCP,
232                               orig_sport, orig_dport, sk);
233         if (IS_ERR(rt)) {
234                 err = PTR_ERR(rt);
235                 if (err == -ENETUNREACH)
236                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237                 return err;
238         }
239 
240         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241                 ip_rt_put(rt);
242                 return -ENETUNREACH;
243         }
244 
245         if (!inet_opt || !inet_opt->opt.srr)
246                 daddr = fl4->daddr;
247 
248         if (!inet->inet_saddr)
249                 inet->inet_saddr = fl4->saddr;
250         sk_rcv_saddr_set(sk, inet->inet_saddr);
251 
252         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253                 /* Reset inherited state */
254                 tp->rx_opt.ts_recent       = 0;
255                 tp->rx_opt.ts_recent_stamp = 0;
256                 if (likely(!tp->repair))
257                         WRITE_ONCE(tp->write_seq, 0);
258         }
259 
260         inet->inet_dport = usin->sin_port;
261         sk_daddr_set(sk, daddr);
262 
263         inet_csk(sk)->icsk_ext_hdr_len = 0;
264         if (inet_opt)
265                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266 
267         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268 
269         /* Socket identity is still unknown (sport may be zero).
 270          * However we set state to SYN-SENT and, without releasing the socket
 271          * lock, select a source port, enter ourselves into the hash tables and
272          * complete initialization after this.
273          */
274         tcp_set_state(sk, TCP_SYN_SENT);
275         err = inet_hash_connect(tcp_death_row, sk);
276         if (err)
277                 goto failure;
278 
279         sk_set_txhash(sk);
280 
281         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282                                inet->inet_sport, inet->inet_dport, sk);
283         if (IS_ERR(rt)) {
284                 err = PTR_ERR(rt);
285                 rt = NULL;
286                 goto failure;
287         }
288         /* OK, now commit destination to socket.  */
289         sk->sk_gso_type = SKB_GSO_TCPV4;
290         sk_setup_caps(sk, &rt->dst);
291         rt = NULL;
292 
293         if (likely(!tp->repair)) {
294                 if (!tp->write_seq)
295                         WRITE_ONCE(tp->write_seq,
296                                    secure_tcp_seq(inet->inet_saddr,
297                                                   inet->inet_daddr,
298                                                   inet->inet_sport,
299                                                   usin->sin_port));
300                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301                                                  inet->inet_saddr,
302                                                  inet->inet_daddr);
303         }
304 
305         inet->inet_id = prandom_u32();
306 
307         if (tcp_fastopen_defer_connect(sk, &err))
308                 return err;
309         if (err)
310                 goto failure;
311 
312         err = tcp_connect(sk);
313 
314         if (err)
315                 goto failure;
316 
317         return 0;
318 
319 failure:
320         /*
321          * This unhashes the socket and releases the local port,
322          * if necessary.
323          */
324         tcp_set_state(sk, TCP_CLOSE);
325         ip_rt_put(rt);
326         sk->sk_route_caps = 0;
327         inet->inet_dport = 0;
328         return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
331 
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 334  * It can be called through tcp_release_cb() if the socket was owned by the user
 335  * at the time tcp_v4_err() was called to handle the ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339         struct inet_sock *inet = inet_sk(sk);
340         struct dst_entry *dst;
341         u32 mtu;
342 
343         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344                 return;
345         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
346         dst = inet_csk_update_pmtu(sk, mtu);
347         if (!dst)
348                 return;
349 
 350         /* Something is about to go wrong... Remember the soft error
 351          * in case this connection is not able to recover.
352          */
353         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354                 sk->sk_err_soft = EMSGSIZE;
355 
356         mtu = dst_mtu(dst);
357 
358         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359             ip_sk_accept_pmtu(sk) &&
360             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361                 tcp_sync_mss(sk, mtu);
362 
363                 /* Resend the TCP packet because it's
364                  * clear that the old packet has been
365                  * dropped. This is the new "fast" path mtu
366                  * discovery.
367                  */
368                 tcp_simple_retransmit(sk);
369         } /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372 
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375         struct dst_entry *dst = __sk_dst_check(sk, 0);
376 
377         if (dst)
378                 dst->ops->redirect(dst, sk, skb);
379 }
380 
381 
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385         struct request_sock *req = inet_reqsk(sk);
386         struct net *net = sock_net(sk);
387 
388         /* ICMPs are not backlogged, hence we cannot get
389          * an established socket here.
390          */
391         if (seq != tcp_rsk(req)->snt_isn) {
392                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393         } else if (abort) {
394                 /*
395                  * Still in SYN_RECV, just remove it silently.
396                  * There is no good way to pass the error to the newly
397                  * created socket, and POSIX does not want network
398                  * errors returned from accept().
399                  */
400                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401                 tcp_listendrop(req->rsk_listener);
402         }
403         reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406 
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410         struct inet_connection_sock *icsk = inet_csk(sk);
411         struct tcp_sock *tp = tcp_sk(sk);
412         struct sk_buff *skb;
413         s32 remaining;
414         u32 delta_us;
415 
416         if (sock_owned_by_user(sk))
417                 return;
418 
419         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420             !icsk->icsk_backoff)
421                 return;
422 
423         skb = tcp_rtx_queue_head(sk);
424         if (WARN_ON_ONCE(!skb))
425                 return;
426 
427         icsk->icsk_backoff--;
428         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430 
431         tcp_mstamp_refresh(tp);
432         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434 
435         if (remaining > 0) {
436                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437                                           remaining, TCP_RTO_MAX);
438         } else {
439                 /* RTO revert clocked out retransmission.
440                  * Will retransmit now.
441                  */
442                 tcp_retransmit_timer(sk);
443         }
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446 
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment
452  * header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else accesses the socket the ICMP is just dropped
457  * and for some paths there is no check at all.
458  * A more general error queue to queue errors for later handling
459  * is probably better.
460  *
461  */
462 
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465         const struct iphdr *iph = (const struct iphdr *)skb->data;
466         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467         struct tcp_sock *tp;
468         struct inet_sock *inet;
469         const int type = icmp_hdr(skb)->type;
470         const int code = icmp_hdr(skb)->code;
471         struct sock *sk;
472         struct request_sock *fastopen;
473         u32 seq, snd_una;
474         int err;
475         struct net *net = dev_net(skb->dev);
476 
477         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478                                        th->dest, iph->saddr, ntohs(th->source),
479                                        inet_iif(skb), 0);
480         if (!sk) {
481                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482                 return -ENOENT;
483         }
484         if (sk->sk_state == TCP_TIME_WAIT) {
485                 inet_twsk_put(inet_twsk(sk));
486                 return 0;
487         }
488         seq = ntohl(th->seq);
489         if (sk->sk_state == TCP_NEW_SYN_RECV) {
490                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491                                      type == ICMP_TIME_EXCEEDED ||
492                                      (type == ICMP_DEST_UNREACH &&
493                                       (code == ICMP_NET_UNREACH ||
494                                        code == ICMP_HOST_UNREACH)));
495                 return 0;
496         }
497 
498         bh_lock_sock(sk);
499         /* If too many ICMPs get dropped on busy
500          * servers this needs to be solved differently.
 501          * We do take care of the PMTU discovery (RFC1191) special case:
 502          * we can receive locally generated ICMP messages while the socket is held.
503          */
504         if (sock_owned_by_user(sk)) {
505                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507         }
508         if (sk->sk_state == TCP_CLOSE)
509                 goto out;
510 
511         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513                 goto out;
514         }
515 
516         tp = tcp_sk(sk);
517         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
518         fastopen = rcu_dereference(tp->fastopen_rsk);
519         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520         if (sk->sk_state != TCP_LISTEN &&
521             !between(seq, snd_una, tp->snd_nxt)) {
522                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523                 goto out;
524         }
525 
526         switch (type) {
527         case ICMP_REDIRECT:
528                 if (!sock_owned_by_user(sk))
529                         do_redirect(skb, sk);
530                 goto out;
531         case ICMP_SOURCE_QUENCH:
532                 /* Just silently ignore these. */
533                 goto out;
534         case ICMP_PARAMETERPROB:
535                 err = EPROTO;
536                 break;
537         case ICMP_DEST_UNREACH:
538                 if (code > NR_ICMP_UNREACH)
539                         goto out;
540 
541                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542                         /* We are not interested in TCP_LISTEN and open_requests
 543                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
544                          * they should go through unfragmented).
545                          */
546                         if (sk->sk_state == TCP_LISTEN)
547                                 goto out;
548 
549                         WRITE_ONCE(tp->mtu_info, info);
550                         if (!sock_owned_by_user(sk)) {
551                                 tcp_v4_mtu_reduced(sk);
552                         } else {
553                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554                                         sock_hold(sk);
555                         }
556                         goto out;
557                 }
558 
559                 err = icmp_err_convert[code].errno;
560                 /* check if this ICMP message allows revert of backoff.
561                  * (see RFC 6069)
562                  */
563                 if (!fastopen &&
564                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565                         tcp_ld_RTO_revert(sk, seq);
566                 break;
567         case ICMP_TIME_EXCEEDED:
568                 err = EHOSTUNREACH;
569                 break;
570         default:
571                 goto out;
572         }
573 
574         switch (sk->sk_state) {
575         case TCP_SYN_SENT:
576         case TCP_SYN_RECV:
577                 /* Only in fast or simultaneous open. If a fast open socket is
578                  * already accepted it is treated as a connected one below.
579                  */
580                 if (fastopen && !fastopen->sk)
581                         break;
582 
583                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584 
585                 if (!sock_owned_by_user(sk)) {
586                         sk->sk_err = err;
587 
588                         sk_error_report(sk);
589 
590                         tcp_done(sk);
591                 } else {
592                         sk->sk_err_soft = err;
593                 }
594                 goto out;
595         }
596 
597         /* If we've already connected we will keep trying
598          * until we time out, or the user gives up.
599          *
 600          * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
 601          * considered hard errors (well, FRAG_FAILED too,
 602          * but it is obsoleted by pmtu discovery).
 603          *
 604          * Note that in the modern internet, where routing is unreliable
 605          * and broken firewalls sit in every dark corner sending random
 606          * errors on their masters' orders, even these two messages have lost
 607          * their original sense (even Linux sends invalid PORT_UNREACHs).
608          *
609          * Now we are in compliance with RFCs.
610          *                                                      --ANK (980905)
611          */
612 
613         inet = inet_sk(sk);
614         if (!sock_owned_by_user(sk) && inet->recverr) {
615                 sk->sk_err = err;
616                 sk_error_report(sk);
617         } else  { /* Only an error on timeout */
618                 sk->sk_err_soft = err;
619         }
620 
621 out:
622         bh_unlock_sock(sk);
623         sock_put(sk);
624         return 0;
625 }
626 
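/* Checksum helpers: store the complemented pseudo-header sum in th->check and
 * point csum_start/csum_offset at the TCP checksum field, so that either the
 * NIC (CHECKSUM_PARTIAL offload) or the software fallback completes the sum
 * over the TCP header and payload.
 */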
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 {
629         struct tcphdr *th = tcp_hdr(skb);
630 
631         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632         skb->csum_start = skb_transport_header(skb) - skb->head;
633         skb->csum_offset = offsetof(struct tcphdr, check);
634 }
635 
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 {
639         const struct inet_sock *inet = inet_sk(sk);
640 
641         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 }
643 EXPORT_SYMBOL(tcp_v4_send_check);
644 
645 /*
646  *      This routine will send an RST to the other tcp.
647  *
 648  *      Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
 649  *                    for the reset?
 650  *      Answer: if a packet caused an RST, it was not destined for a socket
 651  *              existing in our system; if it matched a socket,
 652  *              it is just a duplicate segment or a bug in the other side's TCP.
 653  *              So we build the reply based only on parameters
 654  *              that arrived with the segment.
655  *      Exception: precedence violation. We do not implement it in any case.
656  */
657 
658 #ifdef CONFIG_TCP_MD5SIG
659 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
660 #else
661 #define OPTION_BYTES sizeof(__be32)
662 #endif
663 
664 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
665 {
666         const struct tcphdr *th = tcp_hdr(skb);
667         struct {
668                 struct tcphdr th;
669                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
670         } rep;
671         struct ip_reply_arg arg;
672 #ifdef CONFIG_TCP_MD5SIG
673         struct tcp_md5sig_key *key = NULL;
674         const __u8 *hash_location = NULL;
675         unsigned char newhash[16];
676         int genhash;
677         struct sock *sk1 = NULL;
678 #endif
679         u64 transmit_time = 0;
680         struct sock *ctl_sk;
681         struct net *net;
682 
683         /* Never send a reset in response to a reset. */
684         if (th->rst)
685                 return;
686 
 687         /* If sk is not NULL, it means we did a successful lookup and the
 688          * incoming route had to be correct. prequeue might have dropped our dst.
689          */
690         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
691                 return;
692 
693         /* Swap the send and the receive. */
694         memset(&rep, 0, sizeof(rep));
695         rep.th.dest   = th->source;
696         rep.th.source = th->dest;
697         rep.th.doff   = sizeof(struct tcphdr) / 4;
698         rep.th.rst    = 1;
699 
700         if (th->ack) {
701                 rep.th.seq = th->ack_seq;
702         } else {
703                 rep.th.ack = 1;
704                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
705                                        skb->len - (th->doff << 2));
706         }
707 
708         memset(&arg, 0, sizeof(arg));
709         arg.iov[0].iov_base = (unsigned char *)&rep;
710         arg.iov[0].iov_len  = sizeof(rep.th);
711 
712         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
713 #ifdef CONFIG_TCP_MD5SIG
714         rcu_read_lock();
715         hash_location = tcp_parse_md5sig_option(th);
716         if (sk && sk_fullsock(sk)) {
717                 const union tcp_md5_addr *addr;
718                 int l3index;
719 
 720                 /* sdif set means the packet ingressed via a device
721                  * in an L3 domain and inet_iif is set to it.
722                  */
723                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
724                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
725                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
726         } else if (hash_location) {
727                 const union tcp_md5_addr *addr;
728                 int sdif = tcp_v4_sdif(skb);
729                 int dif = inet_iif(skb);
730                 int l3index;
731 
732                 /*
 733                  * The active side is lost. Try to find the listening socket
 734                  * through the source port, and then find the md5 key through
 735                  * the listening socket. We do not lose any security here:
 736                  * the incoming packet is checked against the md5 hash of the
 737                  * key we find, and no RST is generated if the hash doesn't match.
738                  */
739                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
740                                              ip_hdr(skb)->saddr,
741                                              th->source, ip_hdr(skb)->daddr,
742                                              ntohs(th->source), dif, sdif);
743                 /* don't send rst if it can't find key */
744                 if (!sk1)
745                         goto out;
746 
 747                 /* sdif set means the packet ingressed via a device
748                  * in an L3 domain and dif is set to it.
749                  */
750                 l3index = sdif ? dif : 0;
751                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
752                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
753                 if (!key)
754                         goto out;
755 
756 
757                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
758                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
759                         goto out;
760 
761         }
762 
763         if (key) {
764                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
765                                    (TCPOPT_NOP << 16) |
766                                    (TCPOPT_MD5SIG << 8) |
767                                    TCPOLEN_MD5SIG);
768                 /* Update length and the length the header thinks exists */
769                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
770                 rep.th.doff = arg.iov[0].iov_len / 4;
771 
772                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
773                                      key, ip_hdr(skb)->saddr,
774                                      ip_hdr(skb)->daddr, &rep.th);
775         }
776 #endif
777         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
778         if (rep.opt[0] == 0) {
779                 __be32 mrst = mptcp_reset_option(skb);
780 
781                 if (mrst) {
782                         rep.opt[0] = mrst;
783                         arg.iov[0].iov_len += sizeof(mrst);
784                         rep.th.doff = arg.iov[0].iov_len / 4;
785                 }
786         }
787 
788         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
789                                       ip_hdr(skb)->saddr, /* XXX */
790                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
791         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
792         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
793 
 794         /* When the socket is gone, all binding information is lost and
 795          * routing might fail in this case. No choice here: if we force the
 796          * input interface, we will misroute in the case of an asymmetric route.
797          */
798         if (sk) {
799                 arg.bound_dev_if = sk->sk_bound_dev_if;
800                 if (sk_fullsock(sk))
801                         trace_tcp_send_reset(sk, skb);
802         }
803 
804         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
805                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
806 
807         arg.tos = ip_hdr(skb)->tos;
808         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
809         local_bh_disable();
810         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
811         if (sk) {
812                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
813                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
814                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
815                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
816                 transmit_time = tcp_transmit_time(sk);
817         }
818         ip_send_unicast_reply(ctl_sk,
819                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
820                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
821                               &arg, arg.iov[0].iov_len,
822                               transmit_time);
823 
824         ctl_sk->sk_mark = 0;
825         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
826         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
827         local_bh_enable();
828 
829 #ifdef CONFIG_TCP_MD5SIG
830 out:
831         rcu_read_unlock();
832 #endif
833 }
834 
 835 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 836    outside of socket context, is certainly ugly. What can I do?
837  */
838 
839 static void tcp_v4_send_ack(const struct sock *sk,
840                             struct sk_buff *skb, u32 seq, u32 ack,
841                             u32 win, u32 tsval, u32 tsecr, int oif,
842                             struct tcp_md5sig_key *key,
843                             int reply_flags, u8 tos)
844 {
845         const struct tcphdr *th = tcp_hdr(skb);
846         struct {
847                 struct tcphdr th;
848                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
849 #ifdef CONFIG_TCP_MD5SIG
850                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
851 #endif
852                         ];
853         } rep;
854         struct net *net = sock_net(sk);
855         struct ip_reply_arg arg;
856         struct sock *ctl_sk;
857         u64 transmit_time;
858 
859         memset(&rep.th, 0, sizeof(struct tcphdr));
860         memset(&arg, 0, sizeof(arg));
861 
862         arg.iov[0].iov_base = (unsigned char *)&rep;
863         arg.iov[0].iov_len  = sizeof(rep.th);
864         if (tsecr) {
865                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
866                                    (TCPOPT_TIMESTAMP << 8) |
867                                    TCPOLEN_TIMESTAMP);
868                 rep.opt[1] = htonl(tsval);
869                 rep.opt[2] = htonl(tsecr);
870                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
871         }
872 
873         /* Swap the send and the receive. */
874         rep.th.dest    = th->source;
875         rep.th.source  = th->dest;
876         rep.th.doff    = arg.iov[0].iov_len / 4;
877         rep.th.seq     = htonl(seq);
878         rep.th.ack_seq = htonl(ack);
879         rep.th.ack     = 1;
880         rep.th.window  = htons(win);
881 
882 #ifdef CONFIG_TCP_MD5SIG
883         if (key) {
884                 int offset = (tsecr) ? 3 : 0;
885 
886                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
887                                           (TCPOPT_NOP << 16) |
888                                           (TCPOPT_MD5SIG << 8) |
889                                           TCPOLEN_MD5SIG);
890                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
891                 rep.th.doff = arg.iov[0].iov_len/4;
892 
893                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
894                                     key, ip_hdr(skb)->saddr,
895                                     ip_hdr(skb)->daddr, &rep.th);
896         }
897 #endif
898         arg.flags = reply_flags;
899         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
900                                       ip_hdr(skb)->saddr, /* XXX */
901                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
902         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
903         if (oif)
904                 arg.bound_dev_if = oif;
905         arg.tos = tos;
906         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
907         local_bh_disable();
908         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
909         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
910                            inet_twsk(sk)->tw_mark : sk->sk_mark;
911         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
912                            inet_twsk(sk)->tw_priority : sk->sk_priority;
913         transmit_time = tcp_transmit_time(sk);
914         ip_send_unicast_reply(ctl_sk,
915                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
916                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
917                               &arg, arg.iov[0].iov_len,
918                               transmit_time);
919 
920         ctl_sk->sk_mark = 0;
921         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
922         local_bh_enable();
923 }
924 
925 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
926 {
927         struct inet_timewait_sock *tw = inet_twsk(sk);
928         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
929 
930         tcp_v4_send_ack(sk, skb,
931                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
932                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
933                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
934                         tcptw->tw_ts_recent,
935                         tw->tw_bound_dev_if,
936                         tcp_twsk_md5_key(tcptw),
937                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
938                         tw->tw_tos
939                         );
940 
941         inet_twsk_put(tw);
942 }
943 
944 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
945                                   struct request_sock *req)
946 {
947         const union tcp_md5_addr *addr;
948         int l3index;
949 
950         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
951          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
952          */
953         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
954                                              tcp_sk(sk)->snd_nxt;
955 
956         /* RFC 7323 2.3
957          * The window field (SEG.WND) of every outgoing segment, with the
958          * exception of <SYN> segments, MUST be right-shifted by
959          * Rcv.Wind.Shift bits:
960          */
961         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
962         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
963         tcp_v4_send_ack(sk, skb, seq,
964                         tcp_rsk(req)->rcv_nxt,
965                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
966                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
967                         req->ts_recent,
968                         0,
969                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
970                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
971                         ip_hdr(skb)->tos);
972 }
973 
974 /*
975  *      Send a SYN-ACK after having received a SYN.
976  *      This still operates on a request_sock only, not on a big
977  *      socket.
978  */
979 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
980                               struct flowi *fl,
981                               struct request_sock *req,
982                               struct tcp_fastopen_cookie *foc,
983                               enum tcp_synack_type synack_type,
984                               struct sk_buff *syn_skb)
985 {
986         const struct inet_request_sock *ireq = inet_rsk(req);
987         struct flowi4 fl4;
988         int err = -1;
989         struct sk_buff *skb;
990         u8 tos;
991 
992         /* First, grab a route. */
993         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
994                 return -1;
995 
996         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
997 
998         if (skb) {
999                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1000 
1001                 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1002                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1003                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1004                                 inet_sk(sk)->tos;
1005 
1006                 if (!INET_ECN_is_capable(tos) &&
1007                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1008                         tos |= INET_ECN_ECT_0;
1009 
1010                 rcu_read_lock();
1011                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1012                                             ireq->ir_rmt_addr,
1013                                             rcu_dereference(ireq->ireq_opt),
1014                                             tos);
1015                 rcu_read_unlock();
1016                 err = net_xmit_eval(err);
1017         }
1018 
1019         return err;
1020 }
1021 
1022 /*
1023  *      IPv4 request_sock destructor.
1024  */
1025 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1026 {
1027         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1028 }
1029 
1030 #ifdef CONFIG_TCP_MD5SIG
1031 /*
1032  * RFC2385 MD5 checksumming requires a mapping of
1033  * IP address->MD5 Key.
1034  * We need to maintain these in the sk structure.
1035  */
1036 
1037 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1038 EXPORT_SYMBOL(tcp_md5_needed);
1039 
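/* Precedence when more than one configured key matches a peer address:
 * a key bound to an L3 (VRF) device is preferred over one that is not, and
 * among keys of equal scope the longer prefix wins.
 */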
1040 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1041 {
1042         if (!old)
1043                 return true;
1044 
1045         /* l3index always overrides non-l3index */
1046         if (old->l3index && new->l3index == 0)
1047                 return false;
1048         if (old->l3index == 0 && new->l3index)
1049                 return true;
1050 
1051         return old->prefixlen < new->prefixlen;
1052 }
1053 
1054 /* Find the Key structure for an address.  */
1055 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1056                                            const union tcp_md5_addr *addr,
1057                                            int family)
1058 {
1059         const struct tcp_sock *tp = tcp_sk(sk);
1060         struct tcp_md5sig_key *key;
1061         const struct tcp_md5sig_info *md5sig;
1062         __be32 mask;
1063         struct tcp_md5sig_key *best_match = NULL;
1064         bool match;
1065 
1066         /* caller either holds rcu_read_lock() or socket lock */
1067         md5sig = rcu_dereference_check(tp->md5sig_info,
1068                                        lockdep_sock_is_held(sk));
1069         if (!md5sig)
1070                 return NULL;
1071 
1072         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1073                                  lockdep_sock_is_held(sk)) {
1074                 if (key->family != family)
1075                         continue;
1076                 if (key->l3index && key->l3index != l3index)
1077                         continue;
1078                 if (family == AF_INET) {
1079                         mask = inet_make_mask(key->prefixlen);
1080                         match = (key->addr.a4.s_addr & mask) ==
1081                                 (addr->a4.s_addr & mask);
1082 #if IS_ENABLED(CONFIG_IPV6)
1083                 } else if (family == AF_INET6) {
1084                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1085                                                   key->prefixlen);
1086 #endif
1087                 } else {
1088                         match = false;
1089                 }
1090 
1091                 if (match && better_md5_match(best_match, key))
1092                         best_match = key;
1093         }
1094         return best_match;
1095 }
1096 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1097 
1098 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1099                                                       const union tcp_md5_addr *addr,
1100                                                       int family, u8 prefixlen,
1101                                                       int l3index)
1102 {
1103         const struct tcp_sock *tp = tcp_sk(sk);
1104         struct tcp_md5sig_key *key;
1105         unsigned int size = sizeof(struct in_addr);
1106         const struct tcp_md5sig_info *md5sig;
1107 
1108         /* caller either holds rcu_read_lock() or socket lock */
1109         md5sig = rcu_dereference_check(tp->md5sig_info,
1110                                        lockdep_sock_is_held(sk));
1111         if (!md5sig)
1112                 return NULL;
1113 #if IS_ENABLED(CONFIG_IPV6)
1114         if (family == AF_INET6)
1115                 size = sizeof(struct in6_addr);
1116 #endif
1117         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1118                                  lockdep_sock_is_held(sk)) {
1119                 if (key->family != family)
1120                         continue;
1121                 if (key->l3index != l3index)
1122                         continue;
1123                 if (!memcmp(&key->addr, addr, size) &&
1124                     key->prefixlen == prefixlen)
1125                         return key;
1126         }
1127         return NULL;
1128 }
1129 
1130 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1131                                          const struct sock *addr_sk)
1132 {
1133         const union tcp_md5_addr *addr;
1134         int l3index;
1135 
1136         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1137                                                  addr_sk->sk_bound_dev_if);
1138         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1139         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1140 }
1141 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1142 
1143 /* This can be called on a newly created socket, from other files */
1144 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1145                    int family, u8 prefixlen, int l3index,
1146                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1147 {
1148         /* Add Key to the list */
1149         struct tcp_md5sig_key *key;
1150         struct tcp_sock *tp = tcp_sk(sk);
1151         struct tcp_md5sig_info *md5sig;
1152 
1153         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1154         if (key) {
1155                 /* Pre-existing entry - just update that one.
1156                  * Note that the key might be used concurrently.
1157                  * data_race() is telling kcsan that we do not care about
1158                  * key mismatches, since changing the MD5 key on live flows
1159                  * can lead to packet drops.
1160                  */
1161                 data_race(memcpy(key->key, newkey, newkeylen));
1162 
1163                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1164                  * Also note that a reader could catch the new key->keylen value
1165                  * but the old key->key[]; this is the reason we use __GFP_ZERO
1166                  * at sock_kmalloc() time below these lines.
1167                  */
1168                 WRITE_ONCE(key->keylen, newkeylen);
1169 
1170                 return 0;
1171         }
1172 
1173         md5sig = rcu_dereference_protected(tp->md5sig_info,
1174                                            lockdep_sock_is_held(sk));
1175         if (!md5sig) {
1176                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1177                 if (!md5sig)
1178                         return -ENOMEM;
1179 
1180                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1181                 INIT_HLIST_HEAD(&md5sig->head);
1182                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1183         }
1184 
1185         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1186         if (!key)
1187                 return -ENOMEM;
1188         if (!tcp_alloc_md5sig_pool()) {
1189                 sock_kfree_s(sk, key, sizeof(*key));
1190                 return -ENOMEM;
1191         }
1192 
1193         memcpy(key->key, newkey, newkeylen);
1194         key->keylen = newkeylen;
1195         key->family = family;
1196         key->prefixlen = prefixlen;
1197         key->l3index = l3index;
1198         memcpy(&key->addr, addr,
1199                (family == AF_INET6) ? sizeof(struct in6_addr) :
1200                                       sizeof(struct in_addr));
1201         hlist_add_head_rcu(&key->node, &md5sig->head);
1202         return 0;
1203 }
1204 EXPORT_SYMBOL(tcp_md5_do_add);
1205 
1206 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1207                    u8 prefixlen, int l3index)
1208 {
1209         struct tcp_md5sig_key *key;
1210 
1211         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1212         if (!key)
1213                 return -ENOENT;
1214         hlist_del_rcu(&key->node);
1215         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1216         kfree_rcu(key, rcu);
1217         return 0;
1218 }
1219 EXPORT_SYMBOL(tcp_md5_do_del);
1220 
1221 static void tcp_clear_md5_list(struct sock *sk)
1222 {
1223         struct tcp_sock *tp = tcp_sk(sk);
1224         struct tcp_md5sig_key *key;
1225         struct hlist_node *n;
1226         struct tcp_md5sig_info *md5sig;
1227 
1228         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1229 
1230         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1231                 hlist_del_rcu(&key->node);
1232                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1233                 kfree_rcu(key, rcu);
1234         }
1235 }
1236 
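/* IPv4 handler for the TCP_MD5SIG and TCP_MD5SIG_EXT socket options.
 *
 * Minimal userspace sketch (illustrative only):
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	md5.tcpm_keylen = strlen("secret");
 *	memcpy(md5.tcpm_key, "secret", md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen removes the key for that address instead.
 */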
1237 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1238                                  sockptr_t optval, int optlen)
1239 {
1240         struct tcp_md5sig cmd;
1241         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1242         const union tcp_md5_addr *addr;
1243         u8 prefixlen = 32;
1244         int l3index = 0;
1245 
1246         if (optlen < sizeof(cmd))
1247                 return -EINVAL;
1248 
1249         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1250                 return -EFAULT;
1251 
1252         if (sin->sin_family != AF_INET)
1253                 return -EINVAL;
1254 
1255         if (optname == TCP_MD5SIG_EXT &&
1256             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1257                 prefixlen = cmd.tcpm_prefixlen;
1258                 if (prefixlen > 32)
1259                         return -EINVAL;
1260         }
1261 
1262         if (optname == TCP_MD5SIG_EXT &&
1263             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1264                 struct net_device *dev;
1265 
1266                 rcu_read_lock();
1267                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1268                 if (dev && netif_is_l3_master(dev))
1269                         l3index = dev->ifindex;
1270 
1271                 rcu_read_unlock();
1272 
1273                 /* ok to reference set/not set outside of rcu;
1274                  * right now device MUST be an L3 master
1275                  */
1276                 if (!dev || !l3index)
1277                         return -EINVAL;
1278         }
1279 
1280         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1281 
1282         if (!cmd.tcpm_keylen)
1283                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1284 
1285         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1286                 return -EINVAL;
1287 
1288         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1289                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1290 }
1291 
1292 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1293                                    __be32 daddr, __be32 saddr,
1294                                    const struct tcphdr *th, int nbytes)
1295 {
1296         struct tcp4_pseudohdr *bp;
1297         struct scatterlist sg;
1298         struct tcphdr *_th;
1299 
1300         bp = hp->scratch;
1301         bp->saddr = saddr;
1302         bp->daddr = daddr;
1303         bp->pad = 0;
1304         bp->protocol = IPPROTO_TCP;
1305         bp->len = cpu_to_be16(nbytes);
1306 
1307         _th = (struct tcphdr *)(bp + 1);
1308         memcpy(_th, th, sizeof(*th));
1309         _th->check = 0;
1310 
1311         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1312         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1313                                 sizeof(*bp) + sizeof(*th));
1314         return crypto_ahash_update(hp->md5_req);
1315 }
1316 
1317 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1318                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1319 {
1320         struct tcp_md5sig_pool *hp;
1321         struct ahash_request *req;
1322 
1323         hp = tcp_get_md5sig_pool();
1324         if (!hp)
1325                 goto clear_hash_noput;
1326         req = hp->md5_req;
1327 
1328         if (crypto_ahash_init(req))
1329                 goto clear_hash;
1330         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1331                 goto clear_hash;
1332         if (tcp_md5_hash_key(hp, key))
1333                 goto clear_hash;
1334         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1335         if (crypto_ahash_final(req))
1336                 goto clear_hash;
1337 
1338         tcp_put_md5sig_pool();
1339         return 0;
1340 
1341 clear_hash:
1342         tcp_put_md5sig_pool();
1343 clear_hash_noput:
1344         memset(md5_hash, 0, 16);
1345         return 1;
1346 }
1347 
1348 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1349                         const struct sock *sk,
1350                         const struct sk_buff *skb)
1351 {
1352         struct tcp_md5sig_pool *hp;
1353         struct ahash_request *req;
1354         const struct tcphdr *th = tcp_hdr(skb);
1355         __be32 saddr, daddr;
1356 
1357         if (sk) { /* valid for established/request sockets */
1358                 saddr = sk->sk_rcv_saddr;
1359                 daddr = sk->sk_daddr;
1360         } else {
1361                 const struct iphdr *iph = ip_hdr(skb);
1362                 saddr = iph->saddr;
1363                 daddr = iph->daddr;
1364         }
1365 
1366         hp = tcp_get_md5sig_pool();
1367         if (!hp)
1368                 goto clear_hash_noput;
1369         req = hp->md5_req;
1370 
1371         if (crypto_ahash_init(req))
1372                 goto clear_hash;
1373 
1374         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1375                 goto clear_hash;
1376         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1377                 goto clear_hash;
1378         if (tcp_md5_hash_key(hp, key))
1379                 goto clear_hash;
1380         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1381         if (crypto_ahash_final(req))
1382                 goto clear_hash;
1383 
1384         tcp_put_md5sig_pool();
1385         return 0;
1386 
1387 clear_hash:
1388         tcp_put_md5sig_pool();
1389 clear_hash_noput:
1390         memset(md5_hash, 0, 16);
1391         return 1;
1392 }
1393 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1394 
1395 #endif
1396 
1397 /* Called with rcu_read_lock() */
1398 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1399                                     const struct sk_buff *skb,
1400                                     int dif, int sdif)
1401 {
1402 #ifdef CONFIG_TCP_MD5SIG
1403         /*
1404          * This gets called for each TCP segment that arrives
1405          * so we want to be efficient.
1406          * We have 3 drop cases:
1407          * o No MD5 hash and one expected.
1408          * o MD5 hash and we're not expecting one.
1409          * o MD5 hash and it's wrong.
1410          */
1411         const __u8 *hash_location = NULL;
1412         struct tcp_md5sig_key *hash_expected;
1413         const struct iphdr *iph = ip_hdr(skb);
1414         const struct tcphdr *th = tcp_hdr(skb);
1415         const union tcp_md5_addr *addr;
1416         unsigned char newhash[16];
1417         int genhash, l3index;
1418 
1419         /* If sdif is set, the packet ingressed via a device
1420          * in an L3 domain and dif is set to the l3mdev ifindex
1421          */
1422         l3index = sdif ? dif : 0;
1423 
1424         addr = (union tcp_md5_addr *)&iph->saddr;
1425         hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1426         hash_location = tcp_parse_md5sig_option(th);
1427 
1428         /* We've parsed the options - do we have a hash? */
1429         if (!hash_expected && !hash_location)
1430                 return false;
1431 
1432         if (hash_expected && !hash_location) {
1433                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1434                 return true;
1435         }
1436 
1437         if (!hash_expected && hash_location) {
1438                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1439                 return true;
1440         }
1441 
1442         /* Okay, both hash_expected and hash_location are set -
1443          * so we need to calculate the hash and compare it.
1444          */
1445         genhash = tcp_v4_md5_hash_skb(newhash,
1446                                       hash_expected,
1447                                       NULL, skb);
1448 
1449         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1450                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1451                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1452                                      &iph->saddr, ntohs(th->source),
1453                                      &iph->daddr, ntohs(th->dest),
1454                                      genhash ? " tcp_v4_calc_md5_hash failed"
1455                                      : "", l3index);
1456                 return true;
1457         }
1458         return false;
1459 #endif
1460         return false;
1461 }
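
     /*
      * Decision summary for the check above (true means drop the segment):
      *
      *   key configured | MD5 option present | outcome
      *   ---------------+--------------------+-------------------------------
      *         no       |         no         | accept
      *         yes      |         no         | drop, TCPMD5NOTFOUND
      *         no       |         yes        | drop, TCPMD5UNEXPECTED
      *         yes      |         yes        | recompute and compare;
      *                  |                    | drop on mismatch (TCPMD5FAILURE)
      */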
1462 
1463 static void tcp_v4_init_req(struct request_sock *req,
1464                             const struct sock *sk_listener,
1465                             struct sk_buff *skb)
1466 {
1467         struct inet_request_sock *ireq = inet_rsk(req);
1468         struct net *net = sock_net(sk_listener);
1469 
1470         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1471         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1472         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1473 }
1474 
1475 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1476                                           struct sk_buff *skb,
1477                                           struct flowi *fl,
1478                                           struct request_sock *req)
1479 {
1480         tcp_v4_init_req(req, sk, skb);
1481 
1482         if (security_inet_conn_request(sk, skb, req))
1483                 return NULL;
1484 
1485         return inet_csk_route_req(sk, &fl->u.ip4, req);
1486 }
1487 
1488 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1489         .family         =       PF_INET,
1490         .obj_size       =       sizeof(struct tcp_request_sock),
1491         .rtx_syn_ack    =       tcp_rtx_synack,
1492         .send_ack       =       tcp_v4_reqsk_send_ack,
1493         .destructor     =       tcp_v4_reqsk_destructor,
1494         .send_reset     =       tcp_v4_send_reset,
1495         .syn_ack_timeout =      tcp_syn_ack_timeout,
1496 };
1497 
1498 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1499         .mss_clamp      =       TCP_MSS_DEFAULT,
1500 #ifdef CONFIG_TCP_MD5SIG
1501         .req_md5_lookup =       tcp_v4_md5_lookup,
1502         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1503 #endif
1504 #ifdef CONFIG_SYN_COOKIES
1505         .cookie_init_seq =      cookie_v4_init_sequence,
1506 #endif
1507         .route_req      =       tcp_v4_route_req,
1508         .init_seq       =       tcp_v4_init_seq,
1509         .init_ts_off    =       tcp_v4_init_ts_off,
1510         .send_synack    =       tcp_v4_send_synack,
1511 };
1512 
1513 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1514 {
1515         /* Never answer SYNs sent to broadcast or multicast addresses */
1516         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1517                 goto drop;
1518 
1519         return tcp_conn_request(&tcp_request_sock_ops,
1520                                 &tcp_request_sock_ipv4_ops, sk, skb);
1521 
1522 drop:
1523         tcp_listendrop(sk);
1524         return 0;
1525 }
1526 EXPORT_SYMBOL(tcp_v4_conn_request);
1527 
1528 
1529 /*
1530  * The three-way handshake has completed - we got a valid ACK from the
1531  * peer - now create the new socket.
1532  */
1533 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1534                                   struct request_sock *req,
1535                                   struct dst_entry *dst,
1536                                   struct request_sock *req_unhash,
1537                                   bool *own_req)
1538 {
1539         struct inet_request_sock *ireq;
1540         bool found_dup_sk = false;
1541         struct inet_sock *newinet;
1542         struct tcp_sock *newtp;
1543         struct sock *newsk;
1544 #ifdef CONFIG_TCP_MD5SIG
1545         const union tcp_md5_addr *addr;
1546         struct tcp_md5sig_key *key;
1547         int l3index;
1548 #endif
1549         struct ip_options_rcu *inet_opt;
1550 
1551         if (sk_acceptq_is_full(sk))
1552                 goto exit_overflow;
1553 
1554         newsk = tcp_create_openreq_child(sk, req, skb);
1555         if (!newsk)
1556                 goto exit_nonewsk;
1557 
1558         newsk->sk_gso_type = SKB_GSO_TCPV4;
1559         inet_sk_rx_dst_set(newsk, skb);
1560 
1561         newtp                 = tcp_sk(newsk);
1562         newinet               = inet_sk(newsk);
1563         ireq                  = inet_rsk(req);
1564         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1565         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1566         newsk->sk_bound_dev_if = ireq->ir_iif;
1567         newinet->inet_saddr   = ireq->ir_loc_addr;
1568         inet_opt              = rcu_dereference(ireq->ireq_opt);
1569         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1570         newinet->mc_index     = inet_iif(skb);
1571         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1572         newinet->rcv_tos      = ip_hdr(skb)->tos;
1573         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1574         if (inet_opt)
1575                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1576         newinet->inet_id = prandom_u32();
1577 
1578         /* Set ToS of the new socket based upon the value of incoming SYN.
1579          * ECT bits are set later in tcp_init_transfer().
1580          */
1581         if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1582                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1583 
1584         if (!dst) {
1585                 dst = inet_csk_route_child_sock(sk, newsk, req);
1586                 if (!dst)
1587                         goto put_and_exit;
1588         } else {
1589                 /* syncookie case : see end of cookie_v4_check() */
1590         }
1591         sk_setup_caps(newsk, dst);
1592 
1593         tcp_ca_openreq_child(newsk, dst);
1594 
1595         tcp_sync_mss(newsk, dst_mtu(dst));
1596         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1597 
1598         tcp_initialize_rcv_mss(newsk);
1599 
1600 #ifdef CONFIG_TCP_MD5SIG
1601         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1602         /* Copy over the MD5 key from the original socket */
1603         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1604         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1605         if (key) {
1606                 /*
1607                  * We're using one, so create a matching key
1608                  * on the newsk structure. If we fail to get
1609                  * memory, then we end up not copying the key
1610                  * across. Shucks.
1611                  */
1612                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1613                                key->key, key->keylen, GFP_ATOMIC);
1614                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1615         }
1616 #endif
1617 
1618         if (__inet_inherit_port(sk, newsk) < 0)
1619                 goto put_and_exit;
1620         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1621                                        &found_dup_sk);
1622         if (likely(*own_req)) {
1623                 tcp_move_syn(newtp, req);
1624                 ireq->ireq_opt = NULL;
1625         } else {
1626                 newinet->inet_opt = NULL;
1627 
1628                 if (!req_unhash && found_dup_sk) {
1629                         /* This code path should only be executed in the
1630                          * syncookie case
1631                          */
1632                         bh_unlock_sock(newsk);
1633                         sock_put(newsk);
1634                         newsk = NULL;
1635                 }
1636         }
1637         return newsk;
1638 
1639 exit_overflow:
1640         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1641 exit_nonewsk:
1642         dst_release(dst);
1643 exit:
1644         tcp_listendrop(sk);
1645         return NULL;
1646 put_and_exit:
1647         newinet->inet_opt = NULL;
1648         inet_csk_prepare_forced_close(newsk);
1649         tcp_done(newsk);
1650         goto exit;
1651 }
1652 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1653 
1654 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1655 {
1656 #ifdef CONFIG_SYN_COOKIES
1657         const struct tcphdr *th = tcp_hdr(skb);
1658 
1659         if (!th->syn)
1660                 sk = cookie_v4_check(sk, skb);
1661 #endif
1662         return sk;
1663 }
1664 
1665 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1666                          struct tcphdr *th, u32 *cookie)
1667 {
1668         u16 mss = 0;
1669 #ifdef CONFIG_SYN_COOKIES
1670         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1671                                     &tcp_request_sock_ipv4_ops, sk, th);
1672         if (mss) {
1673                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1674                 tcp_synq_overflow(sk);
1675         }
1676 #endif
1677         return mss;
1678 }
1679 
1680 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1681                                                            u32));
1682 /* The socket must have its spinlock held when we get
1683  * here, unless it is a TCP_LISTEN socket.
1684  *
1685  * We have a potential double-lock case here, so even when
1686  * doing backlog processing we use the BH locking scheme.
1687  * This is because we cannot sleep with the original spinlock
1688  * held.
1689  */
1690 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1691 {
1692         struct sock *rsk;
1693 
1694         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1695                 struct dst_entry *dst = sk->sk_rx_dst;
1696 
1697                 sock_rps_save_rxhash(sk, skb);
1698                 sk_mark_napi_id(sk, skb);
1699                 if (dst) {
1700                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1701                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1702                                              dst, 0)) {
1703                                 dst_release(dst);
1704                                 sk->sk_rx_dst = NULL;
1705                         }
1706                 }
1707                 tcp_rcv_established(sk, skb);
1708                 return 0;
1709         }
1710 
1711         if (tcp_checksum_complete(skb))
1712                 goto csum_err;
1713 
1714         if (sk->sk_state == TCP_LISTEN) {
1715                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1716 
1717                 if (!nsk)
1718                         goto discard;
1719                 if (nsk != sk) {
1720                         if (tcp_child_process(sk, nsk, skb)) {
1721                                 rsk = nsk;
1722                                 goto reset;
1723                         }
1724                         return 0;
1725                 }
1726         } else
1727                 sock_rps_save_rxhash(sk, skb);
1728 
1729         if (tcp_rcv_state_process(sk, skb)) {
1730                 rsk = sk;
1731                 goto reset;
1732         }
1733         return 0;
1734 
1735 reset:
1736         tcp_v4_send_reset(rsk, skb);
1737 discard:
1738         kfree_skb(skb);
1739         /* Be careful here. If this function gets more complicated and
1740          * gcc suffers from register pressure on the x86, sk (in %ebx)
1741          * might be destroyed here. This current version compiles correctly,
1742          * but you have been warned.
1743          */
1744         return 0;
1745 
1746 csum_err:
1747         trace_tcp_bad_csum(skb);
1748         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1749         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1750         goto discard;
1751 }
1752 EXPORT_SYMBOL(tcp_v4_do_rcv);
1753 
1754 int tcp_v4_early_demux(struct sk_buff *skb)
1755 {
1756         const struct iphdr *iph;
1757         const struct tcphdr *th;
1758         struct sock *sk;
1759 
1760         if (skb->pkt_type != PACKET_HOST)
1761                 return 0;
1762 
1763         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1764                 return 0;
1765 
1766         iph = ip_hdr(skb);
1767         th = tcp_hdr(skb);
1768 
1769         if (th->doff < sizeof(struct tcphdr) / 4)
1770                 return 0;
1771 
1772         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1773                                        iph->saddr, th->source,
1774                                        iph->daddr, ntohs(th->dest),
1775                                        skb->skb_iif, inet_sdif(skb));
1776         if (sk) {
1777                 skb->sk = sk;
1778                 skb->destructor = sock_edemux;
1779                 if (sk_fullsock(sk)) {
1780                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1781 
1782                         if (dst)
1783                                 dst = dst_check(dst, 0);
1784                         if (dst &&
1785                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1786                                 skb_dst_set_noref(skb, dst);
1787                 }
1788         }
1789         return 0;
1790 }
1791 
1792 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1793 {
1794         u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1795         u32 tail_gso_size, tail_gso_segs;
1796         struct skb_shared_info *shinfo;
1797         const struct tcphdr *th;
1798         struct tcphdr *thtail;
1799         struct sk_buff *tail;
1800         unsigned int hdrlen;
1801         bool fragstolen;
1802         u32 gso_segs;
1803         u32 gso_size;
1804         int delta;
1805 
1806         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1807          * we can fix skb->truesize to its real value to avoid future drops.
1808          * This is valid because skb is not yet charged to the socket.
1809          * It has been noticed that pure SACK packets were sometimes dropped
1810          * (if cooked by drivers without copybreak feature).
1811          */
1812         skb_condense(skb);
1813 
1814         skb_dst_drop(skb);
1815 
1816         if (unlikely(tcp_checksum_complete(skb))) {
1817                 bh_unlock_sock(sk);
1818                 trace_tcp_bad_csum(skb);
1819                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1820                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1821                 return true;
1822         }
1823 
1824         /* Attempt coalescing to last skb in backlog, even if we are
1825          * above the limits.
1826          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1827          */
1828         th = (const struct tcphdr *)skb->data;
1829         hdrlen = th->doff * 4;
1830 
1831         tail = sk->sk_backlog.tail;
1832         if (!tail)
1833                 goto no_coalesce;
1834         thtail = (struct tcphdr *)tail->data;
1835 
1836         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1837             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1838             ((TCP_SKB_CB(tail)->tcp_flags |
1839               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1840             !((TCP_SKB_CB(tail)->tcp_flags &
1841               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1842             ((TCP_SKB_CB(tail)->tcp_flags ^
1843               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1844 #ifdef CONFIG_TLS_DEVICE
1845             tail->decrypted != skb->decrypted ||
1846 #endif
1847             thtail->doff != th->doff ||
1848             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1849                 goto no_coalesce;
1850 
1851         __skb_pull(skb, hdrlen);
1852 
1853         shinfo = skb_shinfo(skb);
1854         gso_size = shinfo->gso_size ?: skb->len;
1855         gso_segs = shinfo->gso_segs ?: 1;
1856 
1857         shinfo = skb_shinfo(tail);
1858         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1859         tail_gso_segs = shinfo->gso_segs ?: 1;
1860 
1861         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1862                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1863 
1864                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1865                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1866                         thtail->window = th->window;
1867                 }
1868 
1869                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1870                  * thtail->fin, so that the fast path in tcp_rcv_established()
1871                  * is not entered if we append a packet with a FIN.
1872                  * SYN, RST, URG are not present.
1873                  * ACK is set on both packets.
1874                  * PSH : we do not really care in TCP stack,
1875                  *       at least for 'GRO' packets.
1876                  */
1877                 thtail->fin |= th->fin;
1878                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1879 
1880                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1881                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1882                         tail->tstamp = skb->tstamp;
1883                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1884                 }
1885 
1886                 /* Not as strict as GRO. We only need to carry the max mss value */
1887                 shinfo->gso_size = max(gso_size, tail_gso_size);
1888                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1889 
1890                 sk->sk_backlog.len += delta;
1891                 __NET_INC_STATS(sock_net(sk),
1892                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1893                 kfree_skb_partial(skb, fragstolen);
1894                 return false;
1895         }
1896         __skb_push(skb, hdrlen);
1897 
1898 no_coalesce:
1899         /* Only the socket owner can try to collapse/prune rx queues
1900          * to reduce memory overhead, so add a little headroom here.
1901          * Only a few socket backlogs are likely to be non-empty concurrently.
1902          */
1903         limit += 64*1024;
1904 
1905         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1906                 bh_unlock_sock(sk);
1907                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1908                 return true;
1909         }
1910         return false;
1911 }
1912 EXPORT_SYMBOL(tcp_add_backlog);
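
     /*
      * Sizing sketch with made-up numbers: if sk_rcvbuf is 256 KB and
      * sk_sndbuf is 64 KB, segments that cannot be coalesced into the tail
      * skb are queued until the backlog (together with memory already
      * charged to the receive queue) exceeds roughly 256 + 64 + 64 KB, the
      * last 64 KB being the headroom added above; anything beyond that is
      * dropped and counted as LINUX_MIB_TCPBACKLOGDROP.
      */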
1913 
1914 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1915 {
1916         struct tcphdr *th = (struct tcphdr *)skb->data;
1917 
1918         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1919 }
1920 EXPORT_SYMBOL(tcp_filter);
1921 
1922 static void tcp_v4_restore_cb(struct sk_buff *skb)
1923 {
1924         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1925                 sizeof(struct inet_skb_parm));
1926 }
1927 
1928 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1929                            const struct tcphdr *th)
1930 {
1931         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1932          * barrier() makes sure the compiler won't play fool^Waliasing games.
1933          */
1934         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1935                 sizeof(struct inet_skb_parm));
1936         barrier();
1937 
1938         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1939         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1940                                     skb->len - th->doff * 4);
1941         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1942         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1943         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1944         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1945         TCP_SKB_CB(skb)->sacked  = 0;
1946         TCP_SKB_CB(skb)->has_rxtstamp =
1947                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1948 }
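
     /*
      * Example of the end_seq arithmetic above: SYN and FIN each consume one
      * unit of sequence space, so a bare SYN yields end_seq = seq + 1, a pure
      * ACK with no data yields end_seq == seq, and a 100-byte data segment
      * yields end_seq = seq + 100.
      */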
1949 
1950 /*
1951  *      From tcp_input.c
1952  */
1953 
1954 int tcp_v4_rcv(struct sk_buff *skb)
1955 {
1956         struct net *net = dev_net(skb->dev);
1957         struct sk_buff *skb_to_free;
1958         int sdif = inet_sdif(skb);
1959         int dif = inet_iif(skb);
1960         const struct iphdr *iph;
1961         const struct tcphdr *th;
1962         bool refcounted;
1963         struct sock *sk;
1964         int ret;
1965 
1966         if (skb->pkt_type != PACKET_HOST)
1967                 goto discard_it;
1968 
1969         /* Count it even if it's bad */
1970         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1971 
1972         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1973                 goto discard_it;
1974 
1975         th = (const struct tcphdr *)skb->data;
1976 
1977         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1978                 goto bad_packet;
1979         if (!pskb_may_pull(skb, th->doff * 4))
1980                 goto discard_it;
1981 
1982         /* An explanation is required here, I think.
1983          * Packet length and doff are validated by header prediction,
1984          * provided the th->doff == 0 case is eliminated.
1985          * So, we defer the checks. */
1986 
1987         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1988                 goto csum_error;
1989 
1990         th = (const struct tcphdr *)skb->data;
1991         iph = ip_hdr(skb);
1992 lookup:
1993         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1994                                th->dest, sdif, &refcounted);
1995         if (!sk)
1996                 goto no_tcp_socket;
1997 
1998 process:
1999         if (sk->sk_state == TCP_TIME_WAIT)
2000                 goto do_time_wait;
2001 
2002         if (sk->sk_state == TCP_NEW_SYN_RECV) {
2003                 struct request_sock *req = inet_reqsk(sk);
2004                 bool req_stolen = false;
2005                 struct sock *nsk;
2006 
2007                 sk = req->rsk_listener;
2008                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2009                         sk_drops_add(sk, skb);
2010                         reqsk_put(req);
2011                         goto discard_it;
2012                 }
2013                 if (tcp_checksum_complete(skb)) {
2014                         reqsk_put(req);
2015                         goto csum_error;
2016                 }
2017                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2018                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2019                         if (!nsk) {
2020                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2021                                 goto lookup;
2022                         }
2023                         sk = nsk;
2024                         /* reuseport_migrate_sock() has already taken one sk_refcnt
2025                          * reference before returning.
2026                          */
2027                 } else {
2028                         /* We own a reference on the listener, increase it again
2029                          * as we might lose it too soon.
2030                          */
2031                         sock_hold(sk);
2032                 }
2033                 refcounted = true;
2034                 nsk = NULL;
2035                 if (!tcp_filter(sk, skb)) {
2036                         th = (const struct tcphdr *)skb->data;
2037                         iph = ip_hdr(skb);
2038                         tcp_v4_fill_cb(skb, iph, th);
2039                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2040                 }
2041                 if (!nsk) {
2042                         reqsk_put(req);
2043                         if (req_stolen) {
2044                                 /* Another cpu got exclusive access to req
2045                                  * and created a full blown socket.
2046                                  * Try to feed this packet to this socket
2047                                  * instead of discarding it.
2048                                  */
2049                                 tcp_v4_restore_cb(skb);
2050                                 sock_put(sk);
2051                                 goto lookup;
2052                         }
2053                         goto discard_and_relse;
2054                 }
2055                 if (nsk == sk) {
2056                         reqsk_put(req);
2057                         tcp_v4_restore_cb(skb);
2058                 } else if (tcp_child_process(sk, nsk, skb)) {
2059                         tcp_v4_send_reset(nsk, skb);
2060                         goto discard_and_relse;
2061                 } else {
2062                         sock_put(sk);
2063                         return 0;
2064                 }
2065         }
2066         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2067                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2068                 goto discard_and_relse;
2069         }
2070 
2071         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2072                 goto discard_and_relse;
2073 
2074         if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2075                 goto discard_and_relse;
2076 
2077         nf_reset_ct(skb);
2078 
2079         if (tcp_filter(sk, skb))
2080                 goto discard_and_relse;
2081         th = (const struct tcphdr *)skb->data;
2082         iph = ip_hdr(skb);
2083         tcp_v4_fill_cb(skb, iph, th);
2084 
2085         skb->dev = NULL;
2086 
2087         if (sk->sk_state == TCP_LISTEN) {
2088                 ret = tcp_v4_do_rcv(sk, skb);
2089                 goto put_and_return;
2090         }
2091 
2092         sk_incoming_cpu_update(sk);
2093 
2094         bh_lock_sock_nested(sk);
2095         tcp_segs_in(tcp_sk(sk), skb);
2096         ret = 0;
2097         if (!sock_owned_by_user(sk)) {
2098                 skb_to_free = sk->sk_rx_skb_cache;
2099                 sk->sk_rx_skb_cache = NULL;
2100                 ret = tcp_v4_do_rcv(sk, skb);
2101         } else {
2102                 if (tcp_add_backlog(sk, skb))
2103                         goto discard_and_relse;
2104                 skb_to_free = NULL;
2105         }
2106         bh_unlock_sock(sk);
2107         if (skb_to_free)
2108                 __kfree_skb(skb_to_free);
2109 
2110 put_and_return:
2111         if (refcounted)
2112                 sock_put(sk);
2113 
2114         return ret;
2115 
2116 no_tcp_socket:
2117         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2118                 goto discard_it;
2119 
2120         tcp_v4_fill_cb(skb, iph, th);
2121 
2122         if (tcp_checksum_complete(skb)) {
2123 csum_error:
2124                 trace_tcp_bad_csum(skb);
2125                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2126 bad_packet:
2127                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2128         } else {
2129                 tcp_v4_send_reset(NULL, skb);
2130         }
2131 
2132 discard_it:
2133         /* Discard frame. */
2134         kfree_skb(skb);
2135         return 0;
2136 
2137 discard_and_relse:
2138         sk_drops_add(sk, skb);
2139         if (refcounted)
2140                 sock_put(sk);
2141         goto discard_it;
2142 
2143 do_time_wait:
2144         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2145                 inet_twsk_put(inet_twsk(sk));
2146                 goto discard_it;
2147         }
2148 
2149         tcp_v4_fill_cb(skb, iph, th);
2150 
2151         if (tcp_checksum_complete(skb)) {
2152                 inet_twsk_put(inet_twsk(sk));
2153                 goto csum_error;
2154         }
2155         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2156         case TCP_TW_SYN: {
2157                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2158                                                         &tcp_hashinfo, skb,
2159                                                         __tcp_hdrlen(th),
2160                                                         iph->saddr, th->source,
2161                                                         iph->daddr, th->dest,
2162                                                         inet_iif(skb),
2163                                                         sdif);
2164                 if (sk2) {
2165                         inet_twsk_deschedule_put(inet_twsk(sk));
2166                         sk = sk2;
2167                         tcp_v4_restore_cb(skb);
2168                         refcounted = false;
2169                         goto process;
2170                 }
2171         }
2172                 /* to ACK */
2173                 fallthrough;
2174         case TCP_TW_ACK:
2175                 tcp_v4_timewait_ack(sk, skb);
2176                 break;
2177         case TCP_TW_RST:
2178                 tcp_v4_send_reset(sk, skb);
2179                 inet_twsk_deschedule_put(inet_twsk(sk));
2180                 goto discard_it;
2181         case TCP_TW_SUCCESS:;
2182         }
2183         goto discard_it;
2184 }
2185 
2186 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2187         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2188         .twsk_unique    = tcp_twsk_unique,
2189         .twsk_destructor= tcp_twsk_destructor,
2190 };
2191 
2192 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2193 {
2194         struct dst_entry *dst = skb_dst(skb);
2195 
2196         if (dst && dst_hold_safe(dst)) {
2197                 sk->sk_rx_dst = dst;
2198                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2199         }
2200 }
2201 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2202 
2203 const struct inet_connection_sock_af_ops ipv4_specific = {
2204         .queue_xmit        = ip_queue_xmit,
2205         .send_check        = tcp_v4_send_check,
2206         .rebuild_header    = inet_sk_rebuild_header,
2207         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2208         .conn_request      = tcp_v4_conn_request,
2209         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2210         .net_header_len    = sizeof(struct iphdr),
2211         .setsockopt        = ip_setsockopt,
2212         .getsockopt        = ip_getsockopt,
2213         .addr2sockaddr     = inet_csk_addr2sockaddr,
2214         .sockaddr_len      = sizeof(struct sockaddr_in),
2215         .mtu_reduced       = tcp_v4_mtu_reduced,
2216 };
2217 EXPORT_SYMBOL(ipv4_specific);
2218 
2219 #ifdef CONFIG_TCP_MD5SIG
2220 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2221         .md5_lookup             = tcp_v4_md5_lookup,
2222         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2223         .md5_parse              = tcp_v4_parse_md5_keys,
2224 };
2225 #endif
2226 
2227 /* NOTE: A lot of things are set to zero explicitly by the call to
2228  *       sk_alloc(), so they need not be done here.
2229  */
2230 static int tcp_v4_init_sock(struct sock *sk)
2231 {
2232         struct inet_connection_sock *icsk = inet_csk(sk);
2233 
2234         tcp_init_sock(sk);
2235 
2236         icsk->icsk_af_ops = &ipv4_specific;
2237 
2238 #ifdef CONFIG_TCP_MD5SIG
2239         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2240 #endif
2241 
2242         return 0;
2243 }
2244 
2245 void tcp_v4_destroy_sock(struct sock *sk)
2246 {
2247         struct tcp_sock *tp = tcp_sk(sk);
2248 
2249         trace_tcp_destroy_sock(sk);
2250 
2251         tcp_clear_xmit_timers(sk);
2252 
2253         tcp_cleanup_congestion_control(sk);
2254 
2255         tcp_cleanup_ulp(sk);
2256 
2257         /* Clean up the write buffer. */
2258         tcp_write_queue_purge(sk);
2259 
2260         /* Check if we want to disable active TFO */
2261         tcp_fastopen_active_disable_ofo_check(sk);
2262 
2263         /* Cleans up our, hopefully empty, out_of_order_queue. */
2264         skb_rbtree_purge(&tp->out_of_order_queue);
2265 
2266 #ifdef CONFIG_TCP_MD5SIG
2267         /* Clean up the MD5 key list, if any */
2268         if (tp->md5sig_info) {
2269                 tcp_clear_md5_list(sk);
2270                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2271                 tp->md5sig_info = NULL;
2272         }
2273 #endif
2274 
2275         /* Clean up a referenced TCP bind bucket. */
2276         if (inet_csk(sk)->icsk_bind_hash)
2277                 inet_put_port(sk);
2278 
2279         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2280 
2281         /* If socket is aborted during connect operation */
2282         tcp_free_fastopen_req(tp);
2283         tcp_fastopen_destroy_cipher(sk);
2284         tcp_saved_syn_free(tp);
2285 
2286         sk_sockets_allocated_dec(sk);
2287 }
2288 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2289 
2290 #ifdef CONFIG_PROC_FS
2291 /* Proc filesystem TCP sock list dumping. */
2292 
2293 /*
2294  * Get the next listener socket following cur.  If cur is NULL, get the first
2295  * socket starting from the bucket given in st->bucket; when st->bucket is zero
2296  * the very first socket in the hash table is returned.
2297  */
2298 static void *listening_get_next(struct seq_file *seq, void *cur)
2299 {
2300         struct tcp_seq_afinfo *afinfo;
2301         struct tcp_iter_state *st = seq->private;
2302         struct net *net = seq_file_net(seq);
2303         struct inet_listen_hashbucket *ilb;
2304         struct hlist_nulls_node *node;
2305         struct sock *sk = cur;
2306 
2307         if (st->bpf_seq_afinfo)
2308                 afinfo = st->bpf_seq_afinfo;
2309         else
2310                 afinfo = PDE_DATA(file_inode(seq->file));
2311 
2312         if (!sk) {
2313 get_head:
2314                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2315                 spin_lock(&ilb->lock);
2316                 sk = sk_nulls_head(&ilb->nulls_head);
2317                 st->offset = 0;
2318                 goto get_sk;
2319         }
2320         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2321         ++st->num;
2322         ++st->offset;
2323 
2324         sk = sk_nulls_next(sk);
2325 get_sk:
2326         sk_nulls_for_each_from(sk, node) {
2327                 if (!net_eq(sock_net(sk), net))
2328                         continue;
2329                 if (afinfo->family == AF_UNSPEC ||
2330                     sk->sk_family == afinfo->family)
2331                         return sk;
2332         }
2333         spin_unlock(&ilb->lock);
2334         st->offset = 0;
2335         if (++st->bucket < INET_LHTABLE_SIZE)
2336                 goto get_head;
2337         return NULL;
2338 }
2339 
2340 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2341 {
2342         struct tcp_iter_state *st = seq->private;
2343         void *rc;
2344 
2345         st->bucket = 0;
2346         st->offset = 0;
2347         rc = listening_get_next(seq, NULL);
2348 
2349         while (rc && *pos) {
2350                 rc = listening_get_next(seq, rc);
2351                 --*pos;
2352         }
2353         return rc;
2354 }
2355 
2356 static inline bool empty_bucket(const struct tcp_iter_state *st)
2357 {
2358         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2359 }
2360 
2361 /*
2362  * Get first established socket starting from bucket given in st->bucket.
2363  * If st->bucket is zero, the very first socket in the hash is returned.
2364  */
2365 static void *established_get_first(struct seq_file *seq)
2366 {
2367         struct tcp_seq_afinfo *afinfo;
2368         struct tcp_iter_state *st = seq->private;
2369         struct net *net = seq_file_net(seq);
2370         void *rc = NULL;
2371 
2372         if (st->bpf_seq_afinfo)
2373                 afinfo = st->bpf_seq_afinfo;
2374         else
2375                 afinfo = PDE_DATA(file_inode(seq->file));
2376 
2377         st->offset = 0;
2378         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2379                 struct sock *sk;
2380                 struct hlist_nulls_node *node;
2381                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2382 
2383                 /* Lockless fast path for the common case of empty buckets */
2384                 if (empty_bucket(st))
2385                         continue;
2386 
2387                 spin_lock_bh(lock);
2388                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2389                         if ((afinfo->family != AF_UNSPEC &&
2390                              sk->sk_family != afinfo->family) ||
2391                             !net_eq(sock_net(sk), net)) {
2392                                 continue;
2393                         }
2394                         rc = sk;
2395                         goto out;
2396                 }
2397                 spin_unlock_bh(lock);
2398         }
2399 out:
2400         return rc;
2401 }
2402 
2403 static void *established_get_next(struct seq_file *seq, void *cur)
2404 {
2405         struct tcp_seq_afinfo *afinfo;
2406         struct sock *sk = cur;
2407         struct hlist_nulls_node *node;
2408         struct tcp_iter_state *st = seq->private;
2409         struct net *net = seq_file_net(seq);
2410 
2411         if (st->bpf_seq_afinfo)
2412                 afinfo = st->bpf_seq_afinfo;
2413         else
2414                 afinfo = PDE_DATA(file_inode(seq->file));
2415 
2416         ++st->num;
2417         ++st->offset;
2418 
2419         sk = sk_nulls_next(sk);
2420 
2421         sk_nulls_for_each_from(sk, node) {
2422                 if ((afinfo->family == AF_UNSPEC ||
2423                      sk->sk_family == afinfo->family) &&
2424                     net_eq(sock_net(sk), net))
2425                         return sk;
2426         }
2427 
2428         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2429         ++st->bucket;
2430         return established_get_first(seq);
2431 }
2432 
2433 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2434 {
2435         struct tcp_iter_state *st = seq->private;
2436         void *rc;
2437 
2438         st->bucket = 0;
2439         rc = established_get_first(seq);
2440 
2441         while (rc && pos) {
2442                 rc = established_get_next(seq, rc);
2443                 --pos;
2444         }
2445         return rc;
2446 }
2447 
2448 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2449 {
2450         void *rc;
2451         struct tcp_iter_state *st = seq->private;
2452 
2453         st->state = TCP_SEQ_STATE_LISTENING;
2454         rc        = listening_get_idx(seq, &pos);
2455 
2456         if (!rc) {
2457                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2458                 rc        = established_get_idx(seq, pos);
2459         }
2460 
2461         return rc;
2462 }
2463 
2464 static void *tcp_seek_last_pos(struct seq_file *seq)
2465 {
2466         struct tcp_iter_state *st = seq->private;
2467         int bucket = st->bucket;
2468         int offset = st->offset;
2469         int orig_num = st->num;
2470         void *rc = NULL;
2471 
2472         switch (st->state) {
2473         case TCP_SEQ_STATE_LISTENING:
2474                 if (st->bucket >= INET_LHTABLE_SIZE)
2475                         break;
2476                 st->state = TCP_SEQ_STATE_LISTENING;
2477                 rc = listening_get_next(seq, NULL);
2478                 while (offset-- && rc && bucket == st->bucket)
2479                         rc = listening_get_next(seq, rc);
2480                 if (rc)
2481                         break;
2482                 st->bucket = 0;
2483                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2484                 fallthrough;
2485         case TCP_SEQ_STATE_ESTABLISHED:
2486                 if (st->bucket > tcp_hashinfo.ehash_mask)
2487                         break;
2488                 rc = established_get_first(seq);
2489                 while (offset-- && rc && bucket == st->bucket)
2490                         rc = established_get_next(seq, rc);
2491         }
2492 
2493         st->num = orig_num;
2494 
2495         return rc;
2496 }
2497 
2498 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2499 {
2500         struct tcp_iter_state *st = seq->private;
2501         void *rc;
2502 
2503         if (*pos && *pos == st->last_pos) {
2504                 rc = tcp_seek_last_pos(seq);
2505                 if (rc)
2506                         goto out;
2507         }
2508 
2509         st->state = TCP_SEQ_STATE_LISTENING;
2510         st->num = 0;
2511         st->bucket = 0;
2512         st->offset = 0;
2513         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2514 
2515 out:
2516         st->last_pos = *pos;
2517         return rc;
2518 }
2519 EXPORT_SYMBOL(tcp_seq_start);
2520 
2521 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2522 {
2523         struct tcp_iter_state *st = seq->private;
2524         void *rc = NULL;
2525 
2526         if (v == SEQ_START_TOKEN) {
2527                 rc = tcp_get_idx(seq, 0);
2528                 goto out;
2529         }
2530 
2531         switch (st->state) {
2532         case TCP_SEQ_STATE_LISTENING:
2533                 rc = listening_get_next(seq, v);
2534                 if (!rc) {
2535                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2536                         st->bucket = 0;
2537                         st->offset = 0;
2538                         rc        = established_get_first(seq);
2539                 }
2540                 break;
2541         case TCP_SEQ_STATE_ESTABLISHED:
2542                 rc = established_get_next(seq, v);
2543                 break;
2544         }
2545 out:
2546         ++*pos;
2547         st->last_pos = *pos;
2548         return rc;
2549 }
2550 EXPORT_SYMBOL(tcp_seq_next);
2551 
2552 void tcp_seq_stop(struct seq_file *seq, void *v)
2553 {
2554         struct tcp_iter_state *st = seq->private;
2555 
2556         switch (st->state) {
2557         case TCP_SEQ_STATE_LISTENING:
2558                 if (v != SEQ_START_TOKEN)
2559                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2560                 break;
2561         case TCP_SEQ_STATE_ESTABLISHED:
2562                 if (v)
2563                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2564                 break;
2565         }
2566 }
2567 EXPORT_SYMBOL(tcp_seq_stop);
2568 
2569 static void get_openreq4(const struct request_sock *req,
2570                          struct seq_file *f, int i)
2571 {
2572         const struct inet_request_sock *ireq = inet_rsk(req);
2573         long delta = req->rsk_timer.expires - jiffies;
2574 
2575         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2576                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2577                 i,
2578                 ireq->ir_loc_addr,
2579                 ireq->ir_num,
2580                 ireq->ir_rmt_addr,
2581                 ntohs(ireq->ir_rmt_port),
2582                 TCP_SYN_RECV,
2583                 0, 0, /* could print option size, but that is af dependent. */
2584                 1,    /* timers active (only the expire timer) */
2585                 jiffies_delta_to_clock_t(delta),
2586                 req->num_timeout,
2587                 from_kuid_munged(seq_user_ns(f),
2588                                  sock_i_uid(req->rsk_listener)),
2589                 0,  /* non standard timer */
2590                 0, /* open_requests have no inode */
2591                 0,
2592                 req);
2593 }
2594 
2595 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2596 {
2597         int timer_active;
2598         unsigned long timer_expires;
2599         const struct tcp_sock *tp = tcp_sk(sk);
2600         const struct inet_connection_sock *icsk = inet_csk(sk);
2601         const struct inet_sock *inet = inet_sk(sk);
2602         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2603         __be32 dest = inet->inet_daddr;
2604         __be32 src = inet->inet_rcv_saddr;
2605         __u16 destp = ntohs(inet->inet_dport);
2606         __u16 srcp = ntohs(inet->inet_sport);
2607         int rx_queue;
2608         int state;
2609 
2610         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2611             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2612             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2613                 timer_active    = 1;
2614                 timer_expires   = icsk->icsk_timeout;
2615         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2616                 timer_active    = 4;
2617                 timer_expires   = icsk->icsk_timeout;
2618         } else if (timer_pending(&sk->sk_timer)) {
2619                 timer_active    = 2;
2620                 timer_expires   = sk->sk_timer.expires;
2621         } else {
2622                 timer_active    = 0;
2623                 timer_expires = jiffies;
2624         }
2625 
2626         state = inet_sk_state_load(sk);
2627         if (state == TCP_LISTEN)
2628                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2629         else
2630                 /* Because we don't lock the socket,
2631                  * we might find a transient negative value.
2632                  */
2633                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2634                                       READ_ONCE(tp->copied_seq), 0);
2635 
2636         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2637                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2638                 i, src, srcp, dest, destp, state,
2639                 READ_ONCE(tp->write_seq) - tp->snd_una,
2640                 rx_queue,
2641                 timer_active,
2642                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2643                 icsk->icsk_retransmits,
2644                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2645                 icsk->icsk_probes_out,
2646                 sock_i_ino(sk),
2647                 refcount_read(&sk->sk_refcnt), sk,
2648                 jiffies_to_clock_t(icsk->icsk_rto),
2649                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2650                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2651                 tp->snd_cwnd,
2652                 state == TCP_LISTEN ?
2653                     fastopenq->max_qlen :
2654                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2655 }
2656 
2657 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2658                                struct seq_file *f, int i)
2659 {
2660         long delta = tw->tw_timer.expires - jiffies;
2661         __be32 dest, src;
2662         __u16 destp, srcp;
2663 
2664         dest  = tw->tw_daddr;
2665         src   = tw->tw_rcv_saddr;
2666         destp = ntohs(tw->tw_dport);
2667         srcp  = ntohs(tw->tw_sport);
2668 
2669         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2670                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2671                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2672                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2673                 refcount_read(&tw->tw_refcnt), tw);
2674 }
2675 
2676 #define TMPSZ 150
2677 
2678 static int tcp4_seq_show(struct seq_file *seq, void *v)
2679 {
2680         struct tcp_iter_state *st;
2681         struct sock *sk = v;
2682 
2683         seq_setwidth(seq, TMPSZ - 1);
2684         if (v == SEQ_START_TOKEN) {
2685                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2686                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2687                            "inode");
2688                 goto out;
2689         }
2690         st = seq->private;
2691 
2692         if (sk->sk_state == TCP_TIME_WAIT)
2693                 get_timewait4_sock(v, seq, st->num);
2694         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2695                 get_openreq4(v, seq, st->num);
2696         else
2697                 get_tcp4_sock(v, seq, st->num);
2698 out:
2699         seq_pad(seq, '\n');
2700         return 0;
2701 }
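
     /*
      * The helpers above fill the columns announced in the header line:
      * addresses are printed as raw hexadecimal __be32 values with ports
      * converted via ntohs(), "st" is the numeric socket state, and for full
      * sockets tx_queue/rx_queue come from write_seq - snd_una and
      * rcv_nxt - copied_seq respectively.
      */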
2702 
2703 #ifdef CONFIG_BPF_SYSCALL
2704 struct bpf_iter__tcp {
2705         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2706         __bpf_md_ptr(struct sock_common *, sk_common);
2707         uid_t uid __aligned(8);
2708 };
2709 
2710 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2711                              struct sock_common *sk_common, uid_t uid)
2712 {
2713         struct bpf_iter__tcp ctx;
2714 
2715         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2716         ctx.meta = meta;
2717         ctx.sk_common = sk_common;
2718         ctx.uid = uid;
2719         return bpf_iter_run_prog(prog, &ctx);
2720 }
2721 
2722 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2723 {
2724         struct bpf_iter_meta meta;
2725         struct bpf_prog *prog;
2726         struct sock *sk = v;
2727         uid_t uid;
2728 
2729         if (v == SEQ_START_TOKEN)
2730                 return 0;
2731 
2732         if (sk->sk_state == TCP_TIME_WAIT) {
2733                 uid = 0;
2734         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2735                 const struct request_sock *req = v;
2736 
2737                 uid = from_kuid_munged(seq_user_ns(seq),
2738                                        sock_i_uid(req->rsk_listener));
2739         } else {
2740                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2741         }
2742 
2743         meta.seq = seq;
2744         prog = bpf_iter_get_info(&meta, false);
2745         return tcp_prog_seq_show(prog, &meta, v, uid);
2746 }
2747 
2748 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2749 {
2750         struct bpf_iter_meta meta;
2751         struct bpf_prog *prog;
2752 
2753         if (!v) {
2754                 meta.seq = seq;
2755                 prog = bpf_iter_get_info(&meta, true);
2756                 if (prog)
2757                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
2758         }
2759 
2760         tcp_seq_stop(seq, v);
2761 }
2762 
2763 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2764         .show           = bpf_iter_tcp_seq_show,
2765         .start          = tcp_seq_start,
2766         .next           = tcp_seq_next,
2767         .stop           = bpf_iter_tcp_seq_stop,
2768 };
2769 #endif
2770 
2771 static const struct seq_operations tcp4_seq_ops = {
2772         .show           = tcp4_seq_show,
2773         .start          = tcp_seq_start,
2774         .next           = tcp_seq_next,
2775         .stop           = tcp_seq_stop,
2776 };
2777 
2778 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2779         .family         = AF_INET,
2780 };
2781 
2782 static int __net_init tcp4_proc_init_net(struct net *net)
2783 {
2784         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2785                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2786                 return -ENOMEM;
2787         return 0;
2788 }
2789 
2790 static void __net_exit tcp4_proc_exit_net(struct net *net)
2791 {
2792         remove_proc_entry("tcp", net->proc_net);
2793 }
2794 
2795 static struct pernet_operations tcp4_net_ops = {
2796         .init = tcp4_proc_init_net,
2797         .exit = tcp4_proc_exit_net,
2798 };
2799 
2800 int __init tcp4_proc_init(void)
2801 {
2802         return register_pernet_subsys(&tcp4_net_ops);
2803 }
2804 
2805 void tcp4_proc_exit(void)
2806 {
2807         unregister_pernet_subsys(&tcp4_net_ops);
2808 }
2809 #endif /* CONFIG_PROC_FS */
2810 
2811 /* @wake is one when sk_stream_write_space() calls us.
2812  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
2813  * This mimics the strategy used in sock_def_write_space().
2814  */
2815 bool tcp_stream_memory_free(const struct sock *sk, int wake)
2816 {
2817         const struct tcp_sock *tp = tcp_sk(sk);
2818         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
2819                             READ_ONCE(tp->snd_nxt);
2820 
2821         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
2822 }
2823 EXPORT_SYMBOL(tcp_stream_memory_free);
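
     /*
      * Worked example with made-up numbers: if tcp_notsent_lowat() is 128 KB
      * and 60 KB are not yet sent, the plain check (wake == 0) passes since
      * 60 KB < 128 KB, and the wake path (wake == 1) passes too since
      * 60 KB << 1 = 120 KB < 128 KB.  With 70 KB unsent the wake path would
      * not report the stream as writable (140 KB >= 128 KB).
      */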
2824 
2825 struct proto tcp_prot = {
2826         .name                   = "TCP",
2827         .owner                  = THIS_MODULE,
2828         .close                  = tcp_close,
2829         .pre_connect            = tcp_v4_pre_connect,
2830         .connect                = tcp_v4_connect,
2831         .disconnect             = tcp_disconnect,
2832         .accept                 = inet_csk_accept,
2833         .ioctl                  = tcp_ioctl,
2834         .init                   = tcp_v4_init_sock,
2835         .destroy                = tcp_v4_destroy_sock,
2836         .shutdown               = tcp_shutdown,
2837         .setsockopt             = tcp_setsockopt,
2838         .getsockopt             = tcp_getsockopt,
2839         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
2840         .keepalive              = tcp_set_keepalive,
2841         .recvmsg                = tcp_recvmsg,
2842         .sendmsg                = tcp_sendmsg,
2843         .sendpage               = tcp_sendpage,
2844         .backlog_rcv            = tcp_v4_do_rcv,
2845         .release_cb             = tcp_release_cb,
2846         .hash                   = inet_hash,
2847         .unhash                 = inet_unhash,
2848         .get_port               = inet_csk_get_port,
2849 #ifdef CONFIG_BPF_SYSCALL
2850         .psock_update_sk_prot   = tcp_bpf_update_proto,
2851 #endif
2852         .enter_memory_pressure  = tcp_enter_memory_pressure,
2853         .leave_memory_pressure  = tcp_leave_memory_pressure,
2854         .stream_memory_free     = tcp_stream_memory_free,
2855         .sockets_allocated      = &tcp_sockets_allocated,
2856         .orphan_count           = &tcp_orphan_count,
2857         .memory_allocated       = &tcp_memory_allocated,
2858         .memory_pressure        = &tcp_memory_pressure,
2859         .sysctl_mem             = sysctl_tcp_mem,
2860         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2861         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2862         .max_header             = MAX_TCP_HEADER,
2863         .obj_size               = sizeof(struct tcp_sock),
2864         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2865         .twsk_prot              = &tcp_timewait_sock_ops,
2866         .rsk_prot               = &tcp_request_sock_ops,
2867         .h.hashinfo             = &tcp_hashinfo,
2868         .no_autobind            = true,
2869         .diag_destroy           = tcp_abort,
2870 };
2871 EXPORT_SYMBOL(tcp_prot);
2872 
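/* Per-netns exit: drop the reference on the namespace's congestion control
 * module and destroy the per-CPU control sockets created by tcp_sk_init().
 */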
2873 static void __net_exit tcp_sk_exit(struct net *net)
2874 {
2875         int cpu;
2876 
2877         if (net->ipv4.tcp_congestion_control)
2878                 bpf_module_put(net->ipv4.tcp_congestion_control,
2879                                net->ipv4.tcp_congestion_control->owner);
2880 
2881         for_each_possible_cpu(cpu)
2882                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2883         free_percpu(net->ipv4.tcp_sk);
2884 }
2885 
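/* Per-netns init: create one kernel control socket per possible CPU (used
 * to send RSTs and ACKs on behalf of connections that have no full socket)
 * and set the namespace's TCP sysctl defaults.
 */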
2886 static int __net_init tcp_sk_init(struct net *net)
2887 {
2888         int res, cpu, cnt;
2889 
2890         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2891         if (!net->ipv4.tcp_sk)
2892                 return -ENOMEM;
2893 
2894         for_each_possible_cpu(cpu) {
2895                 struct sock *sk;
2896 
2897                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2898                                            IPPROTO_TCP, net);
2899                 if (res)
2900                         goto fail;
2901                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2902 
2903                 /* Enforce IP_DF and a zero IP ID on the RSTs and ACKs
2904                  * sent from SYN-RECV and TIME-WAIT state.
2905                  */
2906                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2907 
2908                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2909         }
2910 
2911         net->ipv4.sysctl_tcp_ecn = 2;
2912         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2913 
2914         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2915         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2916         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2917         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2918         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2919 
2920         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2921         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2922         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2923 
2924         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2925         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2926         net->ipv4.sysctl_tcp_syncookies = 1;
2927         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2928         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2929         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2930         net->ipv4.sysctl_tcp_orphan_retries = 0;
2931         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2932         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2933         net->ipv4.sysctl_tcp_tw_reuse = 2;
2934         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2935 
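        /* Scale the TIME-WAIT bucket and SYN backlog limits with the size of
         * the established-connections hash table (ehash_mask + 1 slots).
         */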
2936         cnt = tcp_hashinfo.ehash_mask + 1;
2937         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2938         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2939 
2940         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2941         net->ipv4.sysctl_tcp_sack = 1;
2942         net->ipv4.sysctl_tcp_window_scaling = 1;
2943         net->ipv4.sysctl_tcp_timestamps = 1;
2944         net->ipv4.sysctl_tcp_early_retrans = 3;
2945         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2946         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2947         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2948         net->ipv4.sysctl_tcp_max_reordering = 300;
2949         net->ipv4.sysctl_tcp_dsack = 1;
2950         net->ipv4.sysctl_tcp_app_win = 31;
2951         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2952         net->ipv4.sysctl_tcp_frto = 2;
2953         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2954         /* Limit the fraction of the congestion window that a single TSO
2955          * frame may consume.  Building TSO frames that are too large
2956          * can make TCP streams bursty.
2957          */
2958         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2959         /* Default TSQ limit of 16 TSO segments */
2960         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2961         /* RFC 5961 challenge ACK rate limiting */
2962         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2963         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2964         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2965         net->ipv4.sysctl_tcp_autocorking = 1;
2966         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2967         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2968         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2969         if (net != &init_net) {
2970                 memcpy(net->ipv4.sysctl_tcp_rmem,
2971                        init_net.ipv4.sysctl_tcp_rmem,
2972                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2973                 memcpy(net->ipv4.sysctl_tcp_wmem,
2974                        init_net.ipv4.sysctl_tcp_wmem,
2975                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2976         }
2977         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2978         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2979         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2980         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2981         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2982         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
2983         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2984 
2985         /* Reno is always built in */
2986         if (!net_eq(net, &init_net) &&
2987             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2988                                init_net.ipv4.tcp_congestion_control->owner))
2989                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2990         else
2991                 net->ipv4.tcp_congestion_control = &tcp_reno;
2992 
2993         return 0;
2994 fail:
2995         tcp_sk_exit(net);
2996 
2997         return res;
2998 }
2999 
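/* Batched per-netns exit: purge IPv4 TIME-WAIT sockets once for the whole
 * batch, then destroy each exiting namespace's TCP Fast Open context.
 */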
3000 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3001 {
3002         struct net *net;
3003 
3004         inet_twsk_purge(&tcp_hashinfo, AF_INET);
3005 
3006         list_for_each_entry(net, net_exit_list, exit_list)
3007                 tcp_fastopen_ctx_destroy(net);
3008 }
3009 
3010 static struct pernet_operations __net_initdata tcp_sk_ops = {
3011        .init       = tcp_sk_init,
3012        .exit       = tcp_sk_exit,
3013        .exit_batch = tcp_sk_exit_batch,
3014 };
3015 
3016 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3017 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3018                      struct sock_common *sk_common, uid_t uid)
3019 
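/* Per-iterator-instance setup: allocate a tcp_seq_afinfo with AF_UNSPEC so
 * the BPF iterator walks both IPv4 and IPv6 sockets, then fall through to
 * the generic per-netns seq_file initialisation.
 */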
3020 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3021 {
3022         struct tcp_iter_state *st = priv_data;
3023         struct tcp_seq_afinfo *afinfo;
3024         int ret;
3025 
3026         afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
3027         if (!afinfo)
3028                 return -ENOMEM;
3029 
3030         afinfo->family = AF_UNSPEC;
3031         st->bpf_seq_afinfo = afinfo;
3032         ret = bpf_iter_init_seq_net(priv_data, aux);
3033         if (ret)
3034                 kfree(afinfo);
3035         return ret;
3036 }
3037 
3038 static void bpf_iter_fini_tcp(void *priv_data)
3039 {
3040         struct tcp_iter_state *st = priv_data;
3041 
3042         kfree(st->bpf_seq_afinfo);
3043         bpf_iter_fini_seq_net(priv_data);
3044 }
3045 
3046 static const struct bpf_iter_seq_info tcp_seq_info = {
3047         .seq_ops                = &bpf_iter_tcp_seq_ops,
3048         .init_seq_private       = bpf_iter_init_tcp,
3049         .fini_seq_private       = bpf_iter_fini_tcp,
3050         .seq_priv_size          = sizeof(struct tcp_iter_state),
3051 };
3052 
3053 static struct bpf_iter_reg tcp_reg_info = {
3054         .target                 = "tcp",
3055         .ctx_arg_info_size      = 1,
3056         .ctx_arg_info           = {
3057                 { offsetof(struct bpf_iter__tcp, sk_common),
3058                   PTR_TO_BTF_ID_OR_NULL },
3059         },
3060         .seq_info               = &tcp_seq_info,
3061 };
3062 
3063 static void __init bpf_iter_register(void)
3064 {
3065         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3066         if (bpf_iter_reg_target(&tcp_reg_info))
3067                 pr_warn("Warning: could not register bpf iterator tcp\n");
3068 }
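/* Usage sketch (assumption: a libbpf-style program outside this file, not
 * part of the kernel source): a BPF program attached to the "tcp" target
 * registered above receives a struct bpf_iter__tcp context whose fields
 * mirror the DEFINE_BPF_ITER_FUNC() arguments (meta, sk_common, uid), e.g.:
 *
 *      SEC("iter/tcp")
 *      int dump_tcp(struct bpf_iter__tcp *ctx)
 *      {
 *              struct sock_common *skc = ctx->sk_common;
 *
 *              if (!skc)       // end of iteration, see bpf_iter_tcp_seq_stop()
 *                      return 0;
 *              BPF_SEQ_PRINTF(ctx->meta->seq, "family=%u uid=%u\n",
 *                             skc->skc_family, ctx->uid);
 *              return 0;
 *      }
 */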
3069 
3070 #endif
3071 
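/* Late-boot initialisation: registering tcp_sk_ops runs tcp_sk_init() for
 * the initial network namespace and for every namespace created later.
 * Failure is fatal because TCP cannot operate without its control sockets.
 */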
3072 void __init tcp_v4_init(void)
3073 {
3074         if (register_pernet_subsys(&tcp_sk_ops))
3075                 panic("Failed to create the TCP control socket.\n");
3076 
3077 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3078         bpf_iter_register();
3079 #endif
3080 }
3081 
