TOMOYO Linux Cross Reference
Linux/net/ipv4/tcp_ipv4.c

  1 /*
  2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  3  *              operating system.  INET is implemented using the  BSD Socket
  4  *              interface as the means of communication with the user level.
  5  *
  6  *              Implementation of the Transmission Control Protocol(TCP).
  7  *
  8  *              IPv4 specific functions
  9  *
 10  *
 11  *              code split from:
 12  *              linux/ipv4/tcp.c
 13  *              linux/ipv4/tcp_input.c
 14  *              linux/ipv4/tcp_output.c
 15  *
 16  *              See tcp.c for author information
 17  *
 18  *      This program is free software; you can redistribute it and/or
 19  *      modify it under the terms of the GNU General Public License
 20  *      as published by the Free Software Foundation; either version
 21  *      2 of the License, or (at your option) any later version.
 22  */
 23 
 24 /*
 25  * Changes:
 26  *              David S. Miller :       New socket lookup architecture.
 27  *                                      This code is dedicated to John Dyson.
 28  *              David S. Miller :       Change semantics of established hash,
 29  *                                      half is devoted to TIME_WAIT sockets
 30  *                                      and the rest go in the other half.
 31  *              Andi Kleen :            Add support for syncookies and fixed
 32  *                                      some bugs: ip options weren't passed to
 33  *                                      the TCP layer, missed a check for an
 34  *                                      ACK bit.
 35  *              Andi Kleen :            Implemented fast path mtu discovery.
 36  *                                      Fixed many serious bugs in the
 37  *                                      request_sock handling and moved
 38  *                                      most of it into the af independent code.
 39  *                                      Added tail drop and some other bugfixes.
 40  *                                      Added new listen semantics.
 41  *              Mike McLagan    :       Routing by source
 42  *      Juan Jose Ciarlante:            ip_dynaddr bits
 43  *              Andi Kleen:             various fixes.
 44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 45  *                                      coma.
 46  *      Andi Kleen              :       Fix new listen.
 47  *      Andi Kleen              :       Fix accept error reporting.
 48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
 50  *                                      a single port at the same time.
 51  */
 52 
 53 #define pr_fmt(fmt) "TCP: " fmt
 54 
 55 #include <linux/bottom_half.h>
 56 #include <linux/types.h>
 57 #include <linux/fcntl.h>
 58 #include <linux/module.h>
 59 #include <linux/random.h>
 60 #include <linux/cache.h>
 61 #include <linux/jhash.h>
 62 #include <linux/init.h>
 63 #include <linux/times.h>
 64 #include <linux/slab.h>
 65 
 66 #include <net/net_namespace.h>
 67 #include <net/icmp.h>
 68 #include <net/inet_hashtables.h>
 69 #include <net/tcp.h>
 70 #include <net/transp_v6.h>
 71 #include <net/ipv6.h>
 72 #include <net/inet_common.h>
 73 #include <net/timewait_sock.h>
 74 #include <net/xfrm.h>
 75 #include <net/secure_seq.h>
 76 #include <net/busy_poll.h>
 77 
 78 #include <linux/inet.h>
 79 #include <linux/ipv6.h>
 80 #include <linux/stddef.h>
 81 #include <linux/proc_fs.h>
 82 #include <linux/seq_file.h>
 83 
 84 #include <crypto/hash.h>
 85 #include <linux/scatterlist.h>
 86 
 87 int sysctl_tcp_tw_reuse __read_mostly;
 88 int sysctl_tcp_low_latency __read_mostly;
 89 EXPORT_SYMBOL(sysctl_tcp_low_latency);
 90 
 91 #ifdef CONFIG_TCP_MD5SIG
 92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
 93                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
 94 #endif
 95 
 96 struct inet_hashinfo tcp_hashinfo;
 97 EXPORT_SYMBOL(tcp_hashinfo);
 98 
 99 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
100 {
101         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
102                                           ip_hdr(skb)->saddr,
103                                           tcp_hdr(skb)->dest,
104                                           tcp_hdr(skb)->source);
105 }
106 
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110         struct tcp_sock *tp = tcp_sk(sk);
111 
112         /* With PAWS, it is safe from the viewpoint
113            of data integrity. Even without PAWS it is safe provided sequence
114            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
115 
116            Actually, the idea is close to VJ's one, only timestamp cache is
117            held not per host, but per port pair and TW bucket is used as state
118            holder.
119 
120            If TW bucket has been already destroyed we fall back to VJ's scheme
121            and use initial timestamp retrieved from peer table.
122          */
123         if (tcptw->tw_ts_recent_stamp &&
124             (!twp || (sysctl_tcp_tw_reuse &&
125                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
126                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
127                 if (tp->write_seq == 0)
128                         tp->write_seq = 1;
129                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
130                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
131                 sock_hold(sktw);
132                 return 1;
133         }
134 
135         return 0;
136 }
137 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
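    /* In practice tcp_twsk_unique() is what lets a new outgoing connection
     * take over a four-tuple that is still in TIME-WAIT: reuse requires that
     * the TIME-WAIT socket recorded a timestamp, and then either the caller
     * passed twp == NULL or sysctl_tcp_tw_reuse (net.ipv4.tcp_tw_reuse) is
     * set and that timestamp is at least a second old, so PAWS can still
     * reject stray segments from the old incarnation. write_seq is pushed
     * past tw_snd_nxt so the new sequence space does not overlap the old one.
     */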
138 
139 /* This will initiate an outgoing connection. */
140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
141 {
142         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
143         struct inet_sock *inet = inet_sk(sk);
144         struct tcp_sock *tp = tcp_sk(sk);
145         __be16 orig_sport, orig_dport;
146         __be32 daddr, nexthop;
147         struct flowi4 *fl4;
148         struct rtable *rt;
149         int err;
150         struct ip_options_rcu *inet_opt;
151 
152         if (addr_len < sizeof(struct sockaddr_in))
153                 return -EINVAL;
154 
155         if (usin->sin_family != AF_INET)
156                 return -EAFNOSUPPORT;
157 
158         nexthop = daddr = usin->sin_addr.s_addr;
159         inet_opt = rcu_dereference_protected(inet->inet_opt,
160                                              lockdep_sock_is_held(sk));
161         if (inet_opt && inet_opt->opt.srr) {
162                 if (!daddr)
163                         return -EINVAL;
164                 nexthop = inet_opt->opt.faddr;
165         }
166 
167         orig_sport = inet->inet_sport;
168         orig_dport = usin->sin_port;
169         fl4 = &inet->cork.fl.u.ip4;
170         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172                               IPPROTO_TCP,
173                               orig_sport, orig_dport, sk);
174         if (IS_ERR(rt)) {
175                 err = PTR_ERR(rt);
176                 if (err == -ENETUNREACH)
177                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178                 return err;
179         }
180 
181         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182                 ip_rt_put(rt);
183                 return -ENETUNREACH;
184         }
185 
186         if (!inet_opt || !inet_opt->opt.srr)
187                 daddr = fl4->daddr;
188 
189         if (!inet->inet_saddr)
190                 inet->inet_saddr = fl4->saddr;
191         sk_rcv_saddr_set(sk, inet->inet_saddr);
192 
193         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194                 /* Reset inherited state */
195                 tp->rx_opt.ts_recent       = 0;
196                 tp->rx_opt.ts_recent_stamp = 0;
197                 if (likely(!tp->repair))
198                         tp->write_seq      = 0;
199         }
200 
201         if (tcp_death_row.sysctl_tw_recycle &&
202             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203                 tcp_fetch_timewait_stamp(sk, &rt->dst);
204 
205         inet->inet_dport = usin->sin_port;
206         sk_daddr_set(sk, daddr);
207 
208         inet_csk(sk)->icsk_ext_hdr_len = 0;
209         if (inet_opt)
210                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
211 
212         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
213 
214         /* Socket identity is still unknown (sport may be zero).
215          * However, we set the state to SYN-SENT and, without releasing the
216          * socket lock, select a source port, enter ourselves into the hash
217          * tables and complete initialization after this.
218          */
219         tcp_set_state(sk, TCP_SYN_SENT);
220         err = inet_hash_connect(&tcp_death_row, sk);
221         if (err)
222                 goto failure;
223 
224         sk_set_txhash(sk);
225 
226         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227                                inet->inet_sport, inet->inet_dport, sk);
228         if (IS_ERR(rt)) {
229                 err = PTR_ERR(rt);
230                 rt = NULL;
231                 goto failure;
232         }
233         /* OK, now commit destination to socket.  */
234         sk->sk_gso_type = SKB_GSO_TCPV4;
235         sk_setup_caps(sk, &rt->dst);
236 
237         if (!tp->write_seq && likely(!tp->repair))
238                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239                                                            inet->inet_daddr,
240                                                            inet->inet_sport,
241                                                            usin->sin_port);
242 
243         inet->inet_id = tp->write_seq ^ jiffies;
244 
245         err = tcp_connect(sk);
246 
247         rt = NULL;
248         if (err)
249                 goto failure;
250 
251         return 0;
252 
253 failure:
254         /*
255          * This unhashes the socket and releases the local port,
256          * if necessary.
257          */
258         tcp_set_state(sk, TCP_CLOSE);
259         ip_rt_put(rt);
260         sk->sk_route_caps = 0;
261         inet->inet_dport = 0;
262         return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
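    /* tcp_v4_connect() is reached from the connect(2) system call on an
     * AF_INET stream socket, via inet_stream_connect() and the protocol's
     * sk->sk_prot->connect hook. A minimal userspace sketch of a call that
     * ends up here, assuming a reachable peer at 192.0.2.1:80:
     *
     *	int fd = socket(AF_INET, SOCK_STREAM, 0);
     *	struct sockaddr_in sin = {
     *		.sin_family = AF_INET,
     *		.sin_port   = htons(80),
     *	};
     *
     *	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
     *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
     *
     * The addr_len and sin_family checks at the top of the function are what
     * reject anything shorter than a sockaddr_in or not AF_INET.
     */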
265 
266 /*
267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268  * It can be called through tcp_release_cb() if socket was owned by user
269  * at the time tcp_v4_err() was called to handle ICMP message.
270  */
271 void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273         struct dst_entry *dst;
274         struct inet_sock *inet = inet_sk(sk);
275         u32 mtu = tcp_sk(sk)->mtu_info;
276 
277         dst = inet_csk_update_pmtu(sk, mtu);
278         if (!dst)
279                 return;
280 
281         /* Something is about to go wrong... Remember the soft error
282          * in case this connection is not able to recover.
283          */
284         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285                 sk->sk_err_soft = EMSGSIZE;
286 
287         mtu = dst_mtu(dst);
288 
289         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290             ip_sk_accept_pmtu(sk) &&
291             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
292                 tcp_sync_mss(sk, mtu);
293 
294                 /* Resend the TCP packet because it's
295                  * clear that the old packet has been
296                  * dropped. This is the new "fast" path mtu
297                  * discovery.
298                  */
299                 tcp_simple_retransmit(sk);
300         } /* else let the usual retransmit timer handle it */
301 }
302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
303 
304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
305 {
306         struct dst_entry *dst = __sk_dst_check(sk, 0);
307 
308         if (dst)
309                 dst->ops->redirect(dst, sk, skb);
310 }
311 
312 
313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
314 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
315 {
316         struct request_sock *req = inet_reqsk(sk);
317         struct net *net = sock_net(sk);
318 
319         /* ICMPs are not backlogged, hence we cannot get
320          * an established socket here.
321          */
322         if (seq != tcp_rsk(req)->snt_isn) {
323                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
324         } else if (abort) {
325                 /*
326                  * Still in SYN_RECV, just remove it silently.
327                  * There is no good way to pass the error to the newly
328                  * created socket, and POSIX does not want network
329                  * errors returned from accept().
330                  */
331                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
332                 tcp_listendrop(req->rsk_listener);
333         }
334         reqsk_put(req);
335 }
336 EXPORT_SYMBOL(tcp_req_err);
337 
338 /*
339  * This routine is called by the ICMP module when it gets some
340  * sort of error condition.  If err < 0 then the socket should
341  * be closed and the error returned to the user.  If err > 0
342  * it's just the icmp type << 8 | icmp code.  After adjustment
343  * header points to the first 8 bytes of the tcp header.  We need
344  * to find the appropriate port.
345  *
346  * The locking strategy used here is very "optimistic". When
347  * someone else accesses the socket the ICMP is just dropped
348  * and for some paths there is no check at all.
349  * A more general error queue to queue errors for later handling
350  * is probably better.
351  *
352  */
353 
354 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
355 {
356         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
357         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
358         struct inet_connection_sock *icsk;
359         struct tcp_sock *tp;
360         struct inet_sock *inet;
361         const int type = icmp_hdr(icmp_skb)->type;
362         const int code = icmp_hdr(icmp_skb)->code;
363         struct sock *sk;
364         struct sk_buff *skb;
365         struct request_sock *fastopen;
366         __u32 seq, snd_una;
367         __u32 remaining;
368         int err;
369         struct net *net = dev_net(icmp_skb->dev);
370 
371         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
372                                        th->dest, iph->saddr, ntohs(th->source),
373                                        inet_iif(icmp_skb));
374         if (!sk) {
375                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
376                 return;
377         }
378         if (sk->sk_state == TCP_TIME_WAIT) {
379                 inet_twsk_put(inet_twsk(sk));
380                 return;
381         }
382         seq = ntohl(th->seq);
383         if (sk->sk_state == TCP_NEW_SYN_RECV)
384                 return tcp_req_err(sk, seq,
385                                   type == ICMP_PARAMETERPROB ||
386                                   type == ICMP_TIME_EXCEEDED ||
387                                   (type == ICMP_DEST_UNREACH &&
388                                    (code == ICMP_NET_UNREACH ||
389                                     code == ICMP_HOST_UNREACH)));
390 
391         bh_lock_sock(sk);
392         /* If too many ICMPs get dropped on busy
393          * servers this needs to be solved differently.
394          * We do take care of the PMTU discovery (RFC1191) special case:
395          * we can receive locally generated ICMP messages while the socket is held.
396          */
397         if (sock_owned_by_user(sk)) {
398                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
399                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
400         }
401         if (sk->sk_state == TCP_CLOSE)
402                 goto out;
403 
404         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
405                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
406                 goto out;
407         }
408 
409         icsk = inet_csk(sk);
410         tp = tcp_sk(sk);
411         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
412         fastopen = tp->fastopen_rsk;
413         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
414         if (sk->sk_state != TCP_LISTEN &&
415             !between(seq, snd_una, tp->snd_nxt)) {
416                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
417                 goto out;
418         }
419 
420         switch (type) {
421         case ICMP_REDIRECT:
422                 do_redirect(icmp_skb, sk);
423                 goto out;
424         case ICMP_SOURCE_QUENCH:
425                 /* Just silently ignore these. */
426                 goto out;
427         case ICMP_PARAMETERPROB:
428                 err = EPROTO;
429                 break;
430         case ICMP_DEST_UNREACH:
431                 if (code > NR_ICMP_UNREACH)
432                         goto out;
433 
434                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
435                         /* We are not interested in TCP_LISTEN and open_requests
436                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
437                          * they should go through unfragmented).
438                          */
439                         if (sk->sk_state == TCP_LISTEN)
440                                 goto out;
441 
442                         tp->mtu_info = info;
443                         if (!sock_owned_by_user(sk)) {
444                                 tcp_v4_mtu_reduced(sk);
445                         } else {
446                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
447                                         sock_hold(sk);
448                         }
449                         goto out;
450                 }
451 
452                 err = icmp_err_convert[code].errno;
453                 /* check if icmp_skb allows revert of backoff
454                  * (see draft-zimmermann-tcp-lcd) */
455                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
456                         break;
457                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
458                     !icsk->icsk_backoff || fastopen)
459                         break;
460 
461                 if (sock_owned_by_user(sk))
462                         break;
463 
464                 icsk->icsk_backoff--;
465                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
466                                                TCP_TIMEOUT_INIT;
467                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
468 
469                 skb = tcp_write_queue_head(sk);
470                 BUG_ON(!skb);
471 
472                 remaining = icsk->icsk_rto -
473                             min(icsk->icsk_rto,
474                                 tcp_time_stamp - tcp_skb_timestamp(skb));
475 
476                 if (remaining) {
477                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
478                                                   remaining, TCP_RTO_MAX);
479                 } else {
480                         /* RTO revert clocked out retransmission.
481                          * Will retransmit now */
482                         tcp_retransmit_timer(sk);
483                 }
484 
485                 break;
486         case ICMP_TIME_EXCEEDED:
487                 err = EHOSTUNREACH;
488                 break;
489         default:
490                 goto out;
491         }
492 
493         switch (sk->sk_state) {
494         case TCP_SYN_SENT:
495         case TCP_SYN_RECV:
496                 /* Only in fast or simultaneous open. If a fast open socket
497                  * is already accepted it is treated as a connected one below.
498                  */
499                 if (fastopen && !fastopen->sk)
500                         break;
501 
502                 if (!sock_owned_by_user(sk)) {
503                         sk->sk_err = err;
504 
505                         sk->sk_error_report(sk);
506 
507                         tcp_done(sk);
508                 } else {
509                         sk->sk_err_soft = err;
510                 }
511                 goto out;
512         }
513 
514         /* If we've already connected we will keep trying
515          * until we time out, or the user gives up.
516          *
517          * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
518          * to be considered hard errors (well, FRAG_FAILED too,
519          * but it is obsoleted by pmtu discovery).
520          *
521          * Note that in the modern internet, where routing is unreliable
522          * and broken firewalls sit in every dark corner, sending random
523          * errors ordered by their masters, even these two messages finally lose
524          * their original sense (even Linux sends invalid PORT_UNREACHs).
525          *
526          * Now we are in compliance with RFCs.
527          *                                                      --ANK (980905)
528          */
529 
530         inet = inet_sk(sk);
531         if (!sock_owned_by_user(sk) && inet->recverr) {
532                 sk->sk_err = err;
533                 sk->sk_error_report(sk);
534         } else  { /* Only an error on timeout */
535                 sk->sk_err_soft = err;
536         }
537 
538 out:
539         bh_unlock_sock(sk);
540         sock_put(sk);
541 }
542 
543 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
544 {
545         struct tcphdr *th = tcp_hdr(skb);
546 
547         if (skb->ip_summed == CHECKSUM_PARTIAL) {
548                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
549                 skb->csum_start = skb_transport_header(skb) - skb->head;
550                 skb->csum_offset = offsetof(struct tcphdr, check);
551         } else {
552                 th->check = tcp_v4_check(skb->len, saddr, daddr,
553                                          csum_partial(th,
554                                                       th->doff << 2,
555                                                       skb->csum));
556         }
557 }
558 
559 /* This routine computes an IPv4 TCP checksum. */
560 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
561 {
562         const struct inet_sock *inet = inet_sk(sk);
563 
564         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
565 }
566 EXPORT_SYMBOL(tcp_v4_send_check);
567 
568 /*
569  *      This routine will send an RST to the other tcp.
570  *
571  *      Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
572  *                    for the reset?
573  *      Answer: if a packet caused an RST, it is not for a socket
574  *              existing in our system; if it is matched to a socket,
575  *              it is just a duplicate segment or a bug in the other side's TCP.
576  *              So we build the reply based only on parameters
577  *              that arrived with the segment.
578  *      Exception: precedence violation. We do not implement it in any case.
579  */
580 
581 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
582 {
583         const struct tcphdr *th = tcp_hdr(skb);
584         struct {
585                 struct tcphdr th;
586 #ifdef CONFIG_TCP_MD5SIG
587                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
588 #endif
589         } rep;
590         struct ip_reply_arg arg;
591 #ifdef CONFIG_TCP_MD5SIG
592         struct tcp_md5sig_key *key = NULL;
593         const __u8 *hash_location = NULL;
594         unsigned char newhash[16];
595         int genhash;
596         struct sock *sk1 = NULL;
597 #endif
598         struct net *net;
599 
600         /* Never send a reset in response to a reset. */
601         if (th->rst)
602                 return;
603 
604         /* If sk is not NULL, it means we did a successful lookup and the
605          * incoming route had to be correct. prequeue might have dropped our dst.
606          */
607         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
608                 return;
609 
610         /* Swap the send and the receive. */
611         memset(&rep, 0, sizeof(rep));
612         rep.th.dest   = th->source;
613         rep.th.source = th->dest;
614         rep.th.doff   = sizeof(struct tcphdr) / 4;
615         rep.th.rst    = 1;
616 
617         if (th->ack) {
618                 rep.th.seq = th->ack_seq;
619         } else {
620                 rep.th.ack = 1;
621                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
622                                        skb->len - (th->doff << 2));
623         }
624 
625         memset(&arg, 0, sizeof(arg));
626         arg.iov[0].iov_base = (unsigned char *)&rep;
627         arg.iov[0].iov_len  = sizeof(rep.th);
628 
629         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
630 #ifdef CONFIG_TCP_MD5SIG
631         rcu_read_lock();
632         hash_location = tcp_parse_md5sig_option(th);
633         if (sk && sk_fullsock(sk)) {
634                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
635                                         &ip_hdr(skb)->saddr, AF_INET);
636         } else if (hash_location) {
637                 /*
638                  * The active side is lost. Try to find the listening socket through
639                  * the source port, and then find the md5 key through the listening socket.
640                  * We do not lose security here:
641                  * the incoming packet is checked against the md5 hash of the found key,
642                  * and no RST is generated if the md5 hash doesn't match.
643                  */
644                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
645                                              ip_hdr(skb)->saddr,
646                                              th->source, ip_hdr(skb)->daddr,
647                                              ntohs(th->source), inet_iif(skb));
648                 /* don't send rst if it can't find key */
649                 if (!sk1)
650                         goto out;
651 
652                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
653                                         &ip_hdr(skb)->saddr, AF_INET);
654                 if (!key)
655                         goto out;
656 
657 
658                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
659                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
660                         goto out;
661 
662         }
663 
664         if (key) {
665                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
666                                    (TCPOPT_NOP << 16) |
667                                    (TCPOPT_MD5SIG << 8) |
668                                    TCPOLEN_MD5SIG);
669                 /* Update length and the length the header thinks exists */
670                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
671                 rep.th.doff = arg.iov[0].iov_len / 4;
672 
673                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
674                                      key, ip_hdr(skb)->saddr,
675                                      ip_hdr(skb)->daddr, &rep.th);
676         }
677 #endif
678         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
679                                       ip_hdr(skb)->saddr, /* XXX */
680                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
681         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
682         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
683 
684         /* When the socket is gone, all binding information is lost, and
685          * routing might fail in this case. No choice here: if we choose to force
686          * the input interface, we will misroute in the case of an asymmetric route.
687          */
688         if (sk)
689                 arg.bound_dev_if = sk->sk_bound_dev_if;
690 
691         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
692                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
693 
694         arg.tos = ip_hdr(skb)->tos;
695         local_bh_disable();
696         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
697                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
698                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
699                               &arg, arg.iov[0].iov_len);
700 
701         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
702         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
703         local_bh_enable();
704 
705 #ifdef CONFIG_TCP_MD5SIG
706 out:
707         rcu_read_unlock();
708 #endif
709 }
710 
711 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
712    outside socket context, is certainly ugly. What can I do?
713  */
714 
715 static void tcp_v4_send_ack(struct net *net,
716                             struct sk_buff *skb, u32 seq, u32 ack,
717                             u32 win, u32 tsval, u32 tsecr, int oif,
718                             struct tcp_md5sig_key *key,
719                             int reply_flags, u8 tos)
720 {
721         const struct tcphdr *th = tcp_hdr(skb);
722         struct {
723                 struct tcphdr th;
724                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
725 #ifdef CONFIG_TCP_MD5SIG
726                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
727 #endif
728                         ];
729         } rep;
730         struct ip_reply_arg arg;
731 
732         memset(&rep.th, 0, sizeof(struct tcphdr));
733         memset(&arg, 0, sizeof(arg));
734 
735         arg.iov[0].iov_base = (unsigned char *)&rep;
736         arg.iov[0].iov_len  = sizeof(rep.th);
737         if (tsecr) {
738                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
739                                    (TCPOPT_TIMESTAMP << 8) |
740                                    TCPOLEN_TIMESTAMP);
741                 rep.opt[1] = htonl(tsval);
742                 rep.opt[2] = htonl(tsecr);
743                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
744         }
745 
746         /* Swap the send and the receive. */
747         rep.th.dest    = th->source;
748         rep.th.source  = th->dest;
749         rep.th.doff    = arg.iov[0].iov_len / 4;
750         rep.th.seq     = htonl(seq);
751         rep.th.ack_seq = htonl(ack);
752         rep.th.ack     = 1;
753         rep.th.window  = htons(win);
754 
755 #ifdef CONFIG_TCP_MD5SIG
756         if (key) {
757                 int offset = (tsecr) ? 3 : 0;
758 
759                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
760                                           (TCPOPT_NOP << 16) |
761                                           (TCPOPT_MD5SIG << 8) |
762                                           TCPOLEN_MD5SIG);
763                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
764                 rep.th.doff = arg.iov[0].iov_len/4;
765 
766                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
767                                     key, ip_hdr(skb)->saddr,
768                                     ip_hdr(skb)->daddr, &rep.th);
769         }
770 #endif
771         arg.flags = reply_flags;
772         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
773                                       ip_hdr(skb)->saddr, /* XXX */
774                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
775         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
776         if (oif)
777                 arg.bound_dev_if = oif;
778         arg.tos = tos;
779         local_bh_disable();
780         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
781                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
782                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
783                               &arg, arg.iov[0].iov_len);
784 
785         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
786         local_bh_enable();
787 }
788 
789 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
790 {
791         struct inet_timewait_sock *tw = inet_twsk(sk);
792         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
793 
794         tcp_v4_send_ack(sock_net(sk), skb,
795                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
796                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
797                         tcp_time_stamp + tcptw->tw_ts_offset,
798                         tcptw->tw_ts_recent,
799                         tw->tw_bound_dev_if,
800                         tcp_twsk_md5_key(tcptw),
801                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
802                         tw->tw_tos
803                         );
804 
805         inet_twsk_put(tw);
806 }
807 
808 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
809                                   struct request_sock *req)
810 {
811         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
812          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
813          */
814         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
815                                              tcp_sk(sk)->snd_nxt;
816 
817         /* RFC 7323 2.3
818          * The window field (SEG.WND) of every outgoing segment, with the
819          * exception of <SYN> segments, MUST be right-shifted by
820          * Rcv.Wind.Shift bits:
821          */
822         tcp_v4_send_ack(sock_net(sk), skb, seq,
823                         tcp_rsk(req)->rcv_nxt,
824                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
825                         tcp_time_stamp,
826                         req->ts_recent,
827                         0,
828                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
829                                           AF_INET),
830                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
831                         ip_hdr(skb)->tos);
832 }
833 
834 /*
835  *      Send a SYN-ACK after having received a SYN.
836  *      This still operates on a request_sock only, not on a big
837  *      socket.
838  */
839 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
840                               struct flowi *fl,
841                               struct request_sock *req,
842                               struct tcp_fastopen_cookie *foc,
843                               enum tcp_synack_type synack_type)
844 {
845         const struct inet_request_sock *ireq = inet_rsk(req);
846         struct flowi4 fl4;
847         int err = -1;
848         struct sk_buff *skb;
849 
850         /* First, grab a route. */
851         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
852                 return -1;
853 
854         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
855 
856         if (skb) {
857                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
858 
859                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
860                                             ireq->ir_rmt_addr,
861                                             ireq->opt);
862                 err = net_xmit_eval(err);
863         }
864 
865         return err;
866 }
867 
868 /*
869  *      IPv4 request_sock destructor.
870  */
871 static void tcp_v4_reqsk_destructor(struct request_sock *req)
872 {
873         kfree(inet_rsk(req)->opt);
874 }
875 
876 #ifdef CONFIG_TCP_MD5SIG
877 /*
878  * RFC2385 MD5 checksumming requires a mapping of
879  * IP address->MD5 Key.
880  * We need to maintain these in the sk structure.
881  */
882 
883 /* Find the Key structure for an address.  */
884 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
885                                          const union tcp_md5_addr *addr,
886                                          int family)
887 {
888         const struct tcp_sock *tp = tcp_sk(sk);
889         struct tcp_md5sig_key *key;
890         unsigned int size = sizeof(struct in_addr);
891         const struct tcp_md5sig_info *md5sig;
892 
893         /* caller either holds rcu_read_lock() or socket lock */
894         md5sig = rcu_dereference_check(tp->md5sig_info,
895                                        lockdep_sock_is_held(sk));
896         if (!md5sig)
897                 return NULL;
898 #if IS_ENABLED(CONFIG_IPV6)
899         if (family == AF_INET6)
900                 size = sizeof(struct in6_addr);
901 #endif
902         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
903                 if (key->family != family)
904                         continue;
905                 if (!memcmp(&key->addr, addr, size))
906                         return key;
907         }
908         return NULL;
909 }
910 EXPORT_SYMBOL(tcp_md5_do_lookup);
911 
912 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
913                                          const struct sock *addr_sk)
914 {
915         const union tcp_md5_addr *addr;
916 
917         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
918         return tcp_md5_do_lookup(sk, addr, AF_INET);
919 }
920 EXPORT_SYMBOL(tcp_v4_md5_lookup);
921 
922 /* This can be called on a newly created socket, from other files */
923 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
924                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
925 {
926         /* Add Key to the list */
927         struct tcp_md5sig_key *key;
928         struct tcp_sock *tp = tcp_sk(sk);
929         struct tcp_md5sig_info *md5sig;
930 
931         key = tcp_md5_do_lookup(sk, addr, family);
932         if (key) {
933                 /* Pre-existing entry - just update that one. */
934                 memcpy(key->key, newkey, newkeylen);
935                 key->keylen = newkeylen;
936                 return 0;
937         }
938 
939         md5sig = rcu_dereference_protected(tp->md5sig_info,
940                                            lockdep_sock_is_held(sk));
941         if (!md5sig) {
942                 md5sig = kmalloc(sizeof(*md5sig), gfp);
943                 if (!md5sig)
944                         return -ENOMEM;
945 
946                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
947                 INIT_HLIST_HEAD(&md5sig->head);
948                 rcu_assign_pointer(tp->md5sig_info, md5sig);
949         }
950 
951         key = sock_kmalloc(sk, sizeof(*key), gfp);
952         if (!key)
953                 return -ENOMEM;
954         if (!tcp_alloc_md5sig_pool()) {
955                 sock_kfree_s(sk, key, sizeof(*key));
956                 return -ENOMEM;
957         }
958 
959         memcpy(key->key, newkey, newkeylen);
960         key->keylen = newkeylen;
961         key->family = family;
962         memcpy(&key->addr, addr,
963                (family == AF_INET6) ? sizeof(struct in6_addr) :
964                                       sizeof(struct in_addr));
965         hlist_add_head_rcu(&key->node, &md5sig->head);
966         return 0;
967 }
968 EXPORT_SYMBOL(tcp_md5_do_add);
969 
970 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
971 {
972         struct tcp_md5sig_key *key;
973 
974         key = tcp_md5_do_lookup(sk, addr, family);
975         if (!key)
976                 return -ENOENT;
977         hlist_del_rcu(&key->node);
978         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
979         kfree_rcu(key, rcu);
980         return 0;
981 }
982 EXPORT_SYMBOL(tcp_md5_do_del);
983 
984 static void tcp_clear_md5_list(struct sock *sk)
985 {
986         struct tcp_sock *tp = tcp_sk(sk);
987         struct tcp_md5sig_key *key;
988         struct hlist_node *n;
989         struct tcp_md5sig_info *md5sig;
990 
991         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
992 
993         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
994                 hlist_del_rcu(&key->node);
995                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
996                 kfree_rcu(key, rcu);
997         }
998 }
999 
1000 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1001                                  int optlen)
1002 {
1003         struct tcp_md5sig cmd;
1004         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1005 
1006         if (optlen < sizeof(cmd))
1007                 return -EINVAL;
1008 
1009         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1010                 return -EFAULT;
1011 
1012         if (sin->sin_family != AF_INET)
1013                 return -EINVAL;
1014 
1015         if (!cmd.tcpm_keylen)
1016                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1017                                       AF_INET);
1018 
1019         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1020                 return -EINVAL;
1021 
1022         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1023                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1024                               GFP_KERNEL);
1025 }
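    /* tcp_v4_parse_md5_keys() implements the TCP_MD5SIG socket option
     * (RFC 2385) for IPv4 sockets. A minimal userspace sketch that installs
     * a key for peer 192.0.2.1, assuming struct tcp_md5sig from <linux/tcp.h>
     * and an already-created TCP socket fd:
     *
     *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
     *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
     *
     *	sin->sin_family = AF_INET;
     *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
     *	memcpy(md5.tcpm_key, "secret", 6);
     *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
     *
     * A tcpm_keylen of zero deletes the key for that address (the
     * tcp_md5_do_del() branch above).
     */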
1026 
1027 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1028                                         __be32 daddr, __be32 saddr, int nbytes)
1029 {
1030         struct tcp4_pseudohdr *bp;
1031         struct scatterlist sg;
1032 
1033         bp = &hp->md5_blk.ip4;
1034 
1035         /*
1036          * 1. the TCP pseudo-header (in the order: source IP address,
1037          * destination IP address, zero-padded protocol number, and
1038          * segment length)
1039          */
1040         bp->saddr = saddr;
1041         bp->daddr = daddr;
1042         bp->pad = 0;
1043         bp->protocol = IPPROTO_TCP;
1044         bp->len = cpu_to_be16(nbytes);
1045 
1046         sg_init_one(&sg, bp, sizeof(*bp));
1047         ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
1048         return crypto_ahash_update(hp->md5_req);
1049 }
1050 
1051 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1052                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1053 {
1054         struct tcp_md5sig_pool *hp;
1055         struct ahash_request *req;
1056 
1057         hp = tcp_get_md5sig_pool();
1058         if (!hp)
1059                 goto clear_hash_noput;
1060         req = hp->md5_req;
1061 
1062         if (crypto_ahash_init(req))
1063                 goto clear_hash;
1064         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1065                 goto clear_hash;
1066         if (tcp_md5_hash_header(hp, th))
1067                 goto clear_hash;
1068         if (tcp_md5_hash_key(hp, key))
1069                 goto clear_hash;
1070         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1071         if (crypto_ahash_final(req))
1072                 goto clear_hash;
1073 
1074         tcp_put_md5sig_pool();
1075         return 0;
1076 
1077 clear_hash:
1078         tcp_put_md5sig_pool();
1079 clear_hash_noput:
1080         memset(md5_hash, 0, 16);
1081         return 1;
1082 }
1083 
1084 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1085                         const struct sock *sk,
1086                         const struct sk_buff *skb)
1087 {
1088         struct tcp_md5sig_pool *hp;
1089         struct ahash_request *req;
1090         const struct tcphdr *th = tcp_hdr(skb);
1091         __be32 saddr, daddr;
1092 
1093         if (sk) { /* valid for establish/request sockets */
1094                 saddr = sk->sk_rcv_saddr;
1095                 daddr = sk->sk_daddr;
1096         } else {
1097                 const struct iphdr *iph = ip_hdr(skb);
1098                 saddr = iph->saddr;
1099                 daddr = iph->daddr;
1100         }
1101 
1102         hp = tcp_get_md5sig_pool();
1103         if (!hp)
1104                 goto clear_hash_noput;
1105         req = hp->md5_req;
1106 
1107         if (crypto_ahash_init(req))
1108                 goto clear_hash;
1109 
1110         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1111                 goto clear_hash;
1112         if (tcp_md5_hash_header(hp, th))
1113                 goto clear_hash;
1114         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1115                 goto clear_hash;
1116         if (tcp_md5_hash_key(hp, key))
1117                 goto clear_hash;
1118         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1119         if (crypto_ahash_final(req))
1120                 goto clear_hash;
1121 
1122         tcp_put_md5sig_pool();
1123         return 0;
1124 
1125 clear_hash:
1126         tcp_put_md5sig_pool();
1127 clear_hash_noput:
1128         memset(md5_hash, 0, 16);
1129         return 1;
1130 }
1131 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1132 
1133 #endif
1134 
1135 /* Called with rcu_read_lock() */
1136 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1137                                     const struct sk_buff *skb)
1138 {
1139 #ifdef CONFIG_TCP_MD5SIG
1140         /*
1141          * This gets called for each TCP segment that arrives
1142          * so we want to be efficient.
1143          * We have 3 drop cases:
1144          * o No MD5 hash and one expected.
1145          * o MD5 hash and we're not expecting one.
1146          * o MD5 hash and it's wrong.
1147          */
1148         const __u8 *hash_location = NULL;
1149         struct tcp_md5sig_key *hash_expected;
1150         const struct iphdr *iph = ip_hdr(skb);
1151         const struct tcphdr *th = tcp_hdr(skb);
1152         int genhash;
1153         unsigned char newhash[16];
1154 
1155         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1156                                           AF_INET);
1157         hash_location = tcp_parse_md5sig_option(th);
1158 
1159         /* We've parsed the options - do we have a hash? */
1160         if (!hash_expected && !hash_location)
1161                 return false;
1162 
1163         if (hash_expected && !hash_location) {
1164                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1165                 return true;
1166         }
1167 
1168         if (!hash_expected && hash_location) {
1169                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1170                 return true;
1171         }
1172 
1173         /* Okay, so we have both hash_expected and hash_location -
1174          * so we need to calculate the checksum.
1175          */
1176         genhash = tcp_v4_md5_hash_skb(newhash,
1177                                       hash_expected,
1178                                       NULL, skb);
1179 
1180         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1181                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1182                                      &iph->saddr, ntohs(th->source),
1183                                      &iph->daddr, ntohs(th->dest),
1184                                      genhash ? " tcp_v4_calc_md5_hash failed"
1185                                      : "");
1186                 return true;
1187         }
1188         return false;
1189 #endif
1190         return false;
1191 }
1192 
1193 static void tcp_v4_init_req(struct request_sock *req,
1194                             const struct sock *sk_listener,
1195                             struct sk_buff *skb)
1196 {
1197         struct inet_request_sock *ireq = inet_rsk(req);
1198 
1199         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1200         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1201         ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1202         ireq->opt = tcp_v4_save_options(skb);
1203 }
1204 
1205 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1206                                           struct flowi *fl,
1207                                           const struct request_sock *req,
1208                                           bool *strict)
1209 {
1210         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1211 
1212         if (strict) {
1213                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1214                         *strict = true;
1215                 else
1216                         *strict = false;
1217         }
1218 
1219         return dst;
1220 }
1221 
1222 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1223         .family         =       PF_INET,
1224         .obj_size       =       sizeof(struct tcp_request_sock),
1225         .rtx_syn_ack    =       tcp_rtx_synack,
1226         .send_ack       =       tcp_v4_reqsk_send_ack,
1227         .destructor     =       tcp_v4_reqsk_destructor,
1228         .send_reset     =       tcp_v4_send_reset,
1229         .syn_ack_timeout =      tcp_syn_ack_timeout,
1230 };
1231 
1232 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1233         .mss_clamp      =       TCP_MSS_DEFAULT,
1234 #ifdef CONFIG_TCP_MD5SIG
1235         .req_md5_lookup =       tcp_v4_md5_lookup,
1236         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1237 #endif
1238         .init_req       =       tcp_v4_init_req,
1239 #ifdef CONFIG_SYN_COOKIES
1240         .cookie_init_seq =      cookie_v4_init_sequence,
1241 #endif
1242         .route_req      =       tcp_v4_route_req,
1243         .init_seq       =       tcp_v4_init_sequence,
1244         .send_synack    =       tcp_v4_send_synack,
1245 };
1246 
1247 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1248 {
1249         /* Never answer SYNs sent to broadcast or multicast */
1250         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1251                 goto drop;
1252 
1253         return tcp_conn_request(&tcp_request_sock_ops,
1254                                 &tcp_request_sock_ipv4_ops, sk, skb);
1255 
1256 drop:
1257         tcp_listendrop(sk);
1258         return 0;
1259 }
1260 EXPORT_SYMBOL(tcp_v4_conn_request);
1261 
1262 
1263 /*
1264  * The three way handshake has completed - we got a valid synack -
1265  * now create the new socket.
1266  */
1267 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1268                                   struct request_sock *req,
1269                                   struct dst_entry *dst,
1270                                   struct request_sock *req_unhash,
1271                                   bool *own_req)
1272 {
1273         struct inet_request_sock *ireq;
1274         struct inet_sock *newinet;
1275         struct tcp_sock *newtp;
1276         struct sock *newsk;
1277 #ifdef CONFIG_TCP_MD5SIG
1278         struct tcp_md5sig_key *key;
1279 #endif
1280         struct ip_options_rcu *inet_opt;
1281 
1282         if (sk_acceptq_is_full(sk))
1283                 goto exit_overflow;
1284 
1285         newsk = tcp_create_openreq_child(sk, req, skb);
1286         if (!newsk)
1287                 goto exit_nonewsk;
1288 
1289         newsk->sk_gso_type = SKB_GSO_TCPV4;
1290         inet_sk_rx_dst_set(newsk, skb);
1291 
1292         newtp                 = tcp_sk(newsk);
1293         newinet               = inet_sk(newsk);
1294         ireq                  = inet_rsk(req);
1295         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1296         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1297         newsk->sk_bound_dev_if = ireq->ir_iif;
1298         newinet->inet_saddr           = ireq->ir_loc_addr;
1299         inet_opt              = ireq->opt;
1300         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1301         ireq->opt             = NULL;
1302         newinet->mc_index     = inet_iif(skb);
1303         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1304         newinet->rcv_tos      = ip_hdr(skb)->tos;
1305         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1306         if (inet_opt)
1307                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1308         newinet->inet_id = newtp->write_seq ^ jiffies;
1309 
1310         if (!dst) {
1311                 dst = inet_csk_route_child_sock(sk, newsk, req);
1312                 if (!dst)
1313                         goto put_and_exit;
1314         } else {
1315                 /* syncookie case : see end of cookie_v4_check() */
1316         }
1317         sk_setup_caps(newsk, dst);
1318 
1319         tcp_ca_openreq_child(newsk, dst);
1320 
1321         tcp_sync_mss(newsk, dst_mtu(dst));
1322         newtp->advmss = dst_metric_advmss(dst);
1323         if (tcp_sk(sk)->rx_opt.user_mss &&
1324             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1325                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1326 
1327         tcp_initialize_rcv_mss(newsk);
1328 
1329 #ifdef CONFIG_TCP_MD5SIG
1330         /* Copy over the MD5 key from the original socket */
1331         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1332                                 AF_INET);
1333         if (key) {
1334                 /*
1335                  * We're using one, so create a matching key
1336                  * on the newsk structure. If we fail to get
1337                  * memory, then we end up not copying the key
1338                  * across. Shucks.
1339                  */
1340                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1341                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1342                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1343         }
1344 #endif
1345 
1346         if (__inet_inherit_port(sk, newsk) < 0)
1347                 goto put_and_exit;
1348         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1349         if (*own_req)
1350                 tcp_move_syn(newtp, req);
1351 
1352         return newsk;
1353 
1354 exit_overflow:
1355         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1356 exit_nonewsk:
1357         dst_release(dst);
1358 exit:
1359         tcp_listendrop(sk);
1360         return NULL;
1361 put_and_exit:
1362         inet_csk_prepare_forced_close(newsk);
1363         tcp_done(newsk);
1364         goto exit;
1365 }
1366 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1367 
1368 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1369 {
1370 #ifdef CONFIG_SYN_COOKIES
1371         const struct tcphdr *th = tcp_hdr(skb);
1372 
1373         if (!th->syn)
1374                 sk = cookie_v4_check(sk, skb);
1375 #endif
1376         return sk;
1377 }
1378 
1379 /* The socket must have its spinlock held when we get
1380  * here, unless it is a TCP_LISTEN socket.
1381  *
1382  * We have a potential double-lock case here, so even when
1383  * doing backlog processing we use the BH locking scheme.
1384  * This is because we cannot sleep with the original spinlock
1385  * held.
1386  */
1387 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1388 {
1389         struct sock *rsk;
1390 
1391         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1392                 struct dst_entry *dst = sk->sk_rx_dst;
1393 
1394                 sock_rps_save_rxhash(sk, skb);
1395                 sk_mark_napi_id(sk, skb);
1396                 if (dst) {
1397                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1398                             !dst->ops->check(dst, 0)) {
1399                                 dst_release(dst);
1400                                 sk->sk_rx_dst = NULL;
1401                         }
1402                 }
1403                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1404                 return 0;
1405         }
1406 
1407         if (tcp_checksum_complete(skb))
1408                 goto csum_err;
1409 
1410         if (sk->sk_state == TCP_LISTEN) {
1411                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1412 
1413                 if (!nsk)
1414                         goto discard;
1415                 if (nsk != sk) {
1416                         sock_rps_save_rxhash(nsk, skb);
1417                         sk_mark_napi_id(nsk, skb);
1418                         if (tcp_child_process(sk, nsk, skb)) {
1419                                 rsk = nsk;
1420                                 goto reset;
1421                         }
1422                         return 0;
1423                 }
1424         } else
1425                 sock_rps_save_rxhash(sk, skb);
1426 
1427         if (tcp_rcv_state_process(sk, skb)) {
1428                 rsk = sk;
1429                 goto reset;
1430         }
1431         return 0;
1432 
1433 reset:
1434         tcp_v4_send_reset(rsk, skb);
1435 discard:
1436         kfree_skb(skb);
1437         /* Be careful here. If this function gets more complicated and
1438          * gcc suffers from register pressure on the x86, sk (in %ebx)
1439          * might be destroyed here. This current version compiles correctly,
1440          * but you have been warned.
1441          */
1442         return 0;
1443 
1444 csum_err:
1445         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1446         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1447         goto discard;
1448 }
1449 EXPORT_SYMBOL(tcp_v4_do_rcv);
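/*
 * Flow of tcp_v4_do_rcv() above: for ESTABLISHED sockets the cached input
 * route (sk->sk_rx_dst, set by early demux or inet_sk_rx_dst_set()) is
 * revalidated against the incoming interface before tcp_rcv_established()
 * runs the header-prediction fast path.  On a listener, a child returned by
 * the cookie check is handed to tcp_child_process(); everything else goes
 * through the generic tcp_rcv_state_process(), and a non-zero return from
 * either triggers a RST towards the peer.
 */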
1450 
1451 void tcp_v4_early_demux(struct sk_buff *skb)
1452 {
1453         const struct iphdr *iph;
1454         const struct tcphdr *th;
1455         struct sock *sk;
1456 
1457         if (skb->pkt_type != PACKET_HOST)
1458                 return;
1459 
1460         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1461                 return;
1462 
1463         iph = ip_hdr(skb);
1464         th = tcp_hdr(skb);
1465 
1466         if (th->doff < sizeof(struct tcphdr) / 4)
1467                 return;
1468 
1469         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1470                                        iph->saddr, th->source,
1471                                        iph->daddr, ntohs(th->dest),
1472                                        skb->skb_iif);
1473         if (sk) {
1474                 skb->sk = sk;
1475                 skb->destructor = sock_edemux;
1476                 if (sk_fullsock(sk)) {
1477                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1478 
1479                         if (dst)
1480                                 dst = dst_check(dst, 0);
1481                         if (dst &&
1482                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1483                                 skb_dst_set_noref(skb, dst);
1484                 }
1485         }
1486 }
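/*
 * tcp_v4_early_demux() above is wired up as TCP's ->early_demux handler (see
 * the net_protocol definition in net/ipv4/af_inet.c) and runs from the IP
 * receive path before the routing decision.  A hit in the established hash
 * attaches the socket to the skb via sock_edemux, and if the socket's cached
 * rx dst is still valid for the incoming interface it is placed on the skb so
 * the per-packet route lookup can be skipped.
 */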
1487 
1488 /* The packet is added to the VJ-style prequeue for processing in process
1489  * context, if a reader task is waiting. Apparently, this exciting
1490  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1491  * failed somewhere. Latency? Burstiness? Well, at least now we will
1492  * see why it failed. 8)8)                                --ANK
1493  *
1494  */
1495 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1496 {
1497         struct tcp_sock *tp = tcp_sk(sk);
1498 
1499         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1500                 return false;
1501 
1502         if (skb->len <= tcp_hdrlen(skb) &&
1503             skb_queue_len(&tp->ucopy.prequeue) == 0)
1504                 return false;
1505 
1506         /* Before escaping the RCU-protected region, we need to take care of the
1507          * skb dst. The prequeue is only enabled for established sockets.
1508          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1509          * Instead of doing a full sk_rx_dst validity check here, let's perform
1510          * an optimistic one.
1511          */
1512         if (likely(sk->sk_rx_dst))
1513                 skb_dst_drop(skb);
1514         else
1515                 skb_dst_force_safe(skb);
1516 
1517         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1518         tp->ucopy.memory += skb->truesize;
1519         if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1520             tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1521                 struct sk_buff *skb1;
1522 
1523                 BUG_ON(sock_owned_by_user(sk));
1524                 __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1525                                 skb_queue_len(&tp->ucopy.prequeue));
1526 
1527                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1528                         sk_backlog_rcv(sk, skb1);
1529 
1530                 tp->ucopy.memory = 0;
1531         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1532                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1533                                            POLLIN | POLLRDNORM | POLLRDBAND);
1534                 if (!inet_csk_ack_scheduled(sk))
1535                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1536                                                   (3 * tcp_rto_min(sk)) / 4,
1537                                                   TCP_RTO_MAX);
1538         }
1539         return true;
1540 }
1541 EXPORT_SYMBOL(tcp_prequeue);
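/*
 * tcp_prequeue() returns false (the caller must then process the segment
 * itself) when the tcp_low_latency sysctl is set, when no reader is blocked
 * in tcp_recvmsg() (tp->ucopy.task is NULL), or for a data-less segment while
 * the prequeue is empty.  Otherwise the skb is queued for the reader; if the
 * prequeue grows too large it is flushed through sk_backlog_rcv() right here
 * in softirq context, and the first queued skb wakes the reader and arms a
 * shortened delayed-ACK timer.
 */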
1542 
1543 /*
1544  *      From tcp_input.c
1545  */
1546 
1547 int tcp_v4_rcv(struct sk_buff *skb)
1548 {
1549         struct net *net = dev_net(skb->dev);
1550         const struct iphdr *iph;
1551         const struct tcphdr *th;
1552         bool refcounted;
1553         struct sock *sk;
1554         int ret;
1555 
1556         if (skb->pkt_type != PACKET_HOST)
1557                 goto discard_it;
1558 
1559         /* Count it even if it's bad */
1560         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1561 
1562         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1563                 goto discard_it;
1564 
1565         th = (const struct tcphdr *)skb->data;
1566 
1567         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1568                 goto bad_packet;
1569         if (!pskb_may_pull(skb, th->doff * 4))
1570                 goto discard_it;
1571 
1572         /* An explanation is required here, I think.
1573          * Packet length and doff are validated by header prediction,
1574          * provided the case of th->doff==0 is eliminated.
1575          * So, we defer the checks. */
1576 
1577         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1578                 goto csum_error;
1579 
1580         th = (const struct tcphdr *)skb->data;
1581         iph = ip_hdr(skb);
1582         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1583          * barrier() makes sure the compiler won't play fool^Waliasing games.
1584          */
1585         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1586                 sizeof(struct inet_skb_parm));
1587         barrier();
1588 
1589         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1590         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1591                                     skb->len - th->doff * 4);
1592         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1593         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1594         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1595         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1596         TCP_SKB_CB(skb)->sacked  = 0;
1597 
1598 lookup:
1599         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1600                                th->dest, &refcounted);
1601         if (!sk)
1602                 goto no_tcp_socket;
1603 
1604 process:
1605         if (sk->sk_state == TCP_TIME_WAIT)
1606                 goto do_time_wait;
1607 
1608         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1609                 struct request_sock *req = inet_reqsk(sk);
1610                 struct sock *nsk;
1611 
1612                 sk = req->rsk_listener;
1613                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1614                         reqsk_put(req);
1615                         goto discard_it;
1616                 }
1617                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1618                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1619                         goto lookup;
1620                 }
1621                 /* We own a reference on the listener; increase it again
1622                  * as we might lose it too soon.
1623                  */
1624                 sock_hold(sk);
1625                 refcounted = true;
1626                 nsk = tcp_check_req(sk, skb, req, false);
1627                 if (!nsk) {
1628                         reqsk_put(req);
1629                         goto discard_and_relse;
1630                 }
1631                 if (nsk == sk) {
1632                         reqsk_put(req);
1633                 } else if (tcp_child_process(sk, nsk, skb)) {
1634                         tcp_v4_send_reset(nsk, skb);
1635                         goto discard_and_relse;
1636                 } else {
1637                         sock_put(sk);
1638                         return 0;
1639                 }
1640         }
1641         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1642                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1643                 goto discard_and_relse;
1644         }
1645 
1646         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1647                 goto discard_and_relse;
1648 
1649         if (tcp_v4_inbound_md5_hash(sk, skb))
1650                 goto discard_and_relse;
1651 
1652         nf_reset(skb);
1653 
1654         if (sk_filter(sk, skb))
1655                 goto discard_and_relse;
1656 
1657         skb->dev = NULL;
1658 
1659         if (sk->sk_state == TCP_LISTEN) {
1660                 ret = tcp_v4_do_rcv(sk, skb);
1661                 goto put_and_return;
1662         }
1663 
1664         sk_incoming_cpu_update(sk);
1665 
1666         bh_lock_sock_nested(sk);
1667         tcp_segs_in(tcp_sk(sk), skb);
1668         ret = 0;
1669         if (!sock_owned_by_user(sk)) {
1670                 if (!tcp_prequeue(sk, skb))
1671                         ret = tcp_v4_do_rcv(sk, skb);
1672         } else if (unlikely(sk_add_backlog(sk, skb,
1673                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1674                 bh_unlock_sock(sk);
1675                 __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP);
1676                 goto discard_and_relse;
1677         }
1678         bh_unlock_sock(sk);
1679 
1680 put_and_return:
1681         if (refcounted)
1682                 sock_put(sk);
1683 
1684         return ret;
1685 
1686 no_tcp_socket:
1687         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1688                 goto discard_it;
1689 
1690         if (tcp_checksum_complete(skb)) {
1691 csum_error:
1692                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1693 bad_packet:
1694                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1695         } else {
1696                 tcp_v4_send_reset(NULL, skb);
1697         }
1698 
1699 discard_it:
1700         /* Discard frame. */
1701         kfree_skb(skb);
1702         return 0;
1703 
1704 discard_and_relse:
1705         sk_drops_add(sk, skb);
1706         if (refcounted)
1707                 sock_put(sk);
1708         goto discard_it;
1709 
1710 do_time_wait:
1711         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1712                 inet_twsk_put(inet_twsk(sk));
1713                 goto discard_it;
1714         }
1715 
1716         if (tcp_checksum_complete(skb)) {
1717                 inet_twsk_put(inet_twsk(sk));
1718                 goto csum_error;
1719         }
1720         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1721         case TCP_TW_SYN: {
1722                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1723                                                         &tcp_hashinfo, skb,
1724                                                         __tcp_hdrlen(th),
1725                                                         iph->saddr, th->source,
1726                                                         iph->daddr, th->dest,
1727                                                         inet_iif(skb));
1728                 if (sk2) {
1729                         inet_twsk_deschedule_put(inet_twsk(sk));
1730                         sk = sk2;
1731                         refcounted = false;
1732                         goto process;
1733                 }
1734                 /* Fall through to ACK */
1735         }
1736         case TCP_TW_ACK:
1737                 tcp_v4_timewait_ack(sk, skb);
1738                 break;
1739         case TCP_TW_RST:
1740                 tcp_v4_send_reset(sk, skb);
1741                 inet_twsk_deschedule_put(inet_twsk(sk));
1742                 goto discard_it;
1743         case TCP_TW_SUCCESS:;
1744         }
1745         goto discard_it;
1746 }
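/*
 * tcp_v4_rcv() above is the protocol handler registered for IPPROTO_TCP (see
 * net/ipv4/af_inet.c).  It validates the header length and checksum, rewrites
 * the skb control block from IPCB to TCP_SKB_CB, looks the socket up in the
 * established and listening hashes and then dispatches: NEW_SYN_RECV request
 * socks go through tcp_check_req(), TIME_WAIT minisocks through
 * tcp_timewait_state_process(), and ordinary sockets are processed directly,
 * via the prequeue, or via the socket backlog when a user context currently
 * owns the socket lock.
 */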
1747 
1748 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1749         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1750         .twsk_unique    = tcp_twsk_unique,
1751         .twsk_destructor= tcp_twsk_destructor,
1752 };
1753 
1754 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1755 {
1756         struct dst_entry *dst = skb_dst(skb);
1757 
1758         if (dst && dst_hold_safe(dst)) {
1759                 sk->sk_rx_dst = dst;
1760                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1761         }
1762 }
1763 EXPORT_SYMBOL(inet_sk_rx_dst_set);
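/*
 * inet_sk_rx_dst_set() caches the input route of a received skb on the
 * socket, together with the interface it arrived on.  It is installed as the
 * ->sk_rx_dst_set hook in ipv4_specific below; tcp_v4_do_rcv() and
 * tcp_v4_early_demux() above revalidate the cached pair before reusing it.
 */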
1764 
1765 const struct inet_connection_sock_af_ops ipv4_specific = {
1766         .queue_xmit        = ip_queue_xmit,
1767         .send_check        = tcp_v4_send_check,
1768         .rebuild_header    = inet_sk_rebuild_header,
1769         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1770         .conn_request      = tcp_v4_conn_request,
1771         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1772         .net_header_len    = sizeof(struct iphdr),
1773         .setsockopt        = ip_setsockopt,
1774         .getsockopt        = ip_getsockopt,
1775         .addr2sockaddr     = inet_csk_addr2sockaddr,
1776         .sockaddr_len      = sizeof(struct sockaddr_in),
1777         .bind_conflict     = inet_csk_bind_conflict,
1778 #ifdef CONFIG_COMPAT
1779         .compat_setsockopt = compat_ip_setsockopt,
1780         .compat_getsockopt = compat_ip_getsockopt,
1781 #endif
1782         .mtu_reduced       = tcp_v4_mtu_reduced,
1783 };
1784 EXPORT_SYMBOL(ipv4_specific);
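/*
 * ipv4_specific is the address-family dependent operations table used by the
 * inet_connection_sock layer for IPv4 TCP sockets; it is installed by
 * tcp_v4_init_sock() below.  The IPv6 code keeps parallel tables
 * (ipv6_specific and ipv6_mapped) in net/ipv6/tcp_ipv6.c.
 */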
1785 
1786 #ifdef CONFIG_TCP_MD5SIG
1787 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1788         .md5_lookup             = tcp_v4_md5_lookup,
1789         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1790         .md5_parse              = tcp_v4_parse_md5_keys,
1791 };
1792 #endif
1793 
1794 /* NOTE: A lot of things are set to zero explicitly by the call to
1795  *       sk_alloc(), so they need not be done here.
1796  */
1797 static int tcp_v4_init_sock(struct sock *sk)
1798 {
1799         struct inet_connection_sock *icsk = inet_csk(sk);
1800 
1801         tcp_init_sock(sk);
1802 
1803         icsk->icsk_af_ops = &ipv4_specific;
1804 
1805 #ifdef CONFIG_TCP_MD5SIG
1806         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1807 #endif
1808 
1809         return 0;
1810 }
1811 
1812 void tcp_v4_destroy_sock(struct sock *sk)
1813 {
1814         struct tcp_sock *tp = tcp_sk(sk);
1815 
1816         tcp_clear_xmit_timers(sk);
1817 
1818         tcp_cleanup_congestion_control(sk);
1819 
1820         /* Clean up the write buffer. */
1821         tcp_write_queue_purge(sk);
1822 
1823         /* Cleans up our, hopefully empty, out_of_order_queue. */
1824         __skb_queue_purge(&tp->out_of_order_queue);
1825 
1826 #ifdef CONFIG_TCP_MD5SIG
1827         /* Clean up the MD5 key list, if any */
1828         if (tp->md5sig_info) {
1829                 tcp_clear_md5_list(sk);
1830                 kfree_rcu(tp->md5sig_info, rcu);
1831                 tp->md5sig_info = NULL;
1832         }
1833 #endif
1834 
1835         /* Clean up the prequeue; it really should be empty. */
1836         __skb_queue_purge(&tp->ucopy.prequeue);
1837 
1838         /* Clean up a referenced TCP bind bucket. */
1839         if (inet_csk(sk)->icsk_bind_hash)
1840                 inet_put_port(sk);
1841 
1842         BUG_ON(tp->fastopen_rsk);
1843 
1844         /* If socket is aborted during connect operation */
1845         tcp_free_fastopen_req(tp);
1846         tcp_saved_syn_free(tp);
1847 
1848         local_bh_disable();
1849         sk_sockets_allocated_dec(sk);
1850         local_bh_enable();
1851 
1852         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1853                 sock_release_memcg(sk);
1854 }
1855 EXPORT_SYMBOL(tcp_v4_destroy_sock);
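/*
 * tcp_v4_destroy_sock() is the ->destroy handler of tcp_prot below: it stops
 * the retransmission timers, releases congestion-control state, purges the
 * write, out-of-order and prequeue queues, drops MD5 keys and the bind
 * bucket, and frees any Fast Open and saved-SYN state before the socket
 * accounting is updated.
 */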
1856 
1857 #ifdef CONFIG_PROC_FS
1858 /* Proc filesystem TCP sock list dumping. */
1859 
1860 /*
1861  * Get the next listener socket following cur.  If cur is NULL, get the first
1862  * socket starting from the bucket given in st->bucket; when st->bucket is zero,
1863  * the very first socket in the hash table is returned.
1864  */
1865 static void *listening_get_next(struct seq_file *seq, void *cur)
1866 {
1867         struct tcp_iter_state *st = seq->private;
1868         struct net *net = seq_file_net(seq);
1869         struct inet_listen_hashbucket *ilb;
1870         struct inet_connection_sock *icsk;
1871         struct sock *sk = cur;
1872 
1873         if (!sk) {
1874 get_head:
1875                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1876                 spin_lock_bh(&ilb->lock);
1877                 sk = sk_head(&ilb->head);
1878                 st->offset = 0;
1879                 goto get_sk;
1880         }
1881         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1882         ++st->num;
1883         ++st->offset;
1884 
1885         sk = sk_next(sk);
1886 get_sk:
1887         sk_for_each_from(sk) {
1888                 if (!net_eq(sock_net(sk), net))
1889                         continue;
1890                 if (sk->sk_family == st->family)
1891                         return sk;
1892                 icsk = inet_csk(sk);
1893         }
1894         spin_unlock_bh(&ilb->lock);
1895         st->offset = 0;
1896         if (++st->bucket < INET_LHTABLE_SIZE)
1897                 goto get_head;
1898         return NULL;
1899 }
1900 
1901 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1902 {
1903         struct tcp_iter_state *st = seq->private;
1904         void *rc;
1905 
1906         st->bucket = 0;
1907         st->offset = 0;
1908         rc = listening_get_next(seq, NULL);
1909 
1910         while (rc && *pos) {
1911                 rc = listening_get_next(seq, rc);
1912                 --*pos;
1913         }
1914         return rc;
1915 }
1916 
1917 static inline bool empty_bucket(const struct tcp_iter_state *st)
1918 {
1919         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1920 }
1921 
1922 /*
1923  * Get first established socket starting from bucket given in st->bucket.
1924  * If st->bucket is zero, the very first socket in the hash is returned.
1925  */
1926 static void *established_get_first(struct seq_file *seq)
1927 {
1928         struct tcp_iter_state *st = seq->private;
1929         struct net *net = seq_file_net(seq);
1930         void *rc = NULL;
1931 
1932         st->offset = 0;
1933         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1934                 struct sock *sk;
1935                 struct hlist_nulls_node *node;
1936                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1937 
1938                 /* Lockless fast path for the common case of empty buckets */
1939                 if (empty_bucket(st))
1940                         continue;
1941 
1942                 spin_lock_bh(lock);
1943                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1944                         if (sk->sk_family != st->family ||
1945                             !net_eq(sock_net(sk), net)) {
1946                                 continue;
1947                         }
1948                         rc = sk;
1949                         goto out;
1950                 }
1951                 spin_unlock_bh(lock);
1952         }
1953 out:
1954         return rc;
1955 }
1956 
1957 static void *established_get_next(struct seq_file *seq, void *cur)
1958 {
1959         struct sock *sk = cur;
1960         struct hlist_nulls_node *node;
1961         struct tcp_iter_state *st = seq->private;
1962         struct net *net = seq_file_net(seq);
1963 
1964         ++st->num;
1965         ++st->offset;
1966 
1967         sk = sk_nulls_next(sk);
1968 
1969         sk_nulls_for_each_from(sk, node) {
1970                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1971                         return sk;
1972         }
1973 
1974         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1975         ++st->bucket;
1976         return established_get_first(seq);
1977 }
1978 
1979 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1980 {
1981         struct tcp_iter_state *st = seq->private;
1982         void *rc;
1983 
1984         st->bucket = 0;
1985         rc = established_get_first(seq);
1986 
1987         while (rc && pos) {
1988                 rc = established_get_next(seq, rc);
1989                 --pos;
1990         }
1991         return rc;
1992 }
1993 
1994 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1995 {
1996         void *rc;
1997         struct tcp_iter_state *st = seq->private;
1998 
1999         st->state = TCP_SEQ_STATE_LISTENING;
2000         rc        = listening_get_idx(seq, &pos);
2001 
2002         if (!rc) {
2003                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2004                 rc        = established_get_idx(seq, pos);
2005         }
2006 
2007         return rc;
2008 }
2009 
2010 static void *tcp_seek_last_pos(struct seq_file *seq)
2011 {
2012         struct tcp_iter_state *st = seq->private;
2013         int offset = st->offset;
2014         int orig_num = st->num;
2015         void *rc = NULL;
2016 
2017         switch (st->state) {
2018         case TCP_SEQ_STATE_LISTENING:
2019                 if (st->bucket >= INET_LHTABLE_SIZE)
2020                         break;
2021                 st->state = TCP_SEQ_STATE_LISTENING;
2022                 rc = listening_get_next(seq, NULL);
2023                 while (offset-- && rc)
2024                         rc = listening_get_next(seq, rc);
2025                 if (rc)
2026                         break;
2027                 st->bucket = 0;
2028                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2029                 /* Fallthrough */
2030         case TCP_SEQ_STATE_ESTABLISHED:
2031                 if (st->bucket > tcp_hashinfo.ehash_mask)
2032                         break;
2033                 rc = established_get_first(seq);
2034                 while (offset-- && rc)
2035                         rc = established_get_next(seq, rc);
2036         }
2037 
2038         st->num = orig_num;
2039 
2040         return rc;
2041 }
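/*
 * tcp_seek_last_pos() is an optimisation for sequential reads of the proc
 * file: when tcp_seq_start() is entered at the same position it stopped at
 * last time, the walk resumes from the saved state/bucket/offset instead of
 * rescanning both hash tables from the beginning.  st->num is restored so the
 * "sl" column printed by the ->show callbacks keeps counting monotonically.
 */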
2042 
2043 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2044 {
2045         struct tcp_iter_state *st = seq->private;
2046         void *rc;
2047 
2048         if (*pos && *pos == st->last_pos) {
2049                 rc = tcp_seek_last_pos(seq);
2050                 if (rc)
2051                         goto out;
2052         }
2053 
2054         st->state = TCP_SEQ_STATE_LISTENING;
2055         st->num = 0;
2056         st->bucket = 0;
2057         st->offset = 0;
2058         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2059 
2060 out:
2061         st->last_pos = *pos;
2062         return rc;
2063 }
2064 
2065 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2066 {
2067         struct tcp_iter_state *st = seq->private;
2068         void *rc = NULL;
2069 
2070         if (v == SEQ_START_TOKEN) {
2071                 rc = tcp_get_idx(seq, 0);
2072                 goto out;
2073         }
2074 
2075         switch (st->state) {
2076         case TCP_SEQ_STATE_LISTENING:
2077                 rc = listening_get_next(seq, v);
2078                 if (!rc) {
2079                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2080                         st->bucket = 0;
2081                         st->offset = 0;
2082                         rc        = established_get_first(seq);
2083                 }
2084                 break;
2085         case TCP_SEQ_STATE_ESTABLISHED:
2086                 rc = established_get_next(seq, v);
2087                 break;
2088         }
2089 out:
2090         ++*pos;
2091         st->last_pos = *pos;
2092         return rc;
2093 }
2094 
2095 static void tcp_seq_stop(struct seq_file *seq, void *v)
2096 {
2097         struct tcp_iter_state *st = seq->private;
2098 
2099         switch (st->state) {
2100         case TCP_SEQ_STATE_LISTENING:
2101                 if (v != SEQ_START_TOKEN)
2102                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2103                 break;
2104         case TCP_SEQ_STATE_ESTABLISHED:
2105                 if (v)
2106                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2107                 break;
2108         }
2109 }
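/*
 * Locking protocol of the iterator above: listening_get_next() and
 * established_get_first() take the per-bucket lock with spin_lock_bh() and
 * keep it held while sockets from that bucket are being returned.  The lock
 * is released either when the walk moves on to the next bucket or, when the
 * read stops in the middle of a bucket, here in tcp_seq_stop().
 */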
2110 
2111 int tcp_seq_open(struct inode *inode, struct file *file)
2112 {
2113         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2114         struct tcp_iter_state *s;
2115         int err;
2116 
2117         err = seq_open_net(inode, file, &afinfo->seq_ops,
2118                           sizeof(struct tcp_iter_state));
2119         if (err < 0)
2120                 return err;
2121 
2122         s = ((struct seq_file *)file->private_data)->private;
2123         s->family               = afinfo->family;
2124         s->last_pos             = 0;
2125         return 0;
2126 }
2127 EXPORT_SYMBOL(tcp_seq_open);
2128 
2129 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2130 {
2131         int rc = 0;
2132         struct proc_dir_entry *p;
2133 
2134         afinfo->seq_ops.start           = tcp_seq_start;
2135         afinfo->seq_ops.next            = tcp_seq_next;
2136         afinfo->seq_ops.stop            = tcp_seq_stop;
2137 
2138         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2139                              afinfo->seq_fops, afinfo);
2140         if (!p)
2141                 rc = -ENOMEM;
2142         return rc;
2143 }
2144 EXPORT_SYMBOL(tcp_proc_register);
2145 
2146 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2147 {
2148         remove_proc_entry(afinfo->name, net->proc_net);
2149 }
2150 EXPORT_SYMBOL(tcp_proc_unregister);
2151 
2152 static void get_openreq4(const struct request_sock *req,
2153                          struct seq_file *f, int i)
2154 {
2155         const struct inet_request_sock *ireq = inet_rsk(req);
2156         long delta = req->rsk_timer.expires - jiffies;
2157 
2158         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2159                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2160                 i,
2161                 ireq->ir_loc_addr,
2162                 ireq->ir_num,
2163                 ireq->ir_rmt_addr,
2164                 ntohs(ireq->ir_rmt_port),
2165                 TCP_SYN_RECV,
2166                 0, 0, /* could print option size, but that is af dependent. */
2167                 1,    /* timers active (only the expire timer) */
2168                 jiffies_delta_to_clock_t(delta),
2169                 req->num_timeout,
2170                 from_kuid_munged(seq_user_ns(f),
2171                                  sock_i_uid(req->rsk_listener)),
2172                 0,  /* non standard timer */
2173                 0, /* open_requests have no inode */
2174                 0,
2175                 req);
2176 }
2177 
2178 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2179 {
2180         int timer_active;
2181         unsigned long timer_expires;
2182         const struct tcp_sock *tp = tcp_sk(sk);
2183         const struct inet_connection_sock *icsk = inet_csk(sk);
2184         const struct inet_sock *inet = inet_sk(sk);
2185         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2186         __be32 dest = inet->inet_daddr;
2187         __be32 src = inet->inet_rcv_saddr;
2188         __u16 destp = ntohs(inet->inet_dport);
2189         __u16 srcp = ntohs(inet->inet_sport);
2190         int rx_queue;
2191         int state;
2192 
2193         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2194             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2195             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2196                 timer_active    = 1;
2197                 timer_expires   = icsk->icsk_timeout;
2198         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2199                 timer_active    = 4;
2200                 timer_expires   = icsk->icsk_timeout;
2201         } else if (timer_pending(&sk->sk_timer)) {
2202                 timer_active    = 2;
2203                 timer_expires   = sk->sk_timer.expires;
2204         } else {
2205                 timer_active    = 0;
2206                 timer_expires = jiffies;
2207         }
2208 
2209         state = sk_state_load(sk);
2210         if (state == TCP_LISTEN)
2211                 rx_queue = sk->sk_ack_backlog;
2212         else
2213                 /* Because we don't lock the socket,
2214                  * we might find a transient negative value.
2215                  */
2216                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2217 
2218         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2219                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2220                 i, src, srcp, dest, destp, state,
2221                 tp->write_seq - tp->snd_una,
2222                 rx_queue,
2223                 timer_active,
2224                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2225                 icsk->icsk_retransmits,
2226                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2227                 icsk->icsk_probes_out,
2228                 sock_i_ino(sk),
2229                 atomic_read(&sk->sk_refcnt), sk,
2230                 jiffies_to_clock_t(icsk->icsk_rto),
2231                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2232                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2233                 tp->snd_cwnd,
2234                 state == TCP_LISTEN ?
2235                     fastopenq->max_qlen :
2236                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2237 }
2238 
2239 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2240                                struct seq_file *f, int i)
2241 {
2242         long delta = tw->tw_timer.expires - jiffies;
2243         __be32 dest, src;
2244         __u16 destp, srcp;
2245 
2246         dest  = tw->tw_daddr;
2247         src   = tw->tw_rcv_saddr;
2248         destp = ntohs(tw->tw_dport);
2249         srcp  = ntohs(tw->tw_sport);
2250 
2251         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2252                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2253                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2254                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2255                 atomic_read(&tw->tw_refcnt), tw);
2256 }
2257 
2258 #define TMPSZ 150
2259 
2260 static int tcp4_seq_show(struct seq_file *seq, void *v)
2261 {
2262         struct tcp_iter_state *st;
2263         struct sock *sk = v;
2264 
2265         seq_setwidth(seq, TMPSZ - 1);
2266         if (v == SEQ_START_TOKEN) {
2267                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2268                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2269                            "inode");
2270                 goto out;
2271         }
2272         st = seq->private;
2273 
2274         if (sk->sk_state == TCP_TIME_WAIT)
2275                 get_timewait4_sock(v, seq, st->num);
2276         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2277                 get_openreq4(v, seq, st->num);
2278         else
2279                 get_tcp4_sock(v, seq, st->num);
2280 out:
2281         seq_pad(seq, '\n');
2282         return 0;
2283 }
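/*
 * tcp4_seq_show() emits one fixed-width line (TMPSZ) per socket under the
 * header printed for SEQ_START_TOKEN, using a separate formatter for full
 * sockets, request socks and TIME_WAIT minisocks.  Addresses and ports are
 * hexadecimal; the address is the raw 32-bit value, so on a little-endian
 * machine it reads byte-swapped relative to dotted-quad notation, while ports
 * go through ntohs() first.  For example, "0100007F:0016" in /proc/net/tcp on
 * such a machine denotes 127.0.0.1:22.
 */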
2284 
2285 static const struct file_operations tcp_afinfo_seq_fops = {
2286         .owner   = THIS_MODULE,
2287         .open    = tcp_seq_open,
2288         .read    = seq_read,
2289         .llseek  = seq_lseek,
2290         .release = seq_release_net
2291 };
2292 
2293 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2294         .name           = "tcp",
2295         .family         = AF_INET,
2296         .seq_fops       = &tcp_afinfo_seq_fops,
2297         .seq_ops        = {
2298                 .show           = tcp4_seq_show,
2299         },
2300 };
2301 
2302 static int __net_init tcp4_proc_init_net(struct net *net)
2303 {
2304         return tcp_proc_register(net, &tcp4_seq_afinfo);
2305 }
2306 
2307 static void __net_exit tcp4_proc_exit_net(struct net *net)
2308 {
2309         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2310 }
2311 
2312 static struct pernet_operations tcp4_net_ops = {
2313         .init = tcp4_proc_init_net,
2314         .exit = tcp4_proc_exit_net,
2315 };
2316 
2317 int __init tcp4_proc_init(void)
2318 {
2319         return register_pernet_subsys(&tcp4_net_ops);
2320 }
2321 
2322 void tcp4_proc_exit(void)
2323 {
2324         unregister_pernet_subsys(&tcp4_net_ops);
2325 }
2326 #endif /* CONFIG_PROC_FS */
2327 
2328 struct proto tcp_prot = {
2329         .name                   = "TCP",
2330         .owner                  = THIS_MODULE,
2331         .close                  = tcp_close,
2332         .connect                = tcp_v4_connect,
2333         .disconnect             = tcp_disconnect,
2334         .accept                 = inet_csk_accept,
2335         .ioctl                  = tcp_ioctl,
2336         .init                   = tcp_v4_init_sock,
2337         .destroy                = tcp_v4_destroy_sock,
2338         .shutdown               = tcp_shutdown,
2339         .setsockopt             = tcp_setsockopt,
2340         .getsockopt             = tcp_getsockopt,
2341         .recvmsg                = tcp_recvmsg,
2342         .sendmsg                = tcp_sendmsg,
2343         .sendpage               = tcp_sendpage,
2344         .backlog_rcv            = tcp_v4_do_rcv,
2345         .release_cb             = tcp_release_cb,
2346         .hash                   = inet_hash,
2347         .unhash                 = inet_unhash,
2348         .get_port               = inet_csk_get_port,
2349         .enter_memory_pressure  = tcp_enter_memory_pressure,
2350         .stream_memory_free     = tcp_stream_memory_free,
2351         .sockets_allocated      = &tcp_sockets_allocated,
2352         .orphan_count           = &tcp_orphan_count,
2353         .memory_allocated       = &tcp_memory_allocated,
2354         .memory_pressure        = &tcp_memory_pressure,
2355         .sysctl_mem             = sysctl_tcp_mem,
2356         .sysctl_wmem            = sysctl_tcp_wmem,
2357         .sysctl_rmem            = sysctl_tcp_rmem,
2358         .max_header             = MAX_TCP_HEADER,
2359         .obj_size               = sizeof(struct tcp_sock),
2360         .slab_flags             = SLAB_DESTROY_BY_RCU,
2361         .twsk_prot              = &tcp_timewait_sock_ops,
2362         .rsk_prot               = &tcp_request_sock_ops,
2363         .h.hashinfo             = &tcp_hashinfo,
2364         .no_autobind            = true,
2365 #ifdef CONFIG_COMPAT
2366         .compat_setsockopt      = compat_tcp_setsockopt,
2367         .compat_getsockopt      = compat_tcp_getsockopt,
2368 #endif
2369         .diag_destroy           = tcp_abort,
2370 };
2371 EXPORT_SYMBOL(tcp_prot);
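/*
 * tcp_prot is what a plain AF_INET stream socket ends up bound to: a
 * user-space call such as
 *
 *     int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 * is resolved by inet_create() (net/ipv4/af_inet.c) to this proto, so
 * connect(), sendmsg(), recvmsg() and the other system calls on fd dispatch
 * to the handlers listed above.
 */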
2372 
2373 static void __net_exit tcp_sk_exit(struct net *net)
2374 {
2375         int cpu;
2376 
2377         for_each_possible_cpu(cpu)
2378                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2379         free_percpu(net->ipv4.tcp_sk);
2380 }
2381 
2382 static int __net_init tcp_sk_init(struct net *net)
2383 {
2384         int res, cpu;
2385 
2386         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2387         if (!net->ipv4.tcp_sk)
2388                 return -ENOMEM;
2389 
2390         for_each_possible_cpu(cpu) {
2391                 struct sock *sk;
2392 
2393                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2394                                            IPPROTO_TCP, net);
2395                 if (res)
2396                         goto fail;
2397                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2398                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2399         }
2400 
2401         net->ipv4.sysctl_tcp_ecn = 2;
2402         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2403 
2404         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2405         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2406         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2407 
2408         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2409         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2410         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2411 
2412         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2413         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2414         net->ipv4.sysctl_tcp_syncookies = 1;
2415         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2416         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2417         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2418         net->ipv4.sysctl_tcp_orphan_retries = 0;
2419         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2420         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2421 
2422         return 0;
2423 fail:
2424         tcp_sk_exit(net);
2425 
2426         return res;
2427 }
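/*
 * tcp_sk_init() runs once per network namespace: it creates one raw control
 * socket per possible CPU (used by the RST/ACK reply paths earlier in this
 * file to transmit replies that are not tied to a full socket) and seeds the
 * per-namespace defaults.  Each field set here backs a sysctl under
 * /proc/sys/net/ipv4/, e.g. tcp_syncookies, tcp_ecn and tcp_fin_timeout, so
 * a freshly created namespace starts with these values.
 */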
2428 
2429 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2430 {
2431         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2432 }
2433 
2434 static struct pernet_operations __net_initdata tcp_sk_ops = {
2435        .init       = tcp_sk_init,
2436        .exit       = tcp_sk_exit,
2437        .exit_batch = tcp_sk_exit_batch,
2438 };
2439 
2440 void __init tcp_v4_init(void)
2441 {
2442         inet_hashinfo_init(&tcp_hashinfo);
2443         if (register_pernet_subsys(&tcp_sk_ops))
2444                 panic("Failed to create the TCP control socket.\n");
2445 }
2446 
