TOMOYO Linux Cross Reference
Linux/net/ipv4/tcp.c


  1 /*
  2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  3  *              operating system.  INET is implemented using the  BSD Socket
  4  *              interface as the means of communication with the user level.
  5  *
  6  *              Implementation of the Transmission Control Protocol(TCP).
  7  *
  8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
  9  *
 10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
 13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
 14  *              Florian La Roche, <flla@stud.uni-sb.de>
 15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
 17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
 19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 20  *              Jorge Cwik, <jorge@laser.satlink.net>
 21  *
 22  * Fixes:
 23  *              Alan Cox        :       Numerous verify_area() calls
 24  *              Alan Cox        :       Set the ACK bit on a reset
 25  *              Alan Cox        :       Stopped it crashing if it closed while
 26  *                                      sk->inuse=1 and was trying to connect
 27  *                                      (tcp_err()).
 28  *              Alan Cox        :       All icmp error handling was broken
 29  *                                      pointers passed were wrong and the
 30  *                                      socket was looked up backwards. Nobody
 31  *                                      tested any icmp error code obviously.
 32  *              Alan Cox        :       tcp_err() now handled properly. It
 33  *                                      wakes people on errors. poll
 34  *                                      behaves and the icmp error race
 35  *                                      has gone by moving it into sock.c
 36  *              Alan Cox        :       tcp_send_reset() fixed to work for
 37  *                                      everything not just packets for
 38  *                                      unknown sockets.
 39  *              Alan Cox        :       tcp option processing.
 40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
 41  *                                      syn rule wrong]
 42  *              Herp Rosmanith  :       More reset fixes
 43  *              Alan Cox        :       No longer acks invalid rst frames.
 44  *                                      Acking any kind of RST is right out.
 45  *              Alan Cox        :       Sets an ignore me flag on an rst
 46  *                                      receive otherwise odd bits of prattle
 47  *                                      escape still
 48  *              Alan Cox        :       Fixed another acking RST frame bug.
 49  *                                      Should stop LAN workplace lockups.
 50  *              Alan Cox        :       Some tidyups using the new skb list
 51  *                                      facilities
 52  *              Alan Cox        :       sk->keepopen now seems to work
 53  *              Alan Cox        :       Pulls options out correctly on accepts
 54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
 55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
 56  *                                      bit to skb ops.
 57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
 58  *                                      nasty.
 59  *              Alan Cox        :       Added some better commenting, as the
 60  *                                      tcp is hard to follow
 61  *              Alan Cox        :       Removed incorrect check for 20 * psh
 62  *      Michael O'Reilly        :       ack < copied bug fix.
 63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
 64  *              Alan Cox        :       FIN with no memory -> CRASH
 65  *              Alan Cox        :       Added socket option proto entries.
 66  *                                      Also added awareness of them to accept.
 67  *              Alan Cox        :       Added TCP options (SOL_TCP)
 68  *              Alan Cox        :       Switched wakeup calls to callbacks,
 69  *                                      so the kernel can layer network
 70  *                                      sockets.
 71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
 72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
 73  *              Alan Cox        :       RST frames sent on unsynchronised
 74  *                                      state ack error.
 75  *              Alan Cox        :       Put in missing check for SYN bit.
 76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
 77  *                                      window non shrink trick.
 78  *              Alan Cox        :       Added a couple of small NET2E timer
 79  *                                      fixes
 80  *              Charles Hedrick :       TCP fixes
 81  *              Toomas Tamm     :       TCP window fixes
 82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
 83  *              Charles Hedrick :       Rewrote most of it to actually work
 84  *              Linus           :       Rewrote tcp_read() and URG handling
 85  *                                      completely
 86  *              Gerhard Koerting:       Fixed some missing timer handling
 87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
 88  *              Gerhard Koerting:       PC/TCP workarounds
 89  *              Adam Caldwell   :       Assorted timer/timing errors
 90  *              Matthew Dillon  :       Fixed another RST bug
 91  *              Alan Cox        :       Move to kernel side addressing changes.
 92  *              Alan Cox        :       Beginning work on TCP fastpathing
 93  *                                      (not yet usable)
 94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
 95  *              Alan Cox        :       TCP fast path debugging
 96  *              Alan Cox        :       Window clamping
 97  *              Michael Riepe   :       Bug in tcp_check()
 98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
 99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up of retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or(at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
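
The states described above map onto values user space can observe through the
TCP_INFO socket option. A minimal sketch, assuming fd is an already-connected
TCP socket (the helper name is illustrative, not part of this file):

    #include <stdio.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>        /* struct tcp_info, TCP_INFO */

    /* Print the kernel's view of the socket's TCP state
     * (TCP_ESTABLISHED, TCP_FIN_WAIT1, ...). */
    static void print_tcp_state(int fd)
    {
            struct tcp_info info;
            socklen_t len = sizeof(info);

            if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
                    printf("tcpi_state = %u\n", info.tcpi_state);
            else
                    perror("getsockopt(TCP_INFO)");
    }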
249 
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 
260 #include <net/icmp.h>
261 #include <net/tcp.h>
262 #include <net/xfrm.h>
263 #include <net/ip.h>
264 
265 
266 #include <asm/uaccess.h>
267 #include <asm/ioctls.h>
268 
269 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
270 
271 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
272 
273 kmem_cache_t *tcp_openreq_cachep;
274 kmem_cache_t *tcp_bucket_cachep;
275 kmem_cache_t *tcp_timewait_cachep;
276 
277 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278 
279 int sysctl_tcp_mem[3];
280 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
281 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
282 
283 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
284 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
285 
286 /* Pressure flag: try to collapse.
287  * Technical note: it is used by multiple contexts non atomically.
288  * All the tcp_mem_schedule() is of this nature: accounting
289  * is strict, actions are advisory and have some latency. */
290 int tcp_memory_pressure;
291 
292 #define TCP_PAGES(amt) (((amt) + TCP_MEM_QUANTUM - 1) / TCP_MEM_QUANTUM)
293 
294 int tcp_mem_schedule(struct sock *sk, int size, int kind)
295 {
296         int amt = TCP_PAGES(size);
297 
298         sk->sk_forward_alloc += amt * TCP_MEM_QUANTUM;
299         atomic_add(amt, &tcp_memory_allocated);
300 
301         /* Under limit. */
302         if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
303                 if (tcp_memory_pressure)
304                         tcp_memory_pressure = 0;
305                 return 1;
306         }
307 
308         /* Over hard limit. */
309         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
310                 tcp_enter_memory_pressure();
311                 goto suppress_allocation;
312         }
313 
314         /* Under pressure. */
315         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
316                 tcp_enter_memory_pressure();
317 
318         if (kind) {
319                 if (atomic_read(&sk->sk_rmem_alloc) < sysctl_tcp_rmem[0])
320                         return 1;
321         } else if (sk->sk_wmem_queued < sysctl_tcp_wmem[0])
322                 return 1;
323 
324         if (!tcp_memory_pressure ||
325             sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated) *
326                                 TCP_PAGES(sk->sk_wmem_queued +
327                                           atomic_read(&sk->sk_rmem_alloc) +
328                                           sk->sk_forward_alloc))
329                 return 1;
330 
331 suppress_allocation:
332 
333         if (!kind) {
334                 tcp_moderate_sndbuf(sk);
335 
336                 /* Fail only if socket is _under_ its sndbuf.
337                  * In this case we cannot block, so that we have to fail.
338                  */
339                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
340                         return 1;
341         }
342 
343         /* Alas. Undo changes. */
344         sk->sk_forward_alloc -= amt * TCP_MEM_QUANTUM;
345         atomic_sub(amt, &tcp_memory_allocated);
346         return 0;
347 }
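
For illustration, the round-up that TCP_PAGES() performs can be reproduced
stand-alone; a sketch assuming TCP_MEM_QUANTUM equals a 4096-byte page (the
exact value depends on the architecture's PAGE_SIZE):

    #include <stdio.h>

    #define MEM_QUANTUM 4096                          /* assumed quantum size */
    #define PAGES(amt)  (((amt) + MEM_QUANTUM - 1) / MEM_QUANTUM)

    int main(void)
    {
            /* Charging 6000 bytes rounds up to 2 quanta, so the socket's
             * forward_alloc would be credited with 2 * 4096 = 8192 bytes. */
            printf("PAGES(6000) = %d\n", PAGES(6000));   /* prints 2 */
            printf("PAGES(4096) = %d\n", PAGES(4096));   /* prints 1 */
            return 0;
    }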
348 
349 void __tcp_mem_reclaim(struct sock *sk)
350 {
351         if (sk->sk_forward_alloc >= TCP_MEM_QUANTUM) {
352                 atomic_sub(sk->sk_forward_alloc / TCP_MEM_QUANTUM,
353                            &tcp_memory_allocated);
354                 sk->sk_forward_alloc &= TCP_MEM_QUANTUM - 1;
355                 if (tcp_memory_pressure &&
356                     atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
357                         tcp_memory_pressure = 0;
358         }
359 }
360 
361 void tcp_rfree(struct sk_buff *skb)
362 {
363         struct sock *sk = skb->sk;
364 
365         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
366         sk->sk_forward_alloc += skb->truesize;
367 }
368 
369 /*
370  * LISTEN is a special case for poll..
371  */
372 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
373                                                poll_table *wait)
374 {
375         return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
376 }
377 
378 /*
379  *      Wait for a TCP event.
380  *
381  *      Note that we don't need to lock the socket, as the upper poll layers
382  *      take care of normal races (between the test and the event) and we don't
383  *      go look at any of the socket buffers directly.
384  */
385 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
386 {
387         unsigned int mask;
388         struct sock *sk = sock->sk;
389         struct tcp_opt *tp = tcp_sk(sk);
390 
391         poll_wait(file, sk->sk_sleep, wait);
392         if (sk->sk_state == TCP_LISTEN)
393                 return tcp_listen_poll(sk, wait);
394 
395         /* Socket is not locked. We are protected from async events
396            by poll logic and correct handling of state changes
397            made by other threads is impossible in any case.
398          */
399 
400         mask = 0;
401         if (sk->sk_err)
402                 mask = POLLERR;
403 
404         /*
405          * POLLHUP is certainly not done right. But poll() doesn't
406          * have a notion of HUP in just one direction, and for a
407          * socket the read side is more interesting.
408          *
409          * Some poll() documentation says that POLLHUP is incompatible
410          * with the POLLOUT/POLLWR flags, so somebody should check this
411          * all. But careful, it tends to be safer to return too many
412          * bits than too few, and you can easily break real applications
413          * if you don't tell them that something has hung up!
414          *
415          * Check-me.
416          *
417          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
418          * our fs/select.c). It means that after we received EOF,
419          * poll always returns immediately, making impossible poll() on write()
420          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
421          * if and only if shutdown has been made in both directions.
422          * Actually, it is interesting to look how Solaris and DUX
423          * solve this dilemma. I would prefer, if POLLHUP were maskable,
424          * then we could set it on SND_SHUTDOWN. BTW examples given
425          * in Stevens' books assume exactly this behaviour, it explains
426          * why POLLHUP is incompatible with POLLOUT.    --ANK
427          *
428          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
429          * blocking on fresh not-connected or disconnected socket. --ANK
430          */
431         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
432                 mask |= POLLHUP;
433         if (sk->sk_shutdown & RCV_SHUTDOWN)
434                 mask |= POLLIN | POLLRDNORM;
435 
436         /* Connected? */
437         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
438                 /* Potential race condition. If the read of tp below
439                  * escapes above sk->sk_state, we can be illegally awakened
440                  * in SYN_* states. */
441                 if ((tp->rcv_nxt != tp->copied_seq) &&
442                     (tp->urg_seq != tp->copied_seq ||
443                      tp->rcv_nxt != tp->copied_seq + 1 ||
444                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
445                         mask |= POLLIN | POLLRDNORM;
446 
447                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
448                         if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
449                                 mask |= POLLOUT | POLLWRNORM;
450                         } else {  /* send SIGIO later */
451                                 set_bit(SOCK_ASYNC_NOSPACE,
452                                         &sk->sk_socket->flags);
453                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
454 
455                                 /* Race breaker. If space is freed after
456                                  * wspace test but before the flags are set,
457                                  * IO signal will be lost.
458                                  */
459                                 if (tcp_wspace(sk) >= tcp_min_write_space(sk))
460                                         mask |= POLLOUT | POLLWRNORM;
461                         }
462                 }
463 
464                 if (tp->urg_data & TCP_URG_VALID)
465                         mask |= POLLPRI;
466         }
467         return mask;
468 }
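
From user space, the mask assembled above is what poll(2) reports; a minimal
sketch, assuming fd is a connected TCP socket, that distinguishes readable
data, writable space, urgent data and hang-up:

    #include <poll.h>
    #include <stdio.h>

    static void report_socket_events(int fd)
    {
            struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT | POLLPRI };

            if (poll(&pfd, 1, 5000) > 0) {               /* wait up to 5 seconds */
                    if (pfd.revents & POLLIN)
                            printf("data (or FIN) ready to read\n");
                    if (pfd.revents & POLLOUT)
                            printf("send buffer has space\n");
                    if (pfd.revents & POLLPRI)
                            printf("urgent data pending\n");
                    if (pfd.revents & POLLHUP)
                            printf("both directions shut down\n");
            }
    }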
469 
470 /*
471  *      TCP socket write_space callback.
472  */
473 void tcp_write_space(struct sock *sk)
474 {
475         struct socket *sock = sk->sk_socket;
476 
477         if (tcp_wspace(sk) >= tcp_min_write_space(sk) && sock) {
478                 clear_bit(SOCK_NOSPACE, &sock->flags);
479 
480                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
481                         wake_up_interruptible(sk->sk_sleep);
482 
483                 if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
484                         sock_wake_async(sock, 2, POLL_OUT);
485         }
486 }
487 
488 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
489 {
490         struct tcp_opt *tp = tcp_sk(sk);
491         int answ;
492 
493         switch (cmd) {
494         case SIOCINQ:
495                 if (sk->sk_state == TCP_LISTEN)
496                         return -EINVAL;
497 
498                 lock_sock(sk);
499                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
500                         answ = 0;
501                 else if (sock_flag(sk, SOCK_URGINLINE) ||
502                          !tp->urg_data ||
503                          before(tp->urg_seq, tp->copied_seq) ||
504                          !before(tp->urg_seq, tp->rcv_nxt)) {
505                         answ = tp->rcv_nxt - tp->copied_seq;
506 
507                         /* Subtract 1, if FIN is in queue. */
508                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
509                                 answ -=
510                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
511                 } else
512                         answ = tp->urg_seq - tp->copied_seq;
513                 release_sock(sk);
514                 break;
515         case SIOCATMARK:
516                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
517                 break;
518         case SIOCOUTQ:
519                 if (sk->sk_state == TCP_LISTEN)
520                         return -EINVAL;
521 
522                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
523                         answ = 0;
524                 else
525                         answ = tp->write_seq - tp->snd_una;
526                 break;
527         default:
528                 return -ENOIOCTLCMD;
529         };
530 
531         return put_user(answ, (int *)arg);
532 }
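
The three ioctls handled above have straightforward user-space callers; a
sketch, again assuming fd is a TCP socket (helper name illustrative):

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/sockios.h>      /* SIOCINQ, SIOCOUTQ, SIOCATMARK */

    static void show_queue_state(int fd)
    {
            int inq = 0, outq = 0, at_mark = 0;

            if (ioctl(fd, SIOCINQ, &inq) == 0)       /* unread bytes in receive queue */
                    printf("receive queue: %d bytes\n", inq);
            if (ioctl(fd, SIOCOUTQ, &outq) == 0)     /* bytes queued, not yet acked */
                    printf("send queue: %d bytes\n", outq);
            if (ioctl(fd, SIOCATMARK, &at_mark) == 0)
                    printf("at urgent mark: %s\n", at_mark ? "yes" : "no");
    }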
533 
534 
535 int tcp_listen_start(struct sock *sk)
536 {
537         struct inet_opt *inet = inet_sk(sk);
538         struct tcp_opt *tp = tcp_sk(sk);
539         struct tcp_listen_opt *lopt;
540 
541         sk->sk_max_ack_backlog = 0;
542         sk->sk_ack_backlog = 0;
543         tp->accept_queue = tp->accept_queue_tail = NULL;
544         tp->syn_wait_lock = RW_LOCK_UNLOCKED;
545         tcp_delack_init(tp);
546 
547         lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
548         if (!lopt)
549                 return -ENOMEM;
550 
551         memset(lopt, 0, sizeof(struct tcp_listen_opt));
552         for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
553                 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
554                         break;
555         get_random_bytes(&lopt->hash_rnd, 4);
556 
557         write_lock_bh(&tp->syn_wait_lock);
558         tp->listen_opt = lopt;
559         write_unlock_bh(&tp->syn_wait_lock);
560 
561         /* There is race window here: we announce ourselves listening,
562          * but this transition is still not validated by get_port().
563          * It is OK, because this socket enters to hash table only
564          * after validation is complete.
565          */
566         sk->sk_state = TCP_LISTEN;
567         if (!sk->sk_prot->get_port(sk, inet->num)) {
568                 inet->sport = htons(inet->num);
569 
570                 sk_dst_reset(sk);
571                 sk->sk_prot->hash(sk);
572 
573                 return 0;
574         }
575 
576         sk->sk_state = TCP_CLOSE;
577         write_lock_bh(&tp->syn_wait_lock);
578         tp->listen_opt = NULL;
579         write_unlock_bh(&tp->syn_wait_lock);
580         kfree(lopt);
581         return -EADDRINUSE;
582 }
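
tcp_listen_start() is what backs a user-space listen(2) call; a sketch of the
typical caller (port 8080 and the backlog of 128 are arbitrary choices):

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <netinet/in.h>

    int main(void)
    {
            int fd = socket(AF_INET, SOCK_STREAM, 0);
            struct sockaddr_in addr;

            memset(&addr, 0, sizeof(addr));
            addr.sin_family = AF_INET;
            addr.sin_addr.s_addr = htonl(INADDR_ANY);
            addr.sin_port = htons(8080);

            if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
                listen(fd, 128) < 0) {               /* pending-connection backlog */
                    perror("socket/bind/listen");
                    return 1;
            }
            /* The socket is now in TCP_LISTEN; accept() drains its accept queue. */
            close(fd);
            return 0;
    }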
583 
584 /*
585  *      This routine closes sockets which have been at least partially
586  *      opened, but not yet accepted.
587  */
588 
589 static void tcp_listen_stop (struct sock *sk)
590 {
591         struct tcp_opt *tp = tcp_sk(sk);
592         struct tcp_listen_opt *lopt = tp->listen_opt;
593         struct open_request *acc_req = tp->accept_queue;
594         struct open_request *req;
595         int i;
596 
597         tcp_delete_keepalive_timer(sk);
598 
599         /* make all the listen_opt local to us */
600         write_lock_bh(&tp->syn_wait_lock);
601         tp->listen_opt = NULL;
602         write_unlock_bh(&tp->syn_wait_lock);
603         tp->accept_queue = tp->accept_queue_tail = NULL;
604 
605         if (lopt->qlen) {
606                 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
607                         while ((req = lopt->syn_table[i]) != NULL) {
608                                 lopt->syn_table[i] = req->dl_next;
609                                 lopt->qlen--;
610                                 tcp_openreq_free(req);
611 
612                 /* Following specs, it would be better either to send FIN
613                  * (and enter FIN-WAIT-1, it is normal close)
614                  * or to send active reset (abort).
615                  * Certainly, it is pretty dangerous while synflood, but it is
616                  * bad justification for our negligence 8)
617                  * To be honest, we are not able to make either
618                  * of the variants now.                 --ANK
619                  */
620                         }
621                 }
622         }
623         BUG_TRAP(!lopt->qlen);
624 
625         kfree(lopt);
626 
627         while ((req = acc_req) != NULL) {
628                 struct sock *child = req->sk;
629 
630                 acc_req = req->dl_next;
631 
632                 local_bh_disable();
633                 bh_lock_sock(child);
634                 BUG_TRAP(!sock_owned_by_user(child));
635                 sock_hold(child);
636 
637                 tcp_disconnect(child, O_NONBLOCK);
638 
639                 sock_orphan(child);
640 
641                 atomic_inc(&tcp_orphan_count);
642 
643                 tcp_destroy_sock(child);
644 
645                 bh_unlock_sock(child);
646                 local_bh_enable();
647                 sock_put(child);
648 
649                 tcp_acceptq_removed(sk);
650                 tcp_openreq_fastfree(req);
651         }
652         BUG_TRAP(!sk->sk_ack_backlog);
653 }
654 
655 /*
656  *      Wait for a socket to get into the connected state
657  *
658  *      Note: Must be called with the socket locked.
659  */
660 static int wait_for_tcp_connect(struct sock *sk, int flags, long *timeo_p)
661 {
662         struct tcp_opt *tp = tcp_sk(sk);
663         struct task_struct *tsk = current;
664         DEFINE_WAIT(wait);
665 
666         while ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
667                 if (sk->sk_err)
668                         return sock_error(sk);
669                 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
670                         return -EPIPE;
671                 if (!*timeo_p)
672                         return -EAGAIN;
673                 if (signal_pending(tsk))
674                         return sock_intr_errno(*timeo_p);
675 
676                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
677                 tp->write_pending++;
678 
679                 release_sock(sk);
680                 *timeo_p = schedule_timeout(*timeo_p);
681                 lock_sock(sk);
682 
683                 finish_wait(sk->sk_sleep, &wait);
684                 tp->write_pending--;
685         }
686         return 0;
687 }
688 
689 static inline int tcp_memory_free(struct sock *sk)
690 {
691         return sk->sk_wmem_queued < sk->sk_sndbuf;
692 }
693 
694 /*
695  *      Wait for more memory for a socket
696  */
697 static int wait_for_tcp_memory(struct sock *sk, long *timeo)
698 {
699         struct tcp_opt *tp = tcp_sk(sk);
700         int err = 0;
701         long vm_wait = 0;
702         long current_timeo = *timeo;
703         DEFINE_WAIT(wait);
704 
705         if (tcp_memory_free(sk))
706                 current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;
707 
708         for (;;) {
709                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
710 
711                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
712 
713                 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
714                         goto do_error;
715                 if (!*timeo)
716                         goto do_nonblock;
717                 if (signal_pending(current))
718                         goto do_interrupted;
719                 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
720                 if (tcp_memory_free(sk) && !vm_wait)
721                         break;
722 
723                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
724                 tp->write_pending++;
725                 release_sock(sk);
726                 if (!tcp_memory_free(sk) || vm_wait)
727                         current_timeo = schedule_timeout(current_timeo);
728                 lock_sock(sk);
729                 tp->write_pending--;
730 
731                 if (vm_wait) {
732                         vm_wait -= current_timeo;
733                         current_timeo = *timeo;
734                         if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
735                             (current_timeo -= vm_wait) < 0)
736                                 current_timeo = 0;
737                         vm_wait = 0;
738                 }
739                 *timeo = current_timeo;
740         }
741 out:
742         finish_wait(sk->sk_sleep, &wait);
743         return err;
744 
745 do_error:
746         err = -EPIPE;
747         goto out;
748 do_nonblock:
749         err = -EAGAIN;
750         goto out;
751 do_interrupted:
752         err = sock_intr_errno(*timeo);
753         goto out;
754 }
755 
756 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
757                          size_t psize, int flags);
758 
759 static inline int can_coalesce(struct sk_buff *skb, int i, struct page *page,
760                                int off)
761 {
762         if (i) {
763                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
764                 return page == frag->page &&
765                        off == frag->page_offset + frag->size;
766         }
767         return 0;
768 }
769 
770 static inline void fill_page_desc(struct sk_buff *skb, int i,
771                                   struct page *page, int off, int size)
772 {
773         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
774         frag->page = page;
775         frag->page_offset = off;
776         frag->size = size;
777         skb_shinfo(skb)->nr_frags = i + 1;
778 }
779 
780 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
781 {
782         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
783         tp->pushed_seq = tp->write_seq;
784 }
785 
786 static inline int forced_push(struct tcp_opt *tp)
787 {
788         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
789 }
790 
791 static inline void skb_entail(struct sock *sk, struct tcp_opt *tp,
792                               struct sk_buff *skb)
793 {
794         skb->csum = 0;
795         TCP_SKB_CB(skb)->seq = tp->write_seq;
796         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
797         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
798         TCP_SKB_CB(skb)->sacked = 0;
799         __skb_queue_tail(&sk->sk_write_queue, skb);
800         tcp_charge_skb(sk, skb);
801         if (!tp->send_head)
802                 tp->send_head = skb;
803         else if (tp->nonagle&TCP_NAGLE_PUSH)
804                 tp->nonagle &= ~TCP_NAGLE_PUSH; 
805 }
806 
807 static inline void tcp_mark_urg(struct tcp_opt *tp, int flags,
808                                 struct sk_buff *skb)
809 {
810         if (flags & MSG_OOB) {
811                 tp->urg_mode = 1;
812                 tp->snd_up = tp->write_seq;
813                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
814         }
815 }
816 
817 static inline void tcp_push(struct sock *sk, struct tcp_opt *tp, int flags,
818                             int mss_now, int nonagle)
819 {
820         if (tp->send_head) {
821                 struct sk_buff *skb = sk->sk_write_queue.prev;
822                 if (!(flags & MSG_MORE) || forced_push(tp))
823                         tcp_mark_push(tp, skb);
824                 tcp_mark_urg(tp, flags, skb);
825                 __tcp_push_pending_frames(sk, tp, mss_now,
826                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
827         }
828 }
829 
830 static int tcp_error(struct sock *sk, int flags, int err)
831 {
832         if (err == -EPIPE)
833                 err = sock_error(sk) ? : -EPIPE;
834         if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
835                 send_sig(SIGPIPE, current, 0);
836         return err;
837 }
838 
839 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
840                          size_t psize, int flags)
841 {
842         struct tcp_opt *tp = tcp_sk(sk);
843         int mss_now;
844         int err;
845         ssize_t copied;
846         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
847 
848         /* Wait for a connection to finish. */
849         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
850                 if ((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
851                         goto out_err;
852 
853         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
854 
855         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
856         copied = 0;
857 
858         err = -EPIPE;
859         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
860                 goto do_error;
861 
862         while (psize > 0) {
863                 struct sk_buff *skb = sk->sk_write_queue.prev;
864                 struct page *page = pages[poffset / PAGE_SIZE];
865                 int copy, i;
866                 int offset = poffset % PAGE_SIZE;
867                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
868 
869                 if (!tp->send_head || (copy = mss_now - skb->len) <= 0) {
870 new_segment:
871                         if (!tcp_memory_free(sk))
872                                 goto wait_for_sndbuf;
873 
874                         skb = tcp_alloc_pskb(sk, 0, tp->mss_cache,
875                                              sk->sk_allocation);
876                         if (!skb)
877                                 goto wait_for_memory;
878 
879                         skb_entail(sk, tp, skb);
880                         copy = mss_now;
881                 }
882 
883                 if (copy > size)
884                         copy = size;
885 
886                 i = skb_shinfo(skb)->nr_frags;
887                 if (can_coalesce(skb, i, page, offset)) {
888                         skb_shinfo(skb)->frags[i - 1].size += copy;
889                 } else if (i < MAX_SKB_FRAGS) {
890                         get_page(page);
891                         fill_page_desc(skb, i, page, offset, copy);
892                 } else {
893                         tcp_mark_push(tp, skb);
894                         goto new_segment;
895                 }
896 
897                 skb->len += copy;
898                 skb->data_len += copy;
899                 skb->ip_summed = CHECKSUM_HW;
900                 tp->write_seq += copy;
901                 TCP_SKB_CB(skb)->end_seq += copy;
902 
903                 if (!copied)
904                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
905 
906                 copied += copy;
907                 poffset += copy;
908                 if (!(psize -= copy))
909                         goto out;
910 
911                 if (skb->len != mss_now || (flags & MSG_OOB))
912                         continue;
913 
914                 if (forced_push(tp)) {
915                         tcp_mark_push(tp, skb);
916                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
917                 } else if (skb == tp->send_head)
918                         tcp_push_one(sk, mss_now);
919                 continue;
920 
921 wait_for_sndbuf:
922                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
923 wait_for_memory:
924                 if (copied)
925                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
926 
927                 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
928                         goto do_error;
929 
930                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
931         }
932 
933 out:
934         if (copied)
935                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
936         return copied;
937 
938 do_error:
939         if (copied)
940                 goto out;
941 out_err:
942         return tcp_error(sk, flags, err);
943 }
944 
945 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
946                      size_t size, int flags)
947 {
948         ssize_t res;
949         struct sock *sk = sock->sk;
950 
951 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
952 
953         if (!(sk->sk_route_caps & NETIF_F_SG) ||
954             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
955                 return sock_no_sendpage(sock, page, offset, size, flags);
956 
957 #undef TCP_ZC_CSUM_FLAGS
958 
959         lock_sock(sk);
960         TCP_CHECK_TIMER(sk);
961         res = do_tcp_sendpages(sk, &page, offset, size, flags);
962         TCP_CHECK_TIMER(sk);
963         release_sock(sk);
964         return res;
965 }
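
do_tcp_sendpages()/tcp_sendpage() sit underneath zero-copy paths such as
sendfile(2) on a TCP socket; when the route lacks scatter-gather or checksum
offload, the code above falls back to sock_no_sendpage(). A user-space sketch
of the usual caller, assuming in_fd is a regular file and out_fd a connected
TCP socket:

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/sendfile.h>

    /* Push 'count' bytes from a file to a TCP socket without copying the
     * data through user space. */
    static int send_file_over_tcp(int out_fd, int in_fd, size_t count)
    {
            off_t offset = 0;

            while (count > 0) {
                    ssize_t sent = sendfile(out_fd, in_fd, &offset, count);

                    if (sent <= 0) {
                            perror("sendfile");
                            return -1;
                    }
                    count -= sent;
            }
            return 0;
    }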
966 
967 #define TCP_PAGE(sk)    (inet_sk(sk)->sndmsg_page)
968 #define TCP_OFF(sk)     (inet_sk(sk)->sndmsg_off)
969 
970 static inline int tcp_copy_to_page(struct sock *sk, char *from,
971                                    struct sk_buff *skb, struct page *page,
972                                    int off, int copy)
973 {
974         int err = 0;
975         unsigned int csum;
976 
977         if (skb->ip_summed == CHECKSUM_NONE) {
978                 csum = csum_and_copy_from_user(from, page_address(page) + off,
979                                        copy, 0, &err);
980                 if (err) return err;
981                 skb->csum = csum_block_add(skb->csum, csum, skb->len);
982         } else {
983                 if (copy_from_user(page_address(page) + off, from, copy))
984                         return -EFAULT;
985         }
986 
987         skb->len += copy;
988         skb->data_len += copy;
989         skb->truesize += copy;
990         sk->sk_wmem_queued += copy;
991         sk->sk_forward_alloc -= copy;
992         return 0;
993 }
994 
995 static inline int skb_add_data(struct sk_buff *skb, char *from, int copy)
996 {
997         int err = 0;
998         unsigned int csum;
999         int off = skb->len;
1000 
1001         if (skb->ip_summed == CHECKSUM_NONE) {
1002                 csum = csum_and_copy_from_user(from, skb_put(skb, copy),
1003                                        copy, 0, &err);
1004                 if (!err) {
1005                         skb->csum = csum_block_add(skb->csum, csum, off);
1006                         return 0;
1007                 }
1008         } else {
1009                 if (!copy_from_user(skb_put(skb, copy), from, copy))
1010                         return 0;
1011         }
1012 
1013         __skb_trim(skb, off);
1014         return -EFAULT;
1015 }
1016 
1017 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
1018 {
1019         int tmp = tp->mss_cache_std;
1020 
1021         if (sk->sk_route_caps & NETIF_F_SG) {
1022                 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1023 
1024                 if (tmp >= pgbreak &&
1025                     tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
1026                         tmp = pgbreak;
1027         }
1028         return tmp;
1029 }
1030 
1031 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1032                 int size)
1033 {
1034         struct iovec *iov;
1035         struct tcp_opt *tp = tcp_sk(sk);
1036         struct sk_buff *skb;
1037         int iovlen, flags;
1038         int mss_now;
1039         int err, copied;
1040         long timeo;
1041 
1042         lock_sock(sk);
1043         TCP_CHECK_TIMER(sk);
1044 
1045         flags = msg->msg_flags;
1046         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1047 
1048         /* Wait for a connection to finish. */
1049         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1050                 if ((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
1051                         goto out_err;
1052 
1053         /* This should be in poll */
1054         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1055 
1056         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1057 
1058         /* Ok commence sending. */
1059         iovlen = msg->msg_iovlen;
1060         iov = msg->msg_iov;
1061         copied = 0;
1062 
1063         err = -EPIPE;
1064         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1065                 goto do_error;
1066 
1067         while (--iovlen >= 0) {
1068                 int seglen = iov->iov_len;
1069                 unsigned char *from = iov->iov_base;
1070 
1071                 iov++;
1072 
1073                 while (seglen > 0) {
1074                         int copy;
1075 
1076                         skb = sk->sk_write_queue.prev;
1077 
1078                         if (!tp->send_head ||
1079                             (copy = mss_now - skb->len) <= 0) {
1080 
1081 new_segment:
1082                                 /* Allocate new segment. If the interface is SG,
1083                                  * allocate skb fitting to single page.
1084                                  */
1085                                 if (!tcp_memory_free(sk))
1086                                         goto wait_for_sndbuf;
1087 
1088                                 skb = tcp_alloc_pskb(sk, select_size(sk, tp),
1089                                                      0, sk->sk_allocation);
1090                                 if (!skb)
1091                                         goto wait_for_memory;
1092 
1093                                 /*
1094                                  * Check whether we can use HW checksum.
1095                                  */
1096                                 if (sk->sk_route_caps &
1097                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
1098                                      NETIF_F_HW_CSUM))
1099                                         skb->ip_summed = CHECKSUM_HW;
1100 
1101                                 skb_entail(sk, tp, skb);
1102                                 copy = mss_now;
1103                         }
1104 
1105                         /* Try to append data to the end of skb. */
1106                         if (copy > seglen)
1107                                 copy = seglen;
1108 
1109                         /* Where to copy to? */
1110                         if (skb_tailroom(skb) > 0) {
1111                                 /* We have some space in skb head. Superb! */
1112                                 if (copy > skb_tailroom(skb))
1113                                         copy = skb_tailroom(skb);
1114                                 if ((err = skb_add_data(skb, from, copy)) != 0)
1115                                         goto do_fault;
1116                         } else {
1117                                 int merge = 0;
1118                                 int i = skb_shinfo(skb)->nr_frags;
1119                                 struct page *page = TCP_PAGE(sk);
1120                                 int off = TCP_OFF(sk);
1121 
1122                                 if (can_coalesce(skb, i, page, off) &&
1123                                     off != PAGE_SIZE) {
1124                                         /* We can extend the last page
1125                                          * fragment. */
1126                                         merge = 1;
1127                                 } else if (i == MAX_SKB_FRAGS ||
1128                                            (!i &&
1129                                            !(sk->sk_route_caps & NETIF_F_SG))) {
1130                                         /* Need to add new fragment and cannot
1131                                          * do this because interface is non-SG,
1132                                          * or because all the page slots are
1133                                          * busy. */
1134                                         tcp_mark_push(tp, skb);
1135                                         goto new_segment;
1136                                 } else if (page) {
1137                                         /* If page is cached, align
1138                                          * offset to L1 cache boundary
1139                                          */
1140                                         off = (off + L1_CACHE_BYTES - 1) &
1141                                               ~(L1_CACHE_BYTES - 1);
1142                                         if (off == PAGE_SIZE) {
1143                                                 put_page(page);
1144                                                 TCP_PAGE(sk) = page = NULL;
1145                                         }
1146                                 }
1147 
1148                                 if (!page) {
1149                                         /* Allocate new cache page. */
1150                                         if (!(page = tcp_alloc_page(sk)))
1151                                                 goto wait_for_memory;
1152                                         off = 0;
1153                                 }
1154 
1155                                 if (copy > PAGE_SIZE - off)
1156                                         copy = PAGE_SIZE - off;
1157 
1158                                 /* Time to copy data. We are close to
1159                                  * the end! */
1160                                 err = tcp_copy_to_page(sk, from, skb, page,
1161                                                        off, copy);
1162                                 if (err) {
1163                                         /* If this page was new, give it to the
1164                                          * socket so it does not get leaked.
1165                                          */
1166                                         if (!TCP_PAGE(sk)) {
1167                                                 TCP_PAGE(sk) = page;
1168                                                 TCP_OFF(sk) = 0;
1169                                         }
1170                                         goto do_error;
1171                                 }
1172 
1173                                 /* Update the skb. */
1174                                 if (merge) {
1175                                         skb_shinfo(skb)->frags[i - 1].size +=
1176                                                                         copy;
1177                                 } else {
1178                                         fill_page_desc(skb, i, page, off, copy);
1179                                         if (TCP_PAGE(sk)) {
1180                                                 get_page(page);
1181                                         } else if (off + copy < PAGE_SIZE) {
1182                                                 get_page(page);
1183                                                 TCP_PAGE(sk) = page;
1184                                         }
1185                                 }
1186 
1187                                 TCP_OFF(sk) = off + copy;
1188                         }
1189 
1190                         if (!copied)
1191                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1192 
1193                         tp->write_seq += copy;
1194                         TCP_SKB_CB(skb)->end_seq += copy;
1195 
1196                         from += copy;
1197                         copied += copy;
1198                         if ((seglen -= copy) == 0 && iovlen == 0)
1199                                 goto out;
1200 
1201                         if (skb->len != mss_now || (flags & MSG_OOB))
1202                                 continue;
1203 
1204                         if (forced_push(tp)) {
1205                                 tcp_mark_push(tp, skb);
1206                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
1207                         } else if (skb == tp->send_head)
1208                                 tcp_push_one(sk, mss_now);
1209                         continue;
1210 
1211 wait_for_sndbuf:
1212                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1213 wait_for_memory:
1214                         if (copied)
1215                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1216 
1217                         if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
1218                                 goto do_error;
1219 
1220                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1221                 }
1222         }
1223 
1224 out:
1225         if (copied)
1226                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1227         TCP_CHECK_TIMER(sk);
1228         release_sock(sk);
1229         return copied;
1230 
1231 do_fault:
1232         if (!skb->len) {
1233                 if (tp->send_head == skb)
1234                         tp->send_head = NULL;
1235                 __skb_unlink(skb, skb->list);
1236                 tcp_free_skb(sk, skb);
1237         }
1238 
1239 do_error:
1240         if (copied)
1241                 goto out;
1242 out_err:
1243         err = tcp_error(sk, flags, err);
1244         TCP_CHECK_TIMER(sk);
1245         release_sock(sk);
1246         return err;
1247 }
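/*
 * Editor's note: illustrative user-space sketch, not part of tcp.c.
 * In the slow path above, tcp_push() is called with MSG_MORE masked out
 * only because we are about to wait for memory; otherwise a sender's
 * MSG_MORE is honoured and a partial segment may be held back, much
 * like a per-call TCP_CORK:
 *
 *	#include <sys/socket.h>
 *
 *	static void send_two_parts(int fd, const void *hdr, size_t hlen,
 *				   const void *body, size_t blen)
 *	{
 *		send(fd, hdr, hlen, MSG_MORE);	// hint: more data follows
 *		send(fd, body, blen, 0);	// last piece, push it out
 *	}
 */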
1248 
1249 /*
1250  *      Handle reading urgent data. BSD has very simple semantics for
1251  *      this, no blocking and very strange errors 8)
1252  */
1253 
1254 static int tcp_recv_urg(struct sock *sk, long timeo,
1255                         struct msghdr *msg, int len, int flags,
1256                         int *addr_len)
1257 {
1258         struct tcp_opt *tp = tcp_sk(sk);
1259 
1260         /* No URG data to read. */
1261         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1262             tp->urg_data == TCP_URG_READ)
1263                 return -EINVAL; /* Yes this is right ! */
1264 
1265         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1266                 return -ENOTCONN;
1267 
1268         if (tp->urg_data & TCP_URG_VALID) {
1269                 int err = 0;
1270                 char c = tp->urg_data;
1271 
1272                 if (!(flags & MSG_PEEK))
1273                         tp->urg_data = TCP_URG_READ;
1274 
1275                 /* Read urgent data. */
1276                 msg->msg_flags |= MSG_OOB;
1277 
1278                 if (len > 0) {
1279                         if (!(flags & MSG_TRUNC))
1280                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1281                         len = 1;
1282                 } else
1283                         msg->msg_flags |= MSG_TRUNC;
1284 
1285                 return err ? -EFAULT : len;
1286         }
1287 
1288         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1289                 return 0;
1290 
1291         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1292          * the available implementations agree in this case:
1293          * this call should never block, independent of the
1294          * blocking state of the socket.
1295          * Mike <pall@rz.uni-karlsruhe.de>
1296          */
1297         return -EAGAIN;
1298 }
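/*
 * Editor's note: illustrative user-space sketch, not part of tcp.c.
 * Seen from the caller, the semantics implemented above mean that
 * recv(..., MSG_OOB) either returns the single urgent byte, or fails
 * with EINVAL (no urgent data, already read, or SO_OOBINLINE is set)
 * or EAGAIN (the urgent byte was announced but has not arrived yet);
 * it never blocks.
 *
 *	#include <sys/socket.h>
 *	#include <errno.h>
 *
 *	static int read_oob_byte(int fd, char *out)
 *	{
 *		ssize_t n = recv(fd, out, 1, MSG_OOB);
 *
 *		if (n == 1)
 *			return 0;	// got the urgent byte
 *		if (n < 0 && (errno == EINVAL || errno == EAGAIN))
 *			return 1;	// nothing to read right now
 *		return -1;		// connection closed or real error
 *	}
 */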
1299 
1300 /*
1301  *      Release an skb if it is no longer needed. This routine
1302  *      must be called with interrupts disabled or with the
1303  *      socket locked so that the sk_buff queue operation is safe.
1304  */
1305 
1306 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
1307 {
1308         __skb_unlink(skb, &sk->sk_receive_queue);
1309         __kfree_skb(skb);
1310 }
1311 
1312 /* Clean up the receive buffer for full frames taken by the user,
1313  * then send an ACK if necessary.  COPIED is the number of bytes
1314  * tcp_recvmsg has given to the user so far; it speeds up the
1315  * calculation of whether or not we must ACK for the sake of
1316  * a window update.
1317  */
1318 static void cleanup_rbuf(struct sock *sk, int copied)
1319 {
1320         struct tcp_opt *tp = tcp_sk(sk);
1321         int time_to_ack = 0;
1322 
1323 #if TCP_DEBUG
1324         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1325 
1326         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1327 #endif
1328 
1329         if (tcp_ack_scheduled(tp)) {
1330                 /* Delayed ACKs frequently hit locked sockets during bulk
1331                  * receive. */
1332                 if (tp->ack.blocked ||
1333                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1334                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1335                     /*
1336                      * If this read emptied the read buffer, we send an ACK when
1337                      * the connection is not bidirectional, the user has drained
1338                      * the receive buffer, and there was a small segment
1339                      * in the queue.
1340                      */
1341                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1342                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1343                         time_to_ack = 1;
1344         }
1345 
1346         /* We send an ACK if we can now advertise a non-zero window
1347          * which has been raised "significantly".
1348          *
1349          * Even if the window is raised up to infinity, do not send a window
1350          * open ACK in states where we will not receive more data; it is useless.
1351          */
1352         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1353                 __u32 rcv_window_now = tcp_receive_window(tp);
1354 
1355                 /* Optimize, __tcp_select_window() is not cheap. */
1356                 if (2*rcv_window_now <= tp->window_clamp) {
1357                         __u32 new_window = __tcp_select_window(sk);
1358 
1359                         /* Send an ACK now if this read freed lots of space
1360                          * in our buffer. We can advertise the new window now
1361                          * if it is not smaller than the current one.
1362                          * "Lots" means "at least twice" here.
1363                          */
1364                         if (new_window && new_window >= 2 * rcv_window_now)
1365                                 time_to_ack = 1;
1366                 }
1367         }
1368         if (time_to_ack)
1369                 tcp_send_ack(sk);
1370 }
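/*
 * Editor's note: a worked example of the window-update test above, with
 * round numbers chosen purely for illustration.  Suppose
 * tp->window_clamp is 64K and the reader has let the advertised window
 * shrink to rcv_window_now = 16K.  Since 2 * 16K <= 64K, we bother to
 * compute new_window = __tcp_select_window(sk); if the read freed enough
 * space that new_window >= 2 * 16K = 32K, time_to_ack is set and a
 * window update goes out immediately instead of waiting for the
 * delayed-ACK timer.
 */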
1371 
1372 /* Now socket state including sk->sk_err is changed only under lock,
1373  * hence we may omit checks after joining wait queue.
1374  * We check the receive queue before schedule() only as an optimization;
1375  * it is very likely that release_sock() added new data.
1376  */
1377 
1378 static long tcp_data_wait(struct sock *sk, long timeo)
1379 {
1380         DEFINE_WAIT(wait);
1381 
1382         prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1383 
1384         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1385         release_sock(sk);
1386 
1387         if (skb_queue_empty(&sk->sk_receive_queue))
1388                 timeo = schedule_timeout(timeo);
1389 
1390         lock_sock(sk);
1391         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1392 
1393         finish_wait(sk->sk_sleep, &wait);
1394         return timeo;
1395 }
1396 
1397 static void tcp_prequeue_process(struct sock *sk)
1398 {
1399         struct sk_buff *skb;
1400         struct tcp_opt *tp = tcp_sk(sk);
1401 
1402         NET_ADD_STATS_USER(TCPPrequeued, skb_queue_len(&tp->ucopy.prequeue));
1403 
1404         /* The RX process wants to run with BHs disabled, though it is not
1405          * strictly necessary. */
1406         local_bh_disable();
1407         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1408                 sk->sk_backlog_rcv(sk, skb);
1409         local_bh_enable();
1410 
1411         /* Clear memory counter. */
1412         tp->ucopy.memory = 0;
1413 }
1414 
1415 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1416 {
1417         struct sk_buff *skb;
1418         u32 offset;
1419 
1420         skb_queue_walk(&sk->sk_receive_queue, skb) {
1421                 offset = seq - TCP_SKB_CB(skb)->seq;
1422                 if (skb->h.th->syn)
1423                         offset--;
1424                 if (offset < skb->len || skb->h.th->fin) {
1425                         *off = offset;
1426                         return skb;
1427                 }
1428         }
1429         return NULL;
1430 }
1431 
1432 /*
1433  * This routine provides an alternative to tcp_recvmsg() for routines
1434  * that would like to handle copying from skbuffs directly in 'sendfile'
1435  * fashion.
1436  * Note:
1437  *      - It is assumed that the socket was locked by the caller.
1438  *      - The routine does not block.
1439  *      - At present, there is no support for reading OOB data
1440  *        or for 'peeking' the socket using this routine
1441  *        (although both would be easy to implement).
1442  */
1443 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1444                   sk_read_actor_t recv_actor)
1445 {
1446         struct sk_buff *skb;
1447         struct tcp_opt *tp = tcp_sk(sk);
1448         u32 seq = tp->copied_seq;
1449         u32 offset;
1450         int copied = 0;
1451 
1452         if (sk->sk_state == TCP_LISTEN)
1453                 return -ENOTCONN;
1454         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1455                 if (offset < skb->len) {
1456                         size_t used, len;
1457 
1458                         len = skb->len - offset;
1459                         /* Stop reading if we hit a patch of urgent data */
1460                         if (tp->urg_data) {
1461                                 u32 urg_offset = tp->urg_seq - seq;
1462                                 if (urg_offset < len)
1463                                         len = urg_offset;
1464                                 if (!len)
1465                                         break;
1466                         }
1467                         used = recv_actor(desc, skb, offset, len);
1468                         if (used <= len) {
1469                                 seq += used;
1470                                 copied += used;
1471                                 offset += used;
1472                         }
1473                         if (offset != skb->len)
1474                                 break;
1475                 }
1476                 if (skb->h.th->fin) {
1477                         tcp_eat_skb(sk, skb);
1478                         ++seq;
1479                         break;
1480                 }
1481                 tcp_eat_skb(sk, skb);
1482                 if (!desc->count)
1483                         break;
1484         }
1485         tp->copied_seq = seq;
1486         /* Clean up the data we have read: this will send ACK frames. */
1487         if (copied)
1488                 cleanup_rbuf(sk, copied);
1489         return copied;
1490 }
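/*
 * Editor's note: minimal sketch of a recv_actor callback of the kind
 * tcp_read_sock() above expects.  The function name and the way it uses
 * desc->count as a remaining-byte budget are assumptions for the sake of
 * the example, not taken from this file; a real caller would copy or
 * hand off the skb payload instead of only accounting for it.
 *
 *	static int example_recv_actor(read_descriptor_t *desc,
 *				      struct sk_buff *skb,
 *				      unsigned int offset, size_t len)
 *	{
 *		size_t take = min_t(size_t, len, desc->count);
 *
 *		// consume 'take' bytes of skb data starting at 'offset' here
 *		desc->count -= take;
 *		desc->written += take;
 *		return take;	// tcp_read_sock() treats this as "bytes used"
 *	}
 */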
1491 
1492 /*
1493  *      This routine copies from a sock struct into the user buffer.
1494  *
1495  *      Technical note: in 2.3 we work on a _locked_ socket, so
1496  *      tricks with *seq access order and skb->users are not required.
1497  *      The code can probably be improved even further.
1498  */
1499 
1500 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1501                 int len, int nonblock, int flags, int *addr_len)
1502 {
1503         struct tcp_opt *tp = tcp_sk(sk);
1504         int copied = 0;
1505         u32 peek_seq;
1506         u32 *seq;
1507         unsigned long used;
1508         int err;
1509         int target;             /* Read at least this many bytes */
1510         long timeo;
1511         struct task_struct *user_recv = NULL;
1512 
1513         lock_sock(sk);
1514 
1515         TCP_CHECK_TIMER(sk);
1516 
1517         err = -ENOTCONN;
1518         if (sk->sk_state == TCP_LISTEN)
1519                 goto out;
1520 
1521         timeo = sock_rcvtimeo(sk, nonblock);
1522 
1523         /* Urgent data needs to be handled specially. */
1524         if (flags & MSG_OOB)
1525                 goto recv_urg;
1526 
1527         seq = &tp->copied_seq;
1528         if (flags & MSG_PEEK) {
1529                 peek_seq = tp->copied_seq;
1530                 seq = &peek_seq;
1531         }
1532 
1533         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1534 
1535         do {
1536                 struct sk_buff *skb;
1537                 u32 offset;
1538 
1539                 /* Are we at urgent data? Stop if we have read anything. */
1540                 if (copied && tp->urg_data && tp->urg_seq == *seq)
1541                         break;
1542 
1543                 /* We need to check signals first, to get correct SIGURG
1544                  * handling. FIXME: Need to check this doesn't impact 1003.1g
1545                  * and move it down to the bottom of the loop
1546                  */
1547                 if (signal_pending(current)) {
1548                         if (copied)
1549                                 break;
1550                         copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1551                         break;
1552                 }
1553 
1554                 /* Next get a buffer. */
1555 
1556                 skb = skb_peek(&sk->sk_receive_queue);
1557                 do {
1558                         if (!skb)
1559                                 break;
1560 
1561                         /* Now that we have two receive queues this
1562                          * shouldn't happen.
1563                          */
1564                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1565                                 printk(KERN_INFO "recvmsg bug: copied %X "
1566                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1567                                 break;
1568                         }
1569                         offset = *seq - TCP_SKB_CB(skb)->seq;
1570                         if (skb->h.th->syn)
1571                                 offset--;
1572                         if (offset < skb->len)
1573                                 goto found_ok_skb;
1574                         if (skb->h.th->fin)
1575                                 goto found_fin_ok;
1576                         BUG_TRAP(flags & MSG_PEEK);
1577                         skb = skb->next;
1578                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1579 
1580                 /* Well, if we have backlog, try to process it now. */
1581 
1582                 if (copied >= target && !sk->sk_backlog.tail)
1583                         break;
1584 
1585                 if (copied) {
1586                         if (sk->sk_err ||
1587                             sk->sk_state == TCP_CLOSE ||
1588                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1589                             !timeo ||
1590                             (flags & MSG_PEEK))
1591                                 break;
1592                 } else {
1593                         if (sock_flag(sk, SOCK_DONE))
1594                                 break;
1595 
1596                         if (sk->sk_err) {
1597                                 copied = sock_error(sk);
1598                                 break;
1599                         }
1600 
1601                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1602                                 break;
1603 
1604                         if (sk->sk_state == TCP_CLOSE) {
1605                                 if (!sock_flag(sk, SOCK_DONE)) {
1606                                         /* This occurs when the user tries to
1607                                          * read from a never-connected socket.
1608                                          */
1609                                         copied = -ENOTCONN;
1610                                         break;
1611                                 }
1612                                 break;
1613                         }
1614 
1615                         if (!timeo) {
1616                                 copied = -EAGAIN;
1617                                 break;
1618                         }
1619                 }
1620 
1621                 cleanup_rbuf(sk, copied);
1622 
1623                 if (tp->ucopy.task == user_recv) {
1624                         /* Install new reader */
1625                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1626                                 user_recv = current;
1627                                 tp->ucopy.task = user_recv;
1628                                 tp->ucopy.iov = msg->msg_iov;
1629                         }
1630 
1631                         tp->ucopy.len = len;
1632 
1633                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1634                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1635 
1636                         /* Ugly... If the prequeue is not empty, we have to
1637                          * process it before releasing the socket; otherwise
1638                          * the ordering will be broken at the second iteration.
1639                          * A more elegant solution is required!
1640                          *
1641                          * Look: we have the following (pseudo)queues:
1642                          *
1643                          * 1. packets in flight
1644                          * 2. backlog
1645                          * 3. prequeue
1646                          * 4. receive_queue
1647                          *
1648                          * Each queue can be processed only if the next ones
1649                          * are empty. At this point the receive_queue is empty,
1650                          * but the prequeue _can_ be non-empty after the 2nd
1651                          * iteration, when we jumped to the start of the loop
1652                          * because backlog processing added something to the
1653                          * receive_queue. We cannot release_sock(), because the
1654                          * backlog contains packets that arrived _after_ the
1655                          * prequeued ones.
1656                          *
1657                          * In short, the algorithm is clear: process all the
1658                          * queues in order. We could do it more directly by
1659                          * requeueing packets from the backlog to the prequeue
1660                          * if it is not empty; more elegant, but it eats cycles.
1661                          */
1662                         if (skb_queue_len(&tp->ucopy.prequeue))
1663                                 goto do_prequeue;
1664 
1665                         /* __ Set realtime policy in scheduler __ */
1666                 }
1667 
1668                 if (copied >= target) {
1669                         /* Do not sleep, just process backlog. */
1670                         release_sock(sk);
1671                         lock_sock(sk);
1672                 } else {
1673                         timeo = tcp_data_wait(sk, timeo);
1674                 }
1675 
1676                 if (user_recv) {
1677                         int chunk;
1678 
1679                         /* __ Restore normal policy in scheduler __ */
1680 
1681                         if ((chunk = len - tp->ucopy.len) != 0) {
1682                                 NET_ADD_STATS_USER(TCPDirectCopyFromBacklog, chunk);
1683                                 len -= chunk;
1684                                 copied += chunk;
1685                         }
1686 
1687                         if (tp->rcv_nxt == tp->copied_seq &&
1688                             skb_queue_len(&tp->ucopy.prequeue)) {
1689 do_prequeue:
1690                                 tcp_prequeue_process(sk);
1691 
1692                                 if ((chunk = len - tp->ucopy.len) != 0) {
1693                                         NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1694                                         len -= chunk;
1695                                         copied += chunk;
1696                                 }
1697                         }
1698                 }
1699                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1700                         if (net_ratelimit())
1701                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1702                                        current->comm, current->pid);
1703                         peek_seq = tp->copied_seq;
1704                 }
1705                 continue;
1706 
1707         found_ok_skb:
1708                 /* Ok so how much can we use? */
1709                 used = skb->len - offset;
1710                 if (len < used)
1711                         used = len;
1712 
1713                 /* Do we have urgent data here? */
1714                 if (tp->urg_data) {
1715                         u32 urg_offset = tp->urg_seq - *seq;
1716                         if (urg_offset < used) {
1717                                 if (!urg_offset) {
1718                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1719                                                 ++*seq;
1720                                                 offset++;
1721                                                 used--;
1722                                                 if (!used)
1723                                                         goto skip_copy;
1724                                         }
1725                                 } else
1726                                         used = urg_offset;
1727                         }
1728                 }
1729 
1730                 if (!(flags & MSG_TRUNC)) {
1731                         err = skb_copy_datagram_iovec(skb, offset,
1732                                                       msg->msg_iov, used);
1733                         if (err) {
1734                                 /* Exception. Bailout! */
1735                                 if (!copied)
1736                                         copied = -EFAULT;
1737                                 break;
1738                         }
1739                 }
1740 
1741                 *seq += used;
1742                 copied += used;
1743                 len -= used;
1744 
1745 skip_copy:
1746                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1747                         tp->urg_data = 0;
1748                         tcp_fast_path_check(sk, tp);
1749                 }
1750                 if (used + offset < skb->len)
1751                         continue;
1752 
1753                 if (skb->h.th->fin)
1754                         goto found_fin_ok;
1755                 if (!(flags & MSG_PEEK))
1756                         tcp_eat_skb(sk, skb);
1757                 continue;
1758 
1759         found_fin_ok:
1760                 /* Process the FIN. */
1761                 ++*seq;
1762                 if (!(flags & MSG_PEEK))
1763                         tcp_eat_skb(sk, skb);
1764                 break;
1765         } while (len > 0);
1766 
1767         if (user_recv) {
1768                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1769                         int chunk;
1770 
1771                         tp->ucopy.len = copied > 0 ? len : 0;
1772 
1773                         tcp_prequeue_process(sk);
1774 
1775                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1776                                 NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1777                                 len -= chunk;
1778                                 copied += chunk;
1779                         }
1780                 }
1781 
1782                 tp->ucopy.task = NULL;
1783                 tp->ucopy.len = 0;
1784         }
1785 
1786         /* According to UNIX98, msg_name/msg_namelen are ignored
1787          * on a connected socket. I was just happy when I found this 8) --ANK
1788          */
1789 
1790         /* Clean up the data we have read: this will send ACK frames. */
1791         cleanup_rbuf(sk, copied);
1792 
1793         TCP_CHECK_TIMER(sk);
1794         release_sock(sk);
1795         return copied;
1796 
1797 out:
1798         TCP_CHECK_TIMER(sk);
1799         release_sock(sk);
1800         return err;
1801 
1802 recv_urg:
1803         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1804         goto out;
1805 }
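/*
 * Editor's note: illustrative user-space sketch, not part of tcp.c.
 * It shows the effect of the "target" computed above from
 * sock_rcvlowat(): without MSG_WAITALL, recv() may return once
 * SO_RCVLOWAT bytes (default 1) are available; with MSG_WAITALL the
 * loop above keeps going until the full request is satisfied, the peer
 * shuts down, an error occurs or a signal arrives.
 *
 *	#include <sys/socket.h>
 *
 *	static ssize_t read_exact(int fd, void *buf, size_t len)
 *	{
 *		// MSG_WAITALL makes target == len inside tcp_recvmsg(),
 *		// so a short return normally means EOF, error or signal.
 *		return recv(fd, buf, len, MSG_WAITALL);
 *	}
 */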
1806 
1807 /*
1808  *      State processing on a close. This implements the state shift for
1809  *      sending our FIN frame. Note that we only send a FIN for some
1810  *      states. A shutdown() may have already sent the FIN, or we may be
1811  *      closed.
1812  */
1813 
1814 static unsigned char new_state[16] = {
1815   /* current state:        new state:      action:      */
1816   /* (Invalid)          */ TCP_CLOSE,
1817   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1818   /* TCP_SYN_SENT       */ TCP_CLOSE,
1819   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1820   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1821   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1822   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1823   /* TCP_CLOSE          */ TCP_CLOSE,
1824   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1825   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1826   /* TCP_LISTEN         */ TCP_CLOSE,
1827   /* TCP_CLOSING        */ TCP_CLOSING,
1828 };
1829 
1830 static int tcp_close_state(struct sock *sk)
1831 {
1832         int next = (int)new_state[sk->sk_state];
1833         int ns = next & TCP_STATE_MASK;
1834 
1835         tcp_set_state(sk, ns);
1836 
1837         return next & TCP_ACTION_FIN;
1838 }
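/*
 * Editor's note: a worked example of the new_state[] lookup above.  For
 * a socket in TCP_ESTABLISHED the table entry is
 * TCP_FIN_WAIT1 | TCP_ACTION_FIN, so tcp_close_state() moves the socket
 * to FIN_WAIT1 and returns non-zero, telling the caller (tcp_shutdown()
 * or tcp_close()) to transmit a FIN.  For TCP_SYN_SENT the entry is a
 * plain TCP_CLOSE: the connection is simply dropped and no FIN is sent.
 */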
1839 
1840 /*
1841  *      Shutdown the sending side of a connection. Much like close except
1842  *      that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
1843  */
1844 
1845 void tcp_shutdown(struct sock *sk, int how)
1846 {
1847         /*      We need to grab some memory, and put together a FIN,
1848          *      and then put it into the queue to be sent.
1849          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1850          */
1851         if (!(how & SEND_SHUTDOWN))
1852                 return;
1853 
1854         /* If we've already sent a FIN, or it's a closed state, skip this. */
1855         if ((1 << sk->sk_state) &
1856             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1857              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1858                 /* Clear out any half completed packets.  FIN if needed. */
1859                 if (tcp_close_state(sk))
1860                         tcp_send_fin(sk);
1861         }
1862 }
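/*
 * Editor's note: illustrative user-space sketch, not part of tcp.c.
 * A half-close with shutdown(fd, SHUT_WR) ends up here with
 * SEND_SHUTDOWN set: our FIN is sent, but the socket can still be read
 * until the peer closes its side.
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static void finish_request(int fd, char *buf, size_t buflen)
 *	{
 *		shutdown(fd, SHUT_WR);		// send our FIN, keep receiving
 *		while (read(fd, buf, buflen) > 0)
 *			;			// drain the peer's reply
 *		close(fd);
 *	}
 */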
1863 
1864 
1865 /*
1866  *      Return 1 if we still have things to send in our buffers.
1867  */
1868 
1869 static inline int closing(struct sock *sk)
1870 {
1871         return (1 << sk->sk_state) &
1872                (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
1873 }
1874 
1875 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1876 {
1877         /* First the read buffer. */
1878         __skb_queue_purge(&sk->sk_receive_queue);
1879 
1880         /* Next, the error queue. */
1881         __skb_queue_purge(&sk->sk_error_queue);
1882 
1883         /* Next, the write queue. */
1884         BUG_TRAP(skb_queue_empty(&sk->sk_write_queue));
1885 
1886         /* Account for returned memory. */
1887         tcp_mem_reclaim(sk);
1888 
1889         BUG_TRAP(!sk->sk_wmem_queued);
1890         BUG_TRAP(!sk->sk_forward_alloc);
1891 
1892         /* It is _impossible_ for the backlog to contain anything
1893          * when we get here.  All user references to this socket
1894          * have gone away; only the net layer can touch it.
1895          */
1896 }
1897 
1898 /*
1899  * At this point, there should be no process reference to this
1900  * socket, and thus no user references at all.  Therefore we
1901  * can assume the socket waitqueue is inactive and nobody will
1902  * try to jump onto it.
1903  */
1904 void tcp_destroy_sock(struct sock *sk)
1905 {
1906         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1907         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1908 
1909         /* It cannot be in hash table! */
1910         BUG_TRAP(sk_unhashed(sk));
1911 
1912         /* If inet_sk(sk)->num is non-zero, it must be bound. */
1913         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1914 
1915 #ifdef TCP_DEBUG
1916         if (sk->sk_zapped) {
1917                 printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1918                 sock_hold(sk);
1919         }
1920         sk->sk_zapped = 1;
1921 #endif
1922 
1923         sk->sk_prot->destroy(sk);
1924 
1925         tcp_kill_sk_queues(sk);
1926 
1927         xfrm_sk_free_policy(sk);
1928 
1929 #ifdef INET_REFCNT_DEBUG
1930         if (atomic_read(&sk->sk_refcnt) != 1) {
1931                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1932                        sk, atomic_read(&sk->sk_refcnt));
1933         }
1934 #endif
1935 
1936         atomic_dec(&tcp_orphan_count);
1937         sock_put(sk);
1938 }
1939 
1940 void tcp_close(struct sock *sk, long timeout)
1941 {
1942         struct sk_buff *skb;
1943         int data_was_unread = 0;
1944 
1945         lock_sock(sk);
1946         sk->sk_shutdown = SHUTDOWN_MASK;
1947 
1948         if (sk->sk_state == TCP_LISTEN) {
1949                 tcp_set_state(sk, TCP_CLOSE);
1950 
1951                 /* Special case. */
1952                 tcp_listen_stop(sk);
1953 
1954                 goto adjudge_to_death;
1955         }
1956 
1957         /*  We need to flush the recv. buffs.  We do this only on the
1958          *  descriptor close, not protocol-sourced closes, because the
1959          *  reader process may not have drained the data yet!
1960          */
1961         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1962                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1963                           skb->h.th->fin;
1964                 data_was_unread += len;
1965                 __kfree_skb(skb);
1966         }
1967 
1968         tcp_mem_reclaim(sk);
1969 
1970         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1971          * 3.10, we send a RST here because data was lost.  To
1972          * witness the awful effects of the old behavior of always
1973          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1974          * a bulk GET in an FTP client, suspend the process, wait
1975          * for the client to advertise a zero window, then kill -9
1976          * the FTP client, wheee...  Note: timeout is always zero
1977          * in such a case.
1978          */
1979         if (data_was_unread) {
1980                 /* Unread data was tossed, zap the connection. */
1981                 NET_INC_STATS_USER(TCPAbortOnClose);
1982                 tcp_set_state(sk, TCP_CLOSE);
1983                 tcp_send_active_reset(sk, GFP_KERNEL);
1984         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1985                 /* Check zero linger _after_ checking for unread data. */
1986                 sk->sk_prot->disconnect(sk, 0);
1987                 NET_INC_STATS_USER(TCPAbortOnData);
1988         } else if (tcp_close_state(sk)) {
1989                 /* We FIN if the application ate all the data before
1990                  * zapping the connection.
1991                  */
1992 
1993                 /* RED-PEN. Formally speaking, we have broken TCP state
1994                  * machine. State transitions:
1995                  *
1996                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1997                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1998                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1999                  *
2000                  * are legal only when FIN has been sent (i.e. in window),
2001                  * rather than queued out of window. Purists may complain.
2002                  *
2003                  * E.g. the "RFC state" is ESTABLISHED
2004                  * if the Linux state is FIN-WAIT-1 but the FIN has not been sent yet.
2005                  *
2006                  * The visible deviations are that we sometimes
2007                  * enter the time-wait state when it is not really required
2008                  * (harmless), and do not send active resets when the specs
2009                  * require them (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when
2010                  * they look like CLOSING or LAST_ACK to Linux).
2011                  * I have probably missed some more small holes.
2012                  *                                              --ANK
2013                  */
2014                 tcp_send_fin(sk);
2015         }
2016 
2017         if (timeout) {
2018                 struct task_struct *tsk = current;
2019                 DEFINE_WAIT(wait);
2020 
2021                 do {
2022                         prepare_to_wait(sk->sk_sleep, &wait,
2023                                         TASK_INTERRUPTIBLE);
2024                         if (!closing(sk))
2025                                 break;
2026                         release_sock(sk);
2027                         timeout = schedule_timeout(timeout);
2028                         lock_sock(sk);
2029                 } while (!signal_pending(tsk) && timeout);
2030 
2031                 finish_wait(sk->sk_sleep, &wait);
2032         }
2033 
2034 adjudge_to_death:
2035         /* This is the last release_sock() in its lifetime; it will remove the backlog. */
2036         release_sock(sk);
2037 
2038 
2039         /* Now the socket is owned by the kernel and we acquire the BH lock
2040          * to finish the close. No need to check for user refs.
2041          */
2042         local_bh_disable();
2043         bh_lock_sock(sk);
2044         BUG_TRAP(!sock_owned_by_user(sk));
2045 
2046         sock_hold(sk);
2047         sock_orphan(sk);
2048 
2049         /*      This is a (useful) BSD violation of the RFC. There is a
2050          *      problem with TCP as specified, in that the other end could
2051          *      keep a socket open forever with no application left at this end.
2052          *      We use a 3 minute timeout (about the same as BSD) then kill
2053          *      our end. If they send after that then tough - BUT: it is long
2054          *      enough that we won't repeat the old "4*rto = almost no time -
2055          *      whoops, reset" mistake.
2056          *
2057          *      Nope, it was not a mistake. It is really the desired behaviour,
2058          *      e.g. on HTTP servers, where such sockets are useless but
2059          *      consume significant resources. Let's do it with the special
2060          *      linger2 option.                                 --ANK
2061          */
2062 
2063         if (sk->sk_state == TCP_FIN_WAIT2) {
2064                 struct tcp_opt *tp = tcp_sk(sk);
2065                 if (tp->linger2 < 0) {
2066                         tcp_set_state(sk, TCP_CLOSE);
2067                         tcp_send_active_reset(sk, GFP_ATOMIC);
2068                         NET_INC_STATS_BH(TCPAbortOnLinger);
2069                 } else {
2070                         int tmo = tcp_fin_time(tp);
2071 
2072                         if (tmo > TCP_TIMEWAIT_LEN) {
2073                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2074                         } else {
2075                                 atomic_inc(&tcp_orphan_count);
2076                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2077                                 goto out;
2078                         }
2079                 }
2080         }
2081         if (sk->sk_state != TCP_CLOSE) {
2082                 tcp_mem_reclaim(sk);
2083                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2084                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
2085                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2086                         if (net_ratelimit())
2087                                 printk(KERN_INFO "TCP: too many orphaned "
2088                                        "sockets\n");
2089                         tcp_set_state(sk, TCP_CLOSE);
2090                         tcp_send_active_reset(sk, GFP_ATOMIC);
2091                         NET_INC_STATS_BH(TCPAbortOnMemory);
2092                 }
2093         }
2094         atomic_inc(&tcp_orphan_count);
2095 
2096         if (sk->sk_state == TCP_CLOSE)
2097                 tcp_destroy_sock(sk);
2098         /* Otherwise, socket is reprieved until protocol close. */
2099 
2100 out:
2101         bh_unlock_sock(sk);
2102         local_bh_enable();
2103         sock_put(sk);
2104 }
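/*
 * Editor's note: illustrative user-space sketch, not part of tcp.c.
 * It exercises the zero-linger branch checked in tcp_close() above:
 * with SO_LINGER enabled and l_linger == 0, close() aborts the
 * connection (disconnect + RST) instead of the normal FIN exchange.
 * Error checking is omitted for brevity.
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static void abortive_close(int fd)
 *	{
 *		struct linger lg = { .l_onoff = 1, .l_linger = 0 };
 *
 *		setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *		close(fd);	// takes the sk->sk_prot->disconnect() path
 *	}
 */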
2105 
2106 /* These states need RST on ABORT according to RFC793 */
2107 
2108 static inline int tcp_need_reset(int state)
2109 {
2110         return (1 << state) &
2111                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2112                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2113 }
2114 
2115 int tcp_disconnect(struct sock *sk, int flags)
2116 {
2117         struct inet_opt *inet = inet_sk(sk);
2118         struct tcp_opt *tp = tcp_sk(sk);
2119         int err = 0;
2120         int old_state = sk->sk_state;
2121 
2122         if (old_state != TCP_CLOSE)
2123                 tcp_set_state(sk, TCP_CLOSE);
2124 
2125         /* ABORT function of RFC793 */
2126         if (old_state == TCP_LISTEN) {
2127                 tcp_listen_stop(sk);
2128         } else if (tcp_need_reset(old_state) ||
2129                    (tp->snd_nxt != tp->write_seq &&
2130                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2131                 /* The last check adjusts for the discrepancy between Linux and
2132                  * the RFC states.
2133                  */
2134                 tcp_send_active_reset(sk, gfp_any());
2135                 sk->sk_err = ECONNRESET;
2136         } else if (old_state == TCP_SYN_SENT)
2137                 sk->sk_err = ECONNRESET;
2138 
2139         tcp_clear_xmit_timers(sk);
2140         __skb_queue_purge(&sk->sk_receive_queue);
2141         tcp_writequeue_purge(sk);
2142         __skb_queue_purge(&tp->out_of_order_queue);
2143 
2144         inet->dport = 0;
2145 
2146         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2147                 inet_reset_saddr(sk);
2148 
2149         sk->sk_shutdown = 0;
2150         sock_reset_flag(sk, SOCK_DONE);
2151         tp->srtt = 0;
2152         if ((tp->write_seq += tp->max_window + 2) == 0)
2153                 tp->write_seq = 1;
2154         tp->backoff = 0;
2155         tp->snd_cwnd = 2;
2156         tp->probes_out = 0;
2157         tp->packets_out = 0;
2158         tp->snd_ssthresh = 0x7fffffff;
2159         tp->snd_cwnd_cnt = 0;
2160         tp->ca_state = TCP_CA_Open;
2161         tcp_clear_retrans(tp);
2162         tcp_delack_init(tp);
2163         tp->send_head = NULL;
2164         tp->saw_tstamp = 0;
2165         tcp_sack_reset(tp);
2166         __sk_dst_reset(sk);
2167 
2168         BUG_TRAP(!inet->num || tp->bind_hash);
2169 
2170         sk->sk_error_report(sk);
2171         return err;
2172 }
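/*
 * Editor's note: illustrative user-space sketch, not part of tcp.c.
 * tcp_disconnect() is also what runs when an application "dissolves" a
 * connection by calling connect() with an address family of AF_UNSPEC;
 * the mapping from that call to sk->sk_prot->disconnect() happens in
 * inet_stream_connect(), outside this file.
 *
 *	#include <sys/socket.h>
 *	#include <string.h>
 *
 *	static int tcp_unconnect(int fd)
 *	{
 *		struct sockaddr sa;
 *
 *		memset(&sa, 0, sizeof(sa));
 *		sa.sa_family = AF_UNSPEC;
 *		return connect(fd, &sa, sizeof(sa));
 *	}
 */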
2173 
2174 /*
2175  *      Wait for an incoming connection, avoid race
2176  *      conditions. This must be called with the socket locked.
2177  */
2178 static int wait_for_connect(struct sock *sk, long timeo)
2179 {
2180         struct tcp_opt *tp = tcp_sk(sk);
2181         DEFINE_WAIT(wait);
2182         int err;
2183 
2184         /*
2185          * True wake-one mechanism for incoming connections: only
2186          * one process gets woken up, not the 'whole herd'.
2187          * Since we do not 'race & poll' for established sockets
2188          * anymore, the common case will execute the loop only once.
2189          *
2190          * Subtle issue: "add_wait_queue_exclusive()" will be added
2191          * after any current non-exclusive waiters, and we know that
2192          * it will always _stay_ after any new non-exclusive waiters
2193          * because all non-exclusive waiters are added at the
2194          * beginning of the wait-queue. As such, it's ok to "drop"
2195          * our exclusiveness temporarily when we get woken up without
2196          * having to remove and re-insert us on the wait queue.
2197          */
2198         for (;;) {
2199                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
2200                                           TASK_INTERRUPTIBLE);
2201                 release_sock(sk);
2202                 if (!tp->accept_queue)
2203                         timeo = schedule_timeout(timeo);
2204                 lock_sock(sk);
2205                 err = 0;
2206                 if (tp->accept_queue)
2207                         break;
2208                 err = -EINVAL;
2209                 if (sk->sk_state != TCP_LISTEN)
2210                         break;
2211                 err = sock_intr_errno(timeo);
2212                 if (signal_pending(current))
2213                         break;
2214                 err = -EAGAIN;
2215                 if (!timeo)
2216                         break;
2217         }
2218         finish_wait(sk->sk_sleep, &wait);
2219         return err;
2220 }
2221 
2222 /*
2223  *      This will accept the next outstanding connection.
2224  */
2225 
2226 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2227 {
2228         struct tcp_opt *tp = tcp_sk(sk);
2229         struct open_request *req;
2230         struct sock *newsk;
2231         int error;
2232 
2233         lock_sock(sk);
2234 
2235         /* We need to make sure that this socket is listening,
2236          * and that it has something pending.
2237          */
2238         error = -EINVAL;
2239         if (sk->sk_state != TCP_LISTEN)
2240                 goto out;
2241 
2242         /* Find already established connection */
2243         if (!tp->accept_queue) {
2244                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2245 
2246                 /* If this is a non-blocking socket, don't sleep. */
2247                 error = -EAGAIN;
2248                 if (!timeo)
2249                         goto out;
2250 
2251                 error = wait_for_connect(sk, timeo);
2252                 if (error)
2253                         goto out;
2254         }
2255 
2256         req = tp->accept_queue;
2257         if ((tp->accept_queue = req->dl_next) == NULL)
2258                 tp->accept_queue_tail = NULL;
2259 
2260         newsk = req->sk;
2261         tcp_acceptq_removed(sk);
2262         tcp_openreq_fastfree(req);
2263         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
2264         release_sock(sk);
2265         return newsk;
2266 
2267 out:
2268         release_sock(sk);
2269         *err = error;
2270         return NULL;
2271 }
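/*
 * Editor's note: illustrative user-space sketch, not part of tcp.c.
 * With O_NONBLOCK set on the listening socket, sock_rcvtimeo() above
 * yields timeo == 0, so an empty accept queue makes tcp_accept() fail
 * immediately with EAGAIN instead of sleeping in wait_for_connect().
 *
 *	#include <sys/socket.h>
 *	#include <errno.h>
 *
 *	static int try_accept(int listen_fd)
 *	{
 *		int fd = accept(listen_fd, NULL, NULL);
 *
 *		if (fd < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
 *			return -1;	// nothing pending, poll again later
 *		return fd;		// new connection, or -1 on real error
 *	}
 */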
2272 
2273 /*
2274  *      Socket option code for TCP.
2275  */
2276 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
2277                    int optlen)
2278 {
2279         struct tcp_opt *tp = tcp_sk(sk);
2280         int val;
2281         int err = 0;
2282 
2283         if (level != SOL_TCP)
2284                 return tp->af_specific->setsockopt(sk, level, optname,
2285                                                    optval, optlen);
2286 
2287         if (optlen < sizeof(int))
2288                 return -EINVAL;
2289 
2290         if (get_user(val, (int *)optval))
2291                 return -EFAULT;
2292 
2293         lock_sock(sk);
2294 
2295         switch (optname) {
2296         case TCP_MAXSEG:
2297                 /* Values greater than the interface MTU won't take effect. However,
2298                  * at the point when this call is made we typically don't yet
2299                  * know which interface is going to be used. */
2300                 if (val < 8 || val > MAX_TCP_WINDOW) {
2301                         err = -EINVAL;
2302                         break;
2303                 }
2304                 tp->user_mss = val;
2305                 break;
2306 
2307         case TCP_NODELAY:
2308                 if (val) {
2309                         /* TCP_NODELAY is weaker than TCP_CORK, so
2310                          * this option on a corked socket is remembered, but
2311                          * it is not activated until the cork is cleared.
2312                          *
2313                          * However, when TCP_NODELAY is set we make
2314                          * an explicit push, which overrides even TCP_CORK
2315                          * for currently queued segments.
2316                          */
2317                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2318                         tcp_push_pending_frames(sk, tp);
2319                 } else {
2320                         tp->nonagle &= ~TCP_NAGLE_OFF;
2321                 }
2322                 break;
2323 
2324         case TCP_CORK:
2325                 /* When set, this indicates that we should always queue non-full frames.
2326                  * Later the user clears this option and we transmit
2327                  * any pending partial frames in the queue.  This is
2328                  * meant to be used alongside sendfile() to get properly
2329                  * filled frames when the user (for example) must write
2330                  * out headers with a write() call first and then use
2331                  * sendfile() to send out the data parts (see the sketch after this function).
2332                  *
2333                  * TCP_CORK can be set together with TCP_NODELAY and it is
2334                  * stronger than TCP_NODELAY.
2335                  */
2336                 if (val) {
2337                         tp->nonagle |= TCP_NAGLE_CORK;
2338                 } else {
2339                         tp->nonagle &= ~TCP_NAGLE_CORK;
2340                         if (tp->nonagle&TCP_NAGLE_OFF)
2341                                 tp->nonagle |= TCP_NAGLE_PUSH;
2342                         tcp_push_pending_frames(sk, tp);
2343                 }
2344                 break;
2345 
2346         case TCP_KEEPIDLE:
2347                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2348                         err = -EINVAL;
2349                 else {
2350                         tp->keepalive_time = val * HZ;
2351                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2352                             !((1 << sk->sk_state) &
2353                               (TCPF_CLOSE | TCPF_LISTEN))) {
2354                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2355                                 if (tp->keepalive_time > elapsed)
2356                                         elapsed = tp->keepalive_time - elapsed;
2357                                 else
2358                                         elapsed = 0;
2359                                 tcp_reset_keepalive_timer(sk, elapsed);
2360                         }
2361                 }
2362                 break;
2363         case TCP_KEEPINTVL:
2364                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2365                         err = -EINVAL;
2366                 else
2367                         tp->keepalive_intvl = val * HZ;
2368                 break;
2369         case TCP_KEEPCNT:
2370                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2371                         err = -EINVAL;
2372                 else
2373                         tp->keepalive_probes = val;
2374                 break;
2375         case TCP_SYNCNT:
2376                 if (val < 1 || val > MAX_TCP_SYNCNT)
2377                         err = -EINVAL;
2378                 else
2379                         tp->syn_retries = val;
2380                 break;
2381 
2382         case TCP_LINGER2:
2383                 if (val < 0)
2384                         tp->linger2 = -1;
2385                 else if (val > sysctl_tcp_fin_timeout / HZ)
2386                         tp->linger2 = 0;
2387                 else
2388                         tp->linger2 = val * HZ;
2389                 break;
2390 
2391         case TCP_DEFER_ACCEPT:
2392                 tp->defer_accept = 0;
2393                 if (val > 0) {
2394                         /* Translate value in seconds to number of
2395                          * retransmits */
2396                         while (tp->defer_accept < 32 &&
2397                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2398                                        tp->defer_accept))
2399                                 tp->defer_accept++;
2400                         tp->defer_accept++;
2401                 }
2402                 break;
2403 
2404         case TCP_WINDOW_CLAMP:
2405                 if (!val) {
2406                         if (sk->sk_state != TCP_CLOSE) {
2407                                 err = -EINVAL;
2408                                 break;
2409                         }
2410                         tp->window_clamp = 0;
2411                 } else
2412                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2413                                                 SOCK_MIN_RCVBUF / 2 : val;
2414                 break;
2415 
2416         case TCP_QUICKACK:
2417                 if (!val) {
2418                         tp->ack.pingpong = 1;
2419                 } else {
2420                         tp->ack.pingpong = 0;
2421                         if ((1 << sk->sk_state) &
2422                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2423                             tcp_ack_scheduled(tp)) {
2424                                 tp->ack.pending |= TCP_ACK_PUSHED;
2425                                 cleanup_rbuf(sk, 1);
2426                                 if (!(val & 1))
2427                                         tp->ack.pingpong = 1;
2428                         }
2429                 }
2430                 break;
2431 
2432         default:
2433                 err = -ENOPROTOOPT;
2434                 break;
2435         };
2436         release_sock(sk);
2437         return err;
2438 }
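/*
 * Editor's note: illustrative user-space sketch, not part of tcp.c, of
 * the header-plus-sendfile() pattern described in the TCP_CORK case
 * above: cork, write the headers, stream the file body, then uncork so
 * any remaining partial frame is pushed out.  Error checking is omitted
 * for brevity.
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/sendfile.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static void send_response(int sock, const char *hdr, size_t hdrlen,
 *				  int file_fd, size_t filelen)
 *	{
 *		int on = 1, off = 0;
 *
 *		setsockopt(sock, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
 *		write(sock, hdr, hdrlen);
 *		sendfile(sock, file_fd, NULL, filelen);
 *		setsockopt(sock, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
 *	}
 */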
2439 
2440 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
2441                    int *optlen)
2442 {
2443         struct tcp_opt *tp = tcp_sk(sk);
2444         int val, len;
2445 
2446         if (level != SOL_TCP)
2447                 return tp->af_specific->getsockopt(sk, level, optname,
2448                                                    optval, optlen);
2449 
2450         if (get_user(len, optlen))
2451                 return -EFAULT;
2452 
2453         len = min_t(unsigned int, len, sizeof(int));
2454 
2455         if (len < 0)
2456                 return -EINVAL;
2457 
2458         switch (optname) {
2459         case TCP_MAXSEG:
2460                 val = tp->mss_cache_std;
2461                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2462                         val = tp->user_mss;
2463                 break;
2464         case TCP_NODELAY:
2465                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2466                 break;
2467         case TCP_CORK:
2468                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2469                 break;
2470         case TCP_KEEPIDLE:
2471                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2472                 break;
2473         case TCP_KEEPINTVL:
2474                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2475                 break;
2476         case TCP_KEEPCNT:
2477                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2478                 break;
2479         case TCP_SYNCNT:
2480                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2481                 break;
2482         case TCP_LINGER2:
2483                 val = tp->linger2;
2484                 if (val >= 0)
2485                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2486                 break;
2487         case TCP_DEFER_ACCEPT:
2488                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2489                                                (tp->defer_accept - 1));
2490                 break;
2491         case TCP_WINDOW_CLAMP:
2492                 val = tp->window_clamp;
2493                 break;
2494         case TCP_INFO: {
2495                 struct tcp_info info;
2496                 u32 now = tcp_time_stamp;
2497 
2498                 if (get_user(len, optlen))
2499                         return -EFAULT;
2500                 info.tcpi_state = sk->sk_state;
2501                 info.tcpi_ca_state = tp->ca_state;
2502                 info.tcpi_retransmits = tp->retransmits;
2503                 info.tcpi_probes = tp->probes_out;
2504                 info.tcpi_backoff = tp->backoff;
2505                 info.tcpi_options = 0;
2506                 if (tp->tstamp_ok)
2507                         info.tcpi_options |= TCPI_OPT_TIMESTAMPS;
2508                 if (tp->sack_ok)
2509                         info.tcpi_options |= TCPI_OPT_SACK;
2510                 if (tp->wscale_ok) {
2511                         info.tcpi_options |= TCPI_OPT_WSCALE;
2512                         info.tcpi_snd_wscale = tp->snd_wscale;
2513                         info.tcpi_rcv_wscale = tp->rcv_wscale;
2514                 } else {
2515                         info.tcpi_snd_wscale = 0;
2516                         info.tcpi_rcv_wscale = 0;
2517                 }
2518                 if (tp->ecn_flags & TCP_ECN_OK)
2519                         info.tcpi_options |= TCPI_OPT_ECN;
2520 
2521                 info.tcpi_rto = (1000000 * tp->rto) / HZ;
2522                 info.tcpi_ato = (1000000 * tp->ack.ato) / HZ;
2523                 info.tcpi_snd_mss = tp->mss_cache_std;
2524                 info.tcpi_rcv_mss = tp->ack.rcv_mss;
2525 
2526                 info.tcpi_unacked = tp->packets_out;
2527                 info.tcpi_sacked = tp->sacked_out;
2528                 info.tcpi_lost = tp->lost_out;
2529                 info.tcpi_retrans = tp->retrans_out;
2530                 info.tcpi_fackets = tp->fackets_out;
2531 
2532                 info.tcpi_last_data_sent = ((now - tp->lsndtime) * 1000) / HZ;
2533                 info.tcpi_last_ack_sent = 0;
2534                 info.tcpi_last_data_recv = ((now -
2535                                              tp->ack.lrcvtime) * 1000) / HZ;
2536                 info.tcpi_last_ack_recv = ((now - tp->rcv_tstamp) * 1000) / HZ;
2537 
2538                 info.tcpi_pmtu = tp->pmtu_cookie;
2539                 info.tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2540                 info.tcpi_rtt = ((1000000 * tp->srtt) / HZ) >> 3;
2541                 info.tcpi_rttvar = ((1000000 * tp->mdev) / HZ) >> 2;
2542                 info.tcpi_snd_ssthresh = tp->snd_ssthresh;
2543                 info.tcpi_snd_cwnd = tp->snd_cwnd;
2544                 info.tcpi_advmss = tp->advmss;
2545                 info.tcpi_reordering = tp->reordering;
2546 
2547                 len = min_t(unsigned int, len, sizeof(info));
2548                 if (put_user(len, optlen))
2549                         return -EFAULT;
2550                 if (copy_to_user(optval, &info, len))
2551                         return -EFAULT;
2552                 return 0;
2553         }
2554         case TCP_QUICKACK:
2555                 val = !tp->ack.pingpong;
2556                 break;
2557         default:
2558                 return -ENOPROTOOPT;
2559         };
2560 
2561         if (put_user(len, optlen))
2562                 return -EFAULT;
2563         if (copy_to_user(optval, &val, len))
2564                 return -EFAULT;
2565         return 0;
2566 }
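/*
 * Editor's note: illustrative user-space sketch, not part of tcp.c,
 * showing how the TCP_INFO branch above is typically consumed (assuming
 * a libc whose <netinet/tcp.h> exposes struct tcp_info).  The kernel
 * copies out at most sizeof(info) bytes, so differing struct sizes
 * degrade gracefully.
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *
 *	static int query_rtt_us(int fd)
 *	{
 *		struct tcp_info info;
 *		socklen_t len = sizeof(info);
 *
 *		if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0)
 *			return -1;
 *		return info.tcpi_rtt;	// smoothed RTT in microseconds
 *	}
 */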
2567 
2568 
2569 extern void __skb_cb_too_small_for_tcp(int, int);
2570 extern void tcpdiag_init(void);
2571 
2572 void __init tcp_init(void)
2573 {
2574         struct sk_buff *skb = NULL;
2575         unsigned long goal;
2576         int order, i;
2577 
2578         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2579                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2580                                            sizeof(skb->cb));
2581 
2582         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2583                                                    sizeof(struct open_request),
2584                                                0, SLAB_HWCACHE_ALIGN,
2585                                                NULL, NULL);
2586         if (!tcp_openreq_cachep)
2587                 panic("tcp_init: Cannot alloc open_request cache.");
2588 
2589         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2590                                               sizeof(struct tcp_bind_bucket),
2591                                               0, SLAB_HWCACHE_ALIGN,
2592                                               NULL, NULL);
2593         if (!tcp_bucket_cachep)
2594                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2595 
2596         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2597                                                 sizeof(struct tcp_tw_bucket),
2598                                                 0, SLAB_HWCACHE_ALIGN,
2599                                                 NULL, NULL);
2600         if (!tcp_timewait_cachep)
2601                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2602 
2603         /* Size and allocate the main established and bind bucket
2604          * hash tables.
2605          *
2606          * The methodology is similar to that of the buffer cache.
2607          */
2608         if (num_physpages >= (128 * 1024))
2609                 goal = num_physpages >> (21 - PAGE_SHIFT);
2610         else
2611                 goal = num_physpages >> (23 - PAGE_SHIFT);
2612 
2613         for (order = 0; (1UL << order) < goal; order++)
2614                 ;
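        /*
         * Illustrative arithmetic (not from this file): with 4 KB pages
         * (PAGE_SHIFT == 12) and 512 MB of RAM, num_physpages is 131072,
         * so the first branch applies and goal = 131072 >> 9 = 256 pages,
         * i.e. roughly one page of hash table per 2 MB of memory.  The
         * loop above then picks the smallest order with
         * (1UL << order) >= goal, here order = 8.
         */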
2615         do {
2616                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2617                         sizeof(struct tcp_ehash_bucket);
2618                 tcp_ehash_size >>= 1;
2619                 while (tcp_ehash_size & (tcp_ehash_size - 1))
2620                         tcp_ehash_size--;
2621                 tcp_ehash = (struct tcp_ehash_bucket *)
2622                         __get_free_pages(GFP_ATOMIC, order);
2623         } while (!tcp_ehash && --order > 0);
2624 
2625         if (!tcp_ehash)
2626                 panic("Failed to allocate TCP established hash table\n");
2627         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2628                 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2629                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2630         }
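        /*
         * Continuing the illustration: at order 8 the allocation is 256
         * pages (1 MB).  Assuming a 16-byte struct tcp_ehash_bucket (the
         * real size is arch- and config-dependent), that yields 65536
         * slots, halved to 32768, which is already a power of two, so
         * tcp_ehash_size becomes 32768.  The table is used in two halves,
         * the upper half holding time-wait sockets, which is why the loop
         * above and the printk below use tcp_ehash_size << 1.
         */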
2631 
2632         do {
2633                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2634                         sizeof(struct tcp_bind_hashbucket);
2635                 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2636                         continue;
2637                 tcp_bhash = (struct tcp_bind_hashbucket *)
2638                         __get_free_pages(GFP_ATOMIC, order);
2639         } while (!tcp_bhash && --order >= 0);
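        /*
         * Note: `continue` in a do/while jumps to the controlling
         * expression, so `--order >= 0` still runs; the effect is to
         * retry with a smaller order until the bind hash needs at most
         * 64K buckets, or order reaches zero and the allocation is
         * attempted anyway.
         */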
2640 
2641         if (!tcp_bhash)
2642                 panic("Failed to allocate TCP bind hash table\n");
2643         for (i = 0; i < tcp_bhash_size; i++) {
2644                 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2645                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2646         }
2647 
2648         /* Try to be a bit smarter and adjust defaults depending
2649          * on available memory.
2650          */
2651         if (order > 4) {
2652                 sysctl_local_port_range[0] = 32768;
2653                 sysctl_local_port_range[1] = 61000;
2654                 sysctl_tcp_max_tw_buckets = 180000;
2655                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2656                 sysctl_max_syn_backlog = 1024;
2657         } else if (order < 3) {
2658                 sysctl_local_port_range[0] = 1024 * (3 - order);
2659                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2660                 sysctl_tcp_max_orphans >>= (3 - order);
2661                 sysctl_max_syn_backlog = 128;
2662         }
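        /*
         * Example values (illustrative): with order 8 from the sizing
         * above, the first branch applies, giving a 32768-61000 local
         * port range and tcp_max_orphans = 4096 << 4 = 65536; on a small
         * box with order 1, the port range floor becomes
         * 1024 * (3 - 1) = 2048 and the time-wait and orphan limits are
         * divided by four.
         */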
2663         tcp_port_rover = sysctl_local_port_range[0] - 1;
2664 
2665         sysctl_tcp_mem[0] =  768 << order;
2666         sysctl_tcp_mem[1] = 1024 << order;
2667         sysctl_tcp_mem[2] = 1536 << order;
2668         if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 512)
2669                 sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 512;
2670         if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 512)
2671                 sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 512;
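        /*
         * Worked example (illustrative): at order 8 the initial values
         * are 768 << 8 = 196608, 1024 << 8 = 262144 and
         * 1536 << 8 = 393216 pages.  Both gaps exceed 512 pages, so the
         * clamps above pull the lower thresholds up to
         * { 392192, 392704, 393216 }, i.e. each step is limited to 512
         * pages (2 MB with 4 KB pages).
         */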
2672 
2673         if (order < 3) {
2674                 sysctl_tcp_wmem[2] = 64 * 1024;
2675                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2676                 sysctl_tcp_rmem[1] = 43689;
2677                 sysctl_tcp_rmem[2] = 2 * 43689;
2678         }
2679 
2680         printk(KERN_INFO "TCP: Hash tables configured "
2681                "(established %d bind %d)\n",
2682                tcp_ehash_size << 1, tcp_bhash_size);
2683 
2684         tcpdiag_init();
2685 }
2686 
2687 EXPORT_SYMBOL(__tcp_mem_reclaim);
2688 EXPORT_SYMBOL(sysctl_tcp_rmem);
2689 EXPORT_SYMBOL(sysctl_tcp_wmem);
2690 EXPORT_SYMBOL(tcp_accept);
2691 EXPORT_SYMBOL(tcp_close);
2692 EXPORT_SYMBOL(tcp_close_state);
2693 EXPORT_SYMBOL(tcp_destroy_sock);
2694 EXPORT_SYMBOL(tcp_disconnect);
2695 EXPORT_SYMBOL(tcp_getsockopt);
2696 EXPORT_SYMBOL(tcp_ioctl);
2697 EXPORT_SYMBOL(tcp_openreq_cachep);
2698 EXPORT_SYMBOL(tcp_poll);
2699 EXPORT_SYMBOL(tcp_read_sock);
2700 EXPORT_SYMBOL(tcp_recvmsg);
2701 EXPORT_SYMBOL(tcp_sendmsg);
2702 EXPORT_SYMBOL(tcp_sendpage);
2703 EXPORT_SYMBOL(tcp_setsockopt);
2704 EXPORT_SYMBOL(tcp_shutdown);
2705 EXPORT_SYMBOL(tcp_sockets_allocated);
2706 EXPORT_SYMBOL(tcp_statistics);
2707 EXPORT_SYMBOL(tcp_timewait_cachep);
2708 EXPORT_SYMBOL(tcp_write_space);
2709 
