OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

[/] [or1k/] [trunk/] [linux/] [linux-2.4/] [net/] [ipv4/] [tcp.c] - Blame information for rev 1765

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 1275 phoenix
/*
2
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
3
 *              operating system.  INET is implemented using the  BSD Socket
4
 *              interface as the means of communication with the user level.
5
 *
6
 *              Implementation of the Transmission Control Protocol(TCP).
7
 *
8
 * Version:     $Id: tcp.c,v 1.1.1.1 2004-04-15 01:13:34 phoenix Exp $
9
 *
10
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
11
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12
 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13
 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14
 *              Florian La Roche, <flla@stud.uni-sb.de>
15
 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16
 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18
 *              Matthew Dillon, <dillon@apollo.west.oic.com>
19
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20
 *              Jorge Cwik, <jorge@laser.satlink.net>
21
 *
22
 * Fixes:
23
 *              Alan Cox        :       Numerous verify_area() calls
24
 *              Alan Cox        :       Set the ACK bit on a reset
25
 *              Alan Cox        :       Stopped it crashing if it closed while
26
 *                                      sk->inuse=1 and was trying to connect
27
 *                                      (tcp_err()).
28
 *              Alan Cox        :       All icmp error handling was broken
29
 *                                      pointers passed where wrong and the
30
 *                                      socket was looked up backwards. Nobody
31
 *                                      tested any icmp error code obviously.
32
 *              Alan Cox        :       tcp_err() now handled properly. It
33
 *                                      wakes people on errors. poll
34
 *                                      behaves and the icmp error race
35
 *                                      has gone by moving it into sock.c
36
 *              Alan Cox        :       tcp_send_reset() fixed to work for
37
 *                                      everything not just packets for
38
 *                                      unknown sockets.
39
 *              Alan Cox        :       tcp option processing.
40
 *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41
 *                                      syn rule wrong]
42
 *              Herp Rosmanith  :       More reset fixes
43
 *              Alan Cox        :       No longer acks invalid rst frames.
44
 *                                      Acking any kind of RST is right out.
45
 *              Alan Cox        :       Sets an ignore me flag on an rst
46
 *                                      receive otherwise odd bits of prattle
47
 *                                      escape still
48
 *              Alan Cox        :       Fixed another acking RST frame bug.
49
 *                                      Should stop LAN workplace lockups.
50
 *              Alan Cox        :       Some tidyups using the new skb list
51
 *                                      facilities
52
 *              Alan Cox        :       sk->keepopen now seems to work
53
 *              Alan Cox        :       Pulls options out correctly on accepts
54
 *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55
 *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56
 *                                      bit to skb ops.
57
 *              Alan Cox        :       Tidied tcp_data to avoid a potential
58
 *                                      nasty.
59
 *              Alan Cox        :       Added some better commenting, as the
60
 *                                      tcp is hard to follow
61
 *              Alan Cox        :       Removed incorrect check for 20 * psh
62
 *      Michael O'Reilly        :       ack < copied bug fix.
63
 *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64
 *              Alan Cox        :       FIN with no memory -> CRASH
65
 *              Alan Cox        :       Added socket option proto entries.
66
 *                                      Also added awareness of them to accept.
67
 *              Alan Cox        :       Added TCP options (SOL_TCP)
68
 *              Alan Cox        :       Switched wakeup calls to callbacks,
69
 *                                      so the kernel can layer network
70
 *                                      sockets.
71
 *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72
 *              Alan Cox        :       Handle FIN (more) properly (we hope).
73
 *              Alan Cox        :       RST frames sent on unsynchronised
74
 *                                      state ack error.
75
 *              Alan Cox        :       Put in missing check for SYN bit.
76
 *              Alan Cox        :       Added tcp_select_window() aka NET2E
77
 *                                      window non shrink trick.
78
 *              Alan Cox        :       Added a couple of small NET2E timer
79
 *                                      fixes
80
 *              Charles Hedrick :       TCP fixes
81
 *              Toomas Tamm     :       TCP window fixes
82
 *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83
 *              Charles Hedrick :       Rewrote most of it to actually work
84
 *              Linus           :       Rewrote tcp_read() and URG handling
85
 *                                      completely
86
 *              Gerhard Koerting:       Fixed some missing timer handling
87
 *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88
 *              Gerhard Koerting:       PC/TCP workarounds
89
 *              Adam Caldwell   :       Assorted timer/timing errors
90
 *              Matthew Dillon  :       Fixed another RST bug
91
 *              Alan Cox        :       Move to kernel side addressing changes.
92
 *              Alan Cox        :       Beginning work on TCP fastpathing
93
 *                                      (not yet usable)
94
 *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95
 *              Alan Cox        :       TCP fast path debugging
96
 *              Alan Cox        :       Window clamping
97
 *              Michael Riepe   :       Bug in tcp_check()
98
 *              Matt Dillon     :       More TCP improvements and RST bug fixes
99
 *              Matt Dillon     :       Yet more small nasties remove from the
100
 *                                      TCP code (Be very nice to this man if
101
 *                                      tcp finally works 100%) 8)
102
 *              Alan Cox        :       BSD accept semantics.
103
 *              Alan Cox        :       Reset on closedown bug.
104
 *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105
 *              Michael Pall    :       Handle poll() after URG properly in
106
 *                                      all cases.
107
 *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108
 *                                      (multi URG PUSH broke rlogin).
109
 *              Michael Pall    :       Fix the multi URG PUSH problem in
110
 *                                      tcp_readable(), poll() after URG
111
 *                                      works now.
112
 *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113
 *                                      BSD api.
114
 *              Alan Cox        :       Changed the semantics of sk->socket to
115
 *                                      fix a race and a signal problem with
116
 *                                      accept() and async I/O.
117
 *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118
 *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119
 *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120
 *                                      clients/servers which listen in on
121
 *                                      fixed ports.
122
 *              Alan Cox        :       Cleaned the above up and shrank it to
123
 *                                      a sensible code size.
124
 *              Alan Cox        :       Self connect lockup fix.
125
 *              Alan Cox        :       No connect to multicast.
126
 *              Ross Biro       :       Close unaccepted children on master
127
 *                                      socket close.
128
 *              Alan Cox        :       Reset tracing code.
129
 *              Alan Cox        :       Spurious resets on shutdown.
130
 *              Alan Cox        :       Giant 15 minute/60 second timer error
131
 *              Alan Cox        :       Small whoops in polling before an
132
 *                                      accept.
133
 *              Alan Cox        :       Kept the state trace facility since
134
 *                                      it's handy for debugging.
135
 *              Alan Cox        :       More reset handler fixes.
136
 *              Alan Cox        :       Started rewriting the code based on
137
 *                                      the RFC's for other useful protocol
138
 *                                      references see: Comer, KA9Q NOS, and
139
 *                                      for a reference on the difference
140
 *                                      between specifications and how BSD
141
 *                                      works see the 4.4lite source.
142
 *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143
 *                                      close.
144
 *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145
 *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146
 *              Alan Cox        :       Reimplemented timers as per the RFC
147
 *                                      and using multiple timers for sanity.
148
 *              Alan Cox        :       Small bug fixes, and a lot of new
149
 *                                      comments.
150
 *              Alan Cox        :       Fixed dual reader crash by locking
151
 *                                      the buffers (much like datagram.c)
152
 *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153
 *                                      now gets fed up of retrying without
154
 *                                      (even a no space) answer.
155
 *              Alan Cox        :       Extracted closing code better
156
 *              Alan Cox        :       Fixed the closing state machine to
157
 *                                      resemble the RFC.
158
 *              Alan Cox        :       More 'per spec' fixes.
159
 *              Jorge Cwik      :       Even faster checksumming.
160
 *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161
 *                                      only frames. At least one pc tcp stack
162
 *                                      generates them.
163
 *              Alan Cox        :       Cache last socket.
164
 *              Alan Cox        :       Per route irtt.
165
 *              Matt Day        :       poll()->select() match BSD precisely on error
166
 *              Alan Cox        :       New buffers
167
 *              Marc Tamsky     :       Various sk->prot->retransmits and
168
 *                                      sk->retransmits misupdating fixed.
169
 *                                      Fixed tcp_write_timeout: stuck close,
170
 *                                      and TCP syn retries gets used now.
171
 *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172
 *                                      ack if state is TCP_CLOSED.
173
 *              Alan Cox        :       Look up device on a retransmit - routes may
174
 *                                      change. Doesn't yet cope with MSS shrink right
175
 *                                      but its a start!
176
 *              Marc Tamsky     :       Closing in closing fixes.
177
 *              Mike Shaver     :       RFC1122 verifications.
178
 *              Alan Cox        :       rcv_saddr errors.
179
 *              Alan Cox        :       Block double connect().
180
 *              Alan Cox        :       Small hooks for enSKIP.
181
 *              Alexey Kuznetsov:       Path MTU discovery.
182
 *              Alan Cox        :       Support soft errors.
183
 *              Alan Cox        :       Fix MTU discovery pathological case
184
 *                                      when the remote claims no mtu!
185
 *              Marc Tamsky     :       TCP_CLOSE fix.
186
 *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187
 *                                      window but wrong (fixes NT lpd problems)
188
 *              Pedro Roque     :       Better TCP window handling, delayed ack.
189
 *              Joerg Reuter    :       No modification of locked buffers in
190
 *                                      tcp_do_retransmit()
191
 *              Eric Schenk     :       Changed receiver side silly window
192
 *                                      avoidance algorithm to BSD style
193
 *                                      algorithm. This doubles throughput
194
 *                                      against machines running Solaris,
195
 *                                      and seems to result in general
196
 *                                      improvement.
197
 *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198
 *      Willy Konynenberg       :       Transparent proxying support.
199
 *      Mike McLagan            :       Routing by source
200
 *              Keith Owens     :       Do proper merging with partial SKB's in
201
 *                                      tcp_do_sendmsg to avoid burstiness.
202
 *              Eric Schenk     :       Fix fast close down bug with
203
 *                                      shutdown() followed by close().
204
 *              Andi Kleen      :       Make poll agree with SIGIO
205
 *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206
 *                                      lingertime == 0 (RFC 793 ABORT Call)
207
 *
208
 *              This program is free software; you can redistribute it and/or
209
 *              modify it under the terms of the GNU General Public License
210
 *              as published by the Free Software Foundation; either version
211
 *              2 of the License, or(at your option) any later version.
212
 *
213
 * Description of States:
214
 *
215
 *      TCP_SYN_SENT            sent a connection request, waiting for ack
216
 *
217
 *      TCP_SYN_RECV            received a connection request, sent ack,
218
 *                              waiting for final ack in three-way handshake.
219
 *
220
 *      TCP_ESTABLISHED         connection established
221
 *
222
 *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
223
 *                              transmission of remaining buffered data
224
 *
225
 *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
226
 *                              to shutdown
227
 *
228
 *      TCP_CLOSING             both sides have shutdown but we still have
229
 *                              data we have to finish sending
230
 *
231
 *      TCP_TIME_WAIT           timeout to catch resent junk before entering
232
 *                              closed, can only be entered from FIN_WAIT2
233
 *                              or CLOSING.  Required because the other end
234
 *                              may not have gotten our last ACK causing it
235
 *                              to retransmit the data packet (which we ignore)
236
 *
237
 *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
238
 *                              us to finish writing our data and to shutdown
239
 *                              (we have to close() to move on to LAST_ACK)
240
 *
241
 *      TCP_LAST_ACK            out side has shutdown after remote has
242
 *                              shutdown.  There may still be data in our
243
 *                              buffer that we have to finish sending
244
 *
245
 *      TCP_CLOSE               socket is finished
246
 */
247
 
248
#include <linux/config.h>
249
#include <linux/types.h>
250
#include <linux/fcntl.h>
251
#include <linux/poll.h>
252
#include <linux/init.h>
253
#include <linux/smp_lock.h>
254
#include <linux/fs.h>
255
#include <linux/random.h>
256
 
257
#include <net/icmp.h>
258
#include <net/tcp.h>
259
 
260
#include <asm/uaccess.h>
261
#include <asm/ioctls.h>
262
 
263
int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
264
 
265
struct tcp_mib  tcp_statistics[NR_CPUS*2];
266
 
267
kmem_cache_t *tcp_openreq_cachep;
268
kmem_cache_t *tcp_bucket_cachep;
269
kmem_cache_t *tcp_timewait_cachep;
270
 
271
atomic_t tcp_orphan_count = ATOMIC_INIT(0);
272
 
273
int sysctl_tcp_mem[3];
274
int sysctl_tcp_wmem[3] = { 4*1024, 16*1024, 128*1024 };
275
int sysctl_tcp_rmem[3] = { 4*1024, 87380, 87380*2 };
276
 
277
atomic_t tcp_memory_allocated;  /* Current allocated memory. */
278
atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
279
 
280
/* Pressure flag: try to collapse.
281
 * Technical note: it is used by multiple contexts non atomically.
282
 * All the tcp_mem_schedule() is of this nature: accounting
283
 * is strict, actions are advisory and have some latency. */
284
int tcp_memory_pressure;
285
 
286
#define TCP_PAGES(amt) (((amt)+TCP_MEM_QUANTUM-1)/TCP_MEM_QUANTUM)
287
 
288
int tcp_mem_schedule(struct sock *sk, int size, int kind)
289
{
290
        int amt = TCP_PAGES(size);
291
 
292
        sk->forward_alloc += amt*TCP_MEM_QUANTUM;
293
        atomic_add(amt, &tcp_memory_allocated);
294
 
295
        /* Under limit. */
296
        if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
297
                if (tcp_memory_pressure)
298
                        tcp_memory_pressure = 0;
299
                return 1;
300
        }
301
 
302
        /* Over hard limit. */
303
        if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
304
                tcp_enter_memory_pressure();
305
                goto suppress_allocation;
306
        }
307
 
308
        /* Under pressure. */
309
        if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
310
                tcp_enter_memory_pressure();
311
 
312
        if (kind) {
313
                if (atomic_read(&sk->rmem_alloc) < sysctl_tcp_rmem[0])
314
                        return 1;
315
        } else {
316
                if (sk->wmem_queued < sysctl_tcp_wmem[0])
317
                        return 1;
318
        }
319
 
320
        if (!tcp_memory_pressure ||
321
            sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated)
322
            * TCP_PAGES(sk->wmem_queued+atomic_read(&sk->rmem_alloc)+
323
                        sk->forward_alloc))
324
                return 1;
325
 
326
suppress_allocation:
327
 
328
        if (kind == 0) {
329
                tcp_moderate_sndbuf(sk);
330
 
331
                /* Fail only if socket is _under_ its sndbuf.
332
                 * In this case we cannot block, so that we have to fail.
333
                 */
334
                if (sk->wmem_queued+size >= sk->sndbuf)
335
                        return 1;
336
        }
337
 
338
        /* Alas. Undo changes. */
339
        sk->forward_alloc -= amt*TCP_MEM_QUANTUM;
340
        atomic_sub(amt, &tcp_memory_allocated);
341
        return 0;
342
}
343
 
344
void __tcp_mem_reclaim(struct sock *sk)
345
{
346
        if (sk->forward_alloc >= TCP_MEM_QUANTUM) {
347
                atomic_sub(sk->forward_alloc/TCP_MEM_QUANTUM, &tcp_memory_allocated);
348
                sk->forward_alloc &= (TCP_MEM_QUANTUM-1);
349
                if (tcp_memory_pressure &&
350
                    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
351
                        tcp_memory_pressure = 0;
352
        }
353
}
354
 
355
void tcp_rfree(struct sk_buff *skb)
356
{
357
        struct sock *sk = skb->sk;
358
 
359
        atomic_sub(skb->truesize, &sk->rmem_alloc);
360
        sk->forward_alloc += skb->truesize;
361
}
362
 
363
/*
364
 * LISTEN is a special case for poll..
365
 */
366
static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
367
{
368
        return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0;
369
}
370
 
371
/*
372
 *      Wait for a TCP event.
373
 *
374
 *      Note that we don't need to lock the socket, as the upper poll layers
375
 *      take care of normal races (between the test and the event) and we don't
376
 *      go look at any of the socket buffers directly.
377
 */
378
unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
379
{
380
        unsigned int mask;
381
        struct sock *sk = sock->sk;
382
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
383
 
384
        poll_wait(file, sk->sleep, wait);
385
        if (sk->state == TCP_LISTEN)
386
                return tcp_listen_poll(sk, wait);
387
 
388
        /* Socket is not locked. We are protected from async events
389
           by poll logic and correct handling of state changes
390
           made by another threads is impossible in any case.
391
         */
392
 
393
        mask = 0;
394
        if (sk->err)
395
                mask = POLLERR;
396
 
397
        /*
398
         * POLLHUP is certainly not done right. But poll() doesn't
399
         * have a notion of HUP in just one direction, and for a
400
         * socket the read side is more interesting.
401
         *
402
         * Some poll() documentation says that POLLHUP is incompatible
403
         * with the POLLOUT/POLLWR flags, so somebody should check this
404
         * all. But careful, it tends to be safer to return too many
405
         * bits than too few, and you can easily break real applications
406
         * if you don't tell them that something has hung up!
407
         *
408
         * Check-me.
409
         *
410
         * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
411
         * our fs/select.c). It means that after we received EOF,
412
         * poll always returns immediately, making impossible poll() on write()
413
         * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
414
         * if and only if shutdown has been made in both directions.
415
         * Actually, it is interesting to look how Solaris and DUX
416
         * solve this dilemma. I would prefer, if PULLHUP were maskable,
417
         * then we could set it on SND_SHUTDOWN. BTW examples given
418
         * in Stevens' books assume exactly this behaviour, it explains
419
         * why PULLHUP is incompatible with POLLOUT.    --ANK
420
         *
421
         * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
422
         * blocking on fresh not-connected or disconnected socket. --ANK
423
         */
424
        if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
425
                mask |= POLLHUP;
426
        if (sk->shutdown & RCV_SHUTDOWN)
427
                mask |= POLLIN | POLLRDNORM;
428
 
429
        /* Connected? */
430
        if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
431
                /* Potential race condition. If read of tp below will
432
                 * escape above sk->state, we can be illegally awaken
433
                 * in SYN_* states. */
434
                if ((tp->rcv_nxt != tp->copied_seq) &&
435
                    (tp->urg_seq != tp->copied_seq ||
436
                     tp->rcv_nxt != tp->copied_seq+1 ||
437
                     sk->urginline || !tp->urg_data))
438
                        mask |= POLLIN | POLLRDNORM;
439
 
440
                if (!(sk->shutdown & SEND_SHUTDOWN)) {
441
                        if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
442
                                mask |= POLLOUT | POLLWRNORM;
443
                        } else {  /* send SIGIO later */
444
                                set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
445
                                set_bit(SOCK_NOSPACE, &sk->socket->flags);
446
 
447
                                /* Race breaker. If space is freed after
448
                                 * wspace test but before the flags are set,
449
                                 * IO signal will be lost.
450
                                 */
451
                                if (tcp_wspace(sk) >= tcp_min_write_space(sk))
452
                                        mask |= POLLOUT | POLLWRNORM;
453
                        }
454
                }
455
 
456
                if (tp->urg_data & TCP_URG_VALID)
457
                        mask |= POLLPRI;
458
        }
459
        return mask;
460
}
461
 
462
/*
463
 *      TCP socket write_space callback.
464
 */
465
void tcp_write_space(struct sock *sk)
466
{
467
        struct socket *sock = sk->socket;
468
 
469
        if (tcp_wspace(sk) >= tcp_min_write_space(sk) && sock) {
470
                clear_bit(SOCK_NOSPACE, &sock->flags);
471
 
472
                if (sk->sleep && waitqueue_active(sk->sleep))
473
                        wake_up_interruptible(sk->sleep);
474
 
475
                if (sock->fasync_list && !(sk->shutdown&SEND_SHUTDOWN))
476
                        sock_wake_async(sock, 2, POLL_OUT);
477
        }
478
}
479
 
480
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
481
{
482
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
483
        int answ;
484
 
485
        switch(cmd) {
486
        case SIOCINQ:
487
                if (sk->state == TCP_LISTEN)
488
                        return(-EINVAL);
489
 
490
                lock_sock(sk);
491
                if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
492
                        answ = 0;
493
                else if (sk->urginline || !tp->urg_data ||
494
                         before(tp->urg_seq,tp->copied_seq) ||
495
                         !before(tp->urg_seq,tp->rcv_nxt)) {
496
                        answ = tp->rcv_nxt - tp->copied_seq;
497
 
498
                        /* Subtract 1, if FIN is in queue. */
499
                        if (answ && !skb_queue_empty(&sk->receive_queue))
500
                                answ -= ((struct sk_buff*)sk->receive_queue.prev)->h.th->fin;
501
                } else
502
                        answ = tp->urg_seq - tp->copied_seq;
503
                release_sock(sk);
504
                break;
505
        case SIOCATMARK:
506
                {
507
                        answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
508
                        break;
509
                }
510
        case SIOCOUTQ:
511
                if (sk->state == TCP_LISTEN)
512
                        return(-EINVAL);
513
 
514
                if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
515
                        answ = 0;
516
                else
517
                        answ = tp->write_seq - tp->snd_una;
518
                break;
519
        default:
520
                return(-ENOIOCTLCMD);
521
        };
522
 
523
        return put_user(answ, (int *)arg);
524
}
525
 
526
 
527
int tcp_listen_start(struct sock *sk)
528
{
529
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
530
        struct tcp_listen_opt *lopt;
531
 
532
        sk->max_ack_backlog = 0;
533
        sk->ack_backlog = 0;
534
        tp->accept_queue = tp->accept_queue_tail = NULL;
535
        tp->syn_wait_lock = RW_LOCK_UNLOCKED;
536
        tcp_delack_init(tp);
537
 
538
        lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
539
        if (!lopt)
540
                return -ENOMEM;
541
 
542
        memset(lopt, 0, sizeof(struct tcp_listen_opt));
543
        for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
544
                if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
545
                        break;
546
        get_random_bytes(&lopt->hash_rnd, 4);
547
 
548
        write_lock_bh(&tp->syn_wait_lock);
549
        tp->listen_opt = lopt;
550
        write_unlock_bh(&tp->syn_wait_lock);
551
 
552
        /* There is race window here: we announce ourselves listening,
553
         * but this transition is still not validated by get_port().
554
         * It is OK, because this socket enters to hash table only
555
         * after validation is complete.
556
         */
557
        sk->state = TCP_LISTEN;
558
        if (sk->prot->get_port(sk, sk->num) == 0) {
559
                sk->sport = htons(sk->num);
560
 
561
                sk_dst_reset(sk);
562
                sk->prot->hash(sk);
563
 
564
                return 0;
565
        }
566
 
567
        sk->state = TCP_CLOSE;
568
        write_lock_bh(&tp->syn_wait_lock);
569
        tp->listen_opt = NULL;
570
        write_unlock_bh(&tp->syn_wait_lock);
571
        kfree(lopt);
572
        return -EADDRINUSE;
573
}
574
 
575
/*
576
 *      This routine closes sockets which have been at least partially
577
 *      opened, but not yet accepted.
578
 */
579
 
580
static void tcp_listen_stop (struct sock *sk)
581
{
582
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
583
        struct tcp_listen_opt *lopt = tp->listen_opt;
584
        struct open_request *acc_req = tp->accept_queue;
585
        struct open_request *req;
586
        int i;
587
 
588
        tcp_delete_keepalive_timer(sk);
589
 
590
        /* make all the listen_opt local to us */
591
        write_lock_bh(&tp->syn_wait_lock);
592
        tp->listen_opt =NULL;
593
        write_unlock_bh(&tp->syn_wait_lock);
594
        tp->accept_queue = tp->accept_queue_tail = NULL;
595
 
596
        if (lopt->qlen) {
597
                for (i=0; i<TCP_SYNQ_HSIZE; i++) {
598
                        while ((req = lopt->syn_table[i]) != NULL) {
599
                                lopt->syn_table[i] = req->dl_next;
600
                                lopt->qlen--;
601
                                tcp_openreq_free(req);
602
 
603
                /* Following specs, it would be better either to send FIN
604
                 * (and enter FIN-WAIT-1, it is normal close)
605
                 * or to send active reset (abort).
606
                 * Certainly, it is pretty dangerous while synflood, but it is
607
                 * bad justification for our negligence 8)
608
                 * To be honest, we are not able to make either
609
                 * of the variants now.                 --ANK
610
                 */
611
                        }
612
                }
613
        }
614
        BUG_TRAP(lopt->qlen == 0);
615
 
616
        kfree(lopt);
617
 
618
        while ((req=acc_req) != NULL) {
619
                struct sock *child = req->sk;
620
 
621
                acc_req = req->dl_next;
622
 
623
                local_bh_disable();
624
                bh_lock_sock(child);
625
                BUG_TRAP(child->lock.users==0);
626
                sock_hold(child);
627
 
628
                tcp_disconnect(child, O_NONBLOCK);
629
 
630
                sock_orphan(child);
631
 
632
                atomic_inc(&tcp_orphan_count);
633
 
634
                tcp_destroy_sock(child);
635
 
636
                bh_unlock_sock(child);
637
                local_bh_enable();
638
                sock_put(child);
639
 
640
                tcp_acceptq_removed(sk);
641
                tcp_openreq_fastfree(req);
642
        }
643
        BUG_TRAP(sk->ack_backlog == 0);
644
}
645
 
646
/*
647
 *      Wait for a socket to get into the connected state
648
 *
649
 *      Note: Must be called with the socket locked.
650
 */
651
static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p)
652
{
653
        struct task_struct *tsk = current;
654
        DECLARE_WAITQUEUE(wait, tsk);
655
 
656
        while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
657
                if(sk->err)
658
                        return sock_error(sk);
659
                if((1 << sk->state) &
660
                   ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
661
                        return -EPIPE;
662
                if(!*timeo_p)
663
                        return -EAGAIN;
664
                if(signal_pending(tsk))
665
                        return sock_intr_errno(*timeo_p);
666
 
667
                __set_task_state(tsk, TASK_INTERRUPTIBLE);
668
                add_wait_queue(sk->sleep, &wait);
669
                sk->tp_pinfo.af_tcp.write_pending++;
670
 
671
                release_sock(sk);
672
                *timeo_p = schedule_timeout(*timeo_p);
673
                lock_sock(sk);
674
 
675
                __set_task_state(tsk, TASK_RUNNING);
676
                remove_wait_queue(sk->sleep, &wait);
677
                sk->tp_pinfo.af_tcp.write_pending--;
678
        }
679
        return 0;
680
}
681
 
682
static inline int tcp_memory_free(struct sock *sk)
683
{
684
        return sk->wmem_queued < sk->sndbuf;
685
}
686
 
687
/*
688
 *      Wait for more memory for a socket
689
 */
690
static int wait_for_tcp_memory(struct sock * sk, long *timeo)
691
{
692
        int err = 0;
693
        long vm_wait = 0;
694
        long current_timeo = *timeo;
695
        DECLARE_WAITQUEUE(wait, current);
696
 
697
        if (tcp_memory_free(sk))
698
                current_timeo = vm_wait = (net_random()%(HZ/5))+2;
699
 
700
        add_wait_queue(sk->sleep, &wait);
701
        for (;;) {
702
                set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
703
 
704
                set_current_state(TASK_INTERRUPTIBLE);
705
 
706
                if (sk->err || (sk->shutdown & SEND_SHUTDOWN))
707
                        goto do_error;
708
                if (!*timeo)
709
                        goto do_nonblock;
710
                if (signal_pending(current))
711
                        goto do_interrupted;
712
                clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
713
                if (tcp_memory_free(sk) && !vm_wait)
714
                        break;
715
 
716
                set_bit(SOCK_NOSPACE, &sk->socket->flags);
717
                sk->tp_pinfo.af_tcp.write_pending++;
718
                release_sock(sk);
719
                if (!tcp_memory_free(sk) || vm_wait)
720
                        current_timeo = schedule_timeout(current_timeo);
721
                lock_sock(sk);
722
                sk->tp_pinfo.af_tcp.write_pending--;
723
 
724
                if (vm_wait) {
725
                        vm_wait -= current_timeo;
726
                        current_timeo = *timeo;
727
                        if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
728
                            (current_timeo -= vm_wait) < 0)
729
                                current_timeo = 0;
730
                        vm_wait = 0;
731
                }
732
                *timeo = current_timeo;
733
        }
734
out:
735
        current->state = TASK_RUNNING;
736
        remove_wait_queue(sk->sleep, &wait);
737
        return err;
738
 
739
do_error:
740
        err = -EPIPE;
741
        goto out;
742
do_nonblock:
743
        err = -EAGAIN;
744
        goto out;
745
do_interrupted:
746
        err = sock_intr_errno(*timeo);
747
        goto out;
748
}
749
 
750
ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
751
 
752
static inline int
753
can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
754
{
755
        if (i) {
756
                skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
757
                return page == frag->page &&
758
                        off == frag->page_offset+frag->size;
759
        }
760
        return 0;
761
}
762
 
763
static inline void
764
fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size)
765
{
766
        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
767
        frag->page = page;
768
        frag->page_offset = off;
769
        frag->size = size;
770
        skb_shinfo(skb)->nr_frags = i+1;
771
}
772
 
773
static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
774
{
775
        TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
776
        tp->pushed_seq = tp->write_seq;
777
}
778
 
779
static inline int forced_push(struct tcp_opt *tp)
780
{
781
        return after(tp->write_seq, tp->pushed_seq + (tp->max_window>>1));
782
}
783
 
784
static inline void
785
skb_entail(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
786
{
787
        skb->csum = 0;
788
        TCP_SKB_CB(skb)->seq = tp->write_seq;
789
        TCP_SKB_CB(skb)->end_seq = tp->write_seq;
790
        TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
791
        TCP_SKB_CB(skb)->sacked = 0;
792
        __skb_queue_tail(&sk->write_queue, skb);
793
        tcp_charge_skb(sk, skb);
794
        if (tp->send_head == NULL)
795
                tp->send_head = skb;
796
}
797
 
798
static inline void
799
tcp_mark_urg(struct tcp_opt *tp, int flags, struct sk_buff *skb)
800
{
801
        if (flags & MSG_OOB) {
802
                tp->urg_mode = 1;
803
                tp->snd_up = tp->write_seq;
804
                TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
805
        }
806
}
807
 
808
static inline void
809
tcp_push(struct sock *sk, struct tcp_opt *tp, int flags, int mss_now, int nonagle)
810
{
811
        if (tp->send_head) {
812
                struct sk_buff *skb = sk->write_queue.prev;
813
                if (!(flags&MSG_MORE) || forced_push(tp))
814
                        tcp_mark_push(tp, skb);
815
                tcp_mark_urg(tp, flags, skb);
816
                __tcp_push_pending_frames(sk, tp, mss_now, (flags&MSG_MORE) ? 2 : nonagle);
817
        }
818
}
819
 
820
static int tcp_error(struct sock *sk, int flags, int err)
821
{
822
        if (err == -EPIPE)
823
                err = sock_error(sk) ? : -EPIPE;
824
        if (err == -EPIPE && !(flags&MSG_NOSIGNAL))
825
                send_sig(SIGPIPE, current, 0);
826
        return err;
827
}
828
 
829
ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
830
{
831
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
832
        int mss_now;
833
        int err;
834
        ssize_t copied;
835
        long timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
836
 
837
        /* Wait for a connection to finish. */
838
        if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
839
                if((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
840
                        goto out_err;
841
 
842
        clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
843
 
844
        mss_now = tcp_current_mss(sk);
845
        copied = 0;
846
 
847
        err = -EPIPE;
848
        if (sk->err || (sk->shutdown & SEND_SHUTDOWN))
849
                goto do_error;
850
 
851
        while (psize > 0) {
852
                struct sk_buff *skb = sk->write_queue.prev;
853
                int offset, size, copy, i;
854
                struct page *page;
855
 
856
                page = pages[poffset/PAGE_SIZE];
857
                offset = poffset % PAGE_SIZE;
858
                size = min_t(size_t, psize, PAGE_SIZE-offset);
859
 
860
                if (tp->send_head==NULL || (copy = mss_now - skb->len) <= 0) {
861
new_segment:
862
                        if (!tcp_memory_free(sk))
863
                                goto wait_for_sndbuf;
864
 
865
                        skb = tcp_alloc_pskb(sk, 0, tp->mss_cache, sk->allocation);
866
                        if (skb == NULL)
867
                                goto wait_for_memory;
868
 
869
                        skb_entail(sk, tp, skb);
870
                        copy = mss_now;
871
                }
872
 
873
                if (copy > size)
874
                        copy = size;
875
 
876
                i = skb_shinfo(skb)->nr_frags;
877
                if (can_coalesce(skb, i, page, offset)) {
878
                        skb_shinfo(skb)->frags[i-1].size += copy;
879
                } else if (i < MAX_SKB_FRAGS) {
880
                        get_page(page);
881
                        fill_page_desc(skb, i, page, offset, copy);
882
                } else {
883
                        tcp_mark_push(tp, skb);
884
                        goto new_segment;
885
                }
886
 
887
                skb->len += copy;
888
                skb->data_len += copy;
889
                skb->ip_summed = CHECKSUM_HW;
890
                tp->write_seq += copy;
891
                TCP_SKB_CB(skb)->end_seq += copy;
892
 
893
                if (!copied)
894
                        TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
895
 
896
                copied += copy;
897
                poffset += copy;
898
                if (!(psize -= copy))
899
                        goto out;
900
 
901
                if (skb->len != mss_now || (flags&MSG_OOB))
902
                        continue;
903
 
904
                if (forced_push(tp)) {
905
                        tcp_mark_push(tp, skb);
906
                        __tcp_push_pending_frames(sk, tp, mss_now, 1);
907
                } else if (skb == tp->send_head)
908
                        tcp_push_one(sk, mss_now);
909
                continue;
910
 
911
wait_for_sndbuf:
912
                set_bit(SOCK_NOSPACE, &sk->socket->flags);
913
wait_for_memory:
914
                if (copied)
915
                        tcp_push(sk, tp, flags&~MSG_MORE, mss_now, 1);
916
 
917
                if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
918
                        goto do_error;
919
 
920
                mss_now = tcp_current_mss(sk);
921
        }
922
 
923
out:
924
        if (copied)
925
                tcp_push(sk, tp, flags, mss_now, tp->nonagle);
926
        return copied;
927
 
928
do_error:
929
        if (copied)
930
                goto out;
931
out_err:
932
        return tcp_error(sk, flags, err);
933
}
934
 
935
ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
936
{
937
        ssize_t res;
938
        struct sock *sk = sock->sk;
939
 
940
#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
941
 
942
        if (!(sk->route_caps & NETIF_F_SG) ||
943
            !(sk->route_caps & TCP_ZC_CSUM_FLAGS))
944
                return sock_no_sendpage(sock, page, offset, size, flags);
945
 
946
#undef TCP_ZC_CSUM_FLAGS
947
 
948
        lock_sock(sk);
949
        TCP_CHECK_TIMER(sk);
950
        res = do_tcp_sendpages(sk, &page, offset, size, flags);
951
        TCP_CHECK_TIMER(sk);
952
        release_sock(sk);
953
        return res;
954
}
955
 
956
#define TCP_PAGE(sk)    (sk->tp_pinfo.af_tcp.sndmsg_page)
957
#define TCP_OFF(sk)     (sk->tp_pinfo.af_tcp.sndmsg_off)
958
 
959
static inline int
960
tcp_copy_to_page(struct sock *sk, char *from, struct sk_buff *skb,
961
                 struct page *page, int off, int copy)
962
{
963
        int err = 0;
964
        unsigned int csum;
965
 
966
        csum = csum_and_copy_from_user(from, page_address(page)+off,
967
                                       copy, 0, &err);
968
        if (!err) {
969
                if (skb->ip_summed == CHECKSUM_NONE)
970
                        skb->csum = csum_block_add(skb->csum, csum, skb->len);
971
                skb->len += copy;
972
                skb->data_len += copy;
973
                skb->truesize += copy;
974
                sk->wmem_queued += copy;
975
                sk->forward_alloc -= copy;
976
        }
977
        return err;
978
}
979
 
980
static inline int
981
skb_add_data(struct sk_buff *skb, char *from, int copy)
982
{
983
        int err = 0;
984
        unsigned int csum;
985
        int off = skb->len;
986
 
987
        csum = csum_and_copy_from_user(from, skb_put(skb, copy),
988
                                       copy, 0, &err);
989
        if (!err) {
990
                skb->csum = csum_block_add(skb->csum, csum, off);
991
                return 0;
992
        }
993
 
994
        __skb_trim(skb, off);
995
        return -EFAULT;
996
}
997
 
998
static inline int select_size(struct sock *sk, struct tcp_opt *tp)
999
{
1000
        int tmp = tp->mss_cache;
1001
 
1002
        if (sk->route_caps&NETIF_F_SG) {
1003
                int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1004
 
1005
                if (tmp >= pgbreak && tmp <= pgbreak + (MAX_SKB_FRAGS-1)*PAGE_SIZE)
1006
                        tmp = pgbreak;
1007
        }
1008
        return tmp;
1009
}
1010
 
1011
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
1012
{
1013
        struct iovec *iov;
1014
        struct tcp_opt *tp;
1015
        struct sk_buff *skb;
1016
        int iovlen, flags;
1017
        int mss_now;
1018
        int err, copied;
1019
        long timeo;
1020
 
1021
        tp = &(sk->tp_pinfo.af_tcp);
1022
 
1023
        lock_sock(sk);
1024
        TCP_CHECK_TIMER(sk);
1025
 
1026
        flags = msg->msg_flags;
1027
        timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
1028
 
1029
        /* Wait for a connection to finish. */
1030
        if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1031
                if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
1032
                        goto out_err;
1033
 
1034
        /* This should be in poll */
1035
        clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
1036
 
1037
        mss_now = tcp_current_mss(sk);
1038
 
1039
        /* Ok commence sending. */
1040
        iovlen = msg->msg_iovlen;
1041
        iov = msg->msg_iov;
1042
        copied = 0;
1043
 
1044
        err = -EPIPE;
1045
        if (sk->err || (sk->shutdown&SEND_SHUTDOWN))
1046
                goto do_error;
1047
 
1048
        while (--iovlen >= 0) {
1049
                int seglen=iov->iov_len;
1050
                unsigned char * from=iov->iov_base;
1051
 
1052
                iov++;
1053
 
1054
                while (seglen > 0) {
1055
                        int copy;
1056
 
1057
                        skb = sk->write_queue.prev;
1058
 
1059
                        if (tp->send_head == NULL ||
1060
                            (copy = mss_now - skb->len) <= 0) {
1061
 
1062
new_segment:
1063
                                /* Allocate new segment. If the interface is SG,
1064
                                 * allocate skb fitting to single page.
1065
                                 */
1066
                                if (!tcp_memory_free(sk))
1067
                                        goto wait_for_sndbuf;
1068
 
1069
                                skb = tcp_alloc_pskb(sk, select_size(sk, tp), 0, sk->allocation);
1070
                                if (skb == NULL)
1071
                                        goto wait_for_memory;
1072
 
1073
                                skb_entail(sk, tp, skb);
1074
                                copy = mss_now;
1075
                        }
1076
 
1077
                        /* Try to append data to the end of skb. */
1078
                        if (copy > seglen)
1079
                                copy = seglen;
1080
 
1081
                        /* Where to copy to? */
1082
                        if (skb_tailroom(skb) > 0) {
1083
                                /* We have some space in skb head. Superb! */
1084
                                if (copy > skb_tailroom(skb))
1085
                                        copy = skb_tailroom(skb);
1086
                                if ((err = skb_add_data(skb, from, copy)) != 0)
1087
                                        goto do_fault;
1088
                        } else {
1089
                                int merge = 0;
1090
                                int i = skb_shinfo(skb)->nr_frags;
1091
                                struct page *page = TCP_PAGE(sk);
1092
                                int off = TCP_OFF(sk);
1093
 
1094
                                if (can_coalesce(skb, i, page, off) && off != PAGE_SIZE) {
1095
                                        /* We can extend the last page fragment. */
1096
                                        merge = 1;
1097
                                } else if (i == MAX_SKB_FRAGS ||
1098
                                           (i == 0 && !(sk->route_caps&NETIF_F_SG))) {
1099
                                        /* Need to add new fragment and cannot
1100
                                         * do this because interface is non-SG,
1101
                                         * or because all the page slots are busy.
1102
                                         */
1103
                                        tcp_mark_push(tp, skb);
1104
                                        goto new_segment;
1105
                                } else if (page) {
1106
                                        /* If page is cached, align
1107
                                         * offset to L1 cache boundary
1108
                                         */
1109
                                        off = (off+L1_CACHE_BYTES-1)&~(L1_CACHE_BYTES-1);
1110
                                        if (off == PAGE_SIZE) {
1111
                                                put_page(page);
1112
                                                TCP_PAGE(sk) = page = NULL;
1113
                                        }
1114
                                }
1115
 
1116
                                if (!page) {
1117
                                        /* Allocate new cache page. */
1118
                                        if (!(page=tcp_alloc_page(sk)))
1119
                                                goto wait_for_memory;
1120
                                        off = 0;
1121
                                }
1122
 
1123
                                if (copy > PAGE_SIZE-off)
1124
                                        copy = PAGE_SIZE-off;
1125
 
1126
                                /* Time to copy data. We are close to the end! */
1127
                                err = tcp_copy_to_page(sk, from, skb, page, off, copy);
1128
                                if (err) {
1129
                                        /* If this page was new, give it to the
1130
                                         * socket so it does not get leaked.
1131
                                         */
1132
                                        if (TCP_PAGE(sk) == NULL) {
1133
                                                TCP_PAGE(sk) = page;
1134
                                                TCP_OFF(sk) = 0;
1135
                                        }
1136
                                        goto do_error;
1137
                                }
1138
 
1139
                                /* Update the skb. */
1140
                                if (merge) {
1141
                                        skb_shinfo(skb)->frags[i-1].size += copy;
1142
                                } else {
1143
                                        fill_page_desc(skb, i, page, off, copy);
1144
                                        if (TCP_PAGE(sk)) {
1145
                                                get_page(page);
1146
                                        } else if (off + copy < PAGE_SIZE) {
1147
                                                get_page(page);
1148
                                                TCP_PAGE(sk) = page;
1149
                                        }
1150
                                }
1151
 
1152
                                TCP_OFF(sk) = off+copy;
1153
                        }
1154
 
1155
                        if (!copied)
1156
                                TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1157
 
1158
                        tp->write_seq += copy;
1159
                        TCP_SKB_CB(skb)->end_seq += copy;
1160
 
1161
                        from += copy;
1162
                        copied += copy;
1163
                        if ((seglen -= copy) == 0 && iovlen == 0)
1164
                                goto out;
1165
 
1166
                        if (skb->len != mss_now || (flags&MSG_OOB))
1167
                                continue;
1168
 
1169
                        if (forced_push(tp)) {
1170
                                tcp_mark_push(tp, skb);
1171
                                __tcp_push_pending_frames(sk, tp, mss_now, 1);
1172
                        } else if (skb == tp->send_head)
1173
                                tcp_push_one(sk, mss_now);
1174
                        continue;
1175
 
1176
wait_for_sndbuf:
1177
                        set_bit(SOCK_NOSPACE, &sk->socket->flags);
1178
wait_for_memory:
1179
                        if (copied)
1180
                                tcp_push(sk, tp, flags&~MSG_MORE, mss_now, 1);
1181
 
1182
                        if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
1183
                                goto do_error;
1184
 
1185
                        mss_now = tcp_current_mss(sk);
1186
                }
1187
        }
1188
 
1189
out:
1190
        if (copied)
1191
                tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1192
        TCP_CHECK_TIMER(sk);
1193
        release_sock(sk);
1194
        return copied;
1195
 
1196
do_fault:
1197
        if (skb->len == 0) {
1198
                if (tp->send_head == skb)
1199
                        tp->send_head = NULL;
1200
                __skb_unlink(skb, skb->list);
1201
                tcp_free_skb(sk, skb);
1202
        }
1203
 
1204
do_error:
1205
        if (copied)
1206
                goto out;
1207
out_err:
1208
        err = tcp_error(sk, flags, err);
1209
        TCP_CHECK_TIMER(sk);
1210
        release_sock(sk);
1211
        return err;
1212
}
1213
 
1214
/*
1215
 *      Handle reading urgent data. BSD has very simple semantics for
1216
 *      this, no blocking and very strange errors 8)
1217
 */
1218
 
1219
static int tcp_recv_urg(struct sock * sk, long timeo,
1220
                        struct msghdr *msg, int len, int flags,
1221
                        int *addr_len)
1222
{
1223
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1224
 
1225
        /* No URG data to read. */
1226
        if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ)
1227
                return -EINVAL; /* Yes this is right ! */
1228
 
1229
        if (sk->state==TCP_CLOSE && !sk->done)
1230
                return -ENOTCONN;
1231
 
1232
        if (tp->urg_data & TCP_URG_VALID) {
1233
                int err = 0;
1234
                char c = tp->urg_data;
1235
 
1236
                if (!(flags & MSG_PEEK))
1237
                        tp->urg_data = TCP_URG_READ;
1238
 
1239
                /* Read urgent data. */
1240
                msg->msg_flags|=MSG_OOB;
1241
 
1242
                if(len>0) {
1243
                        if (!(flags & MSG_TRUNC))
1244
                                err = memcpy_toiovec(msg->msg_iov, &c, 1);
1245
                        len = 1;
1246
                } else
1247
                        msg->msg_flags|=MSG_TRUNC;
1248
 
1249
                return err ? -EFAULT : len;
1250
        }
1251
 
1252
        if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN))
1253
                return 0;
1254
 
1255
        /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1256
         * the available implementations agree in this case:
1257
         * this call should never block, independent of the
1258
         * blocking state of the socket.
1259
         * Mike <pall@rz.uni-karlsruhe.de>
1260
         */
1261
        return -EAGAIN;
1262
}
1263
 
1264
/*
1265
 *      Release a skb if it is no longer needed. This routine
1266
 *      must be called with interrupts disabled or with the
1267
 *      socket locked so that the sk_buff queue operation is ok.
1268
 */
1269
 
1270
static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1271
{
1272
        __skb_unlink(skb, &sk->receive_queue);
1273
        __kfree_skb(skb);
1274
}
1275
 
1276
/* Clean up the receive buffer for full frames taken by the user,
1277
 * then send an ACK if necessary.  COPIED is the number of bytes
1278
 * tcp_recvmsg has given to the user so far, it speeds up the
1279
 * calculation of whether or not we must ACK for the sake of
1280
 * a window update.
1281
 */
1282
static void cleanup_rbuf(struct sock *sk, int copied)
1283
{
1284
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1285
        int time_to_ack = 0;
1286
 
1287
#if TCP_DEBUG
1288
        struct sk_buff *skb = skb_peek(&sk->receive_queue);
1289
 
1290
        BUG_TRAP(skb==NULL || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1291
#endif
1292
 
1293
        if (tcp_ack_scheduled(tp)) {
1294
                /* Delayed ACKs frequently hit locked sockets during bulk receive. */
1295
                if (tp->ack.blocked
1296
                    /* Once-per-two-segments ACK was not sent by tcp_input.c */
1297
                    || tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss
1298
                    /*
1299
                     * If this read emptied the read buffer, we send an ACK if the
1300
                     * connection is not bidirectional, the user drained the
1301
                     * receive buffer and there was a small segment
1302
                     * in queue.
1303
                     */
1304
                    || (copied > 0 &&
1305
                        (tp->ack.pending&TCP_ACK_PUSHED) &&
1306
                        !tp->ack.pingpong &&
1307
                        atomic_read(&sk->rmem_alloc) == 0)) {
1308
                        time_to_ack = 1;
1309
                }
1310
        }
1311
 
1312
        /* We send an ACK if we can now advertise a non-zero window
1313
         * which has been raised "significantly".
1314
         *
1315
         * Even if the window is raised up to infinity, do not send a window-open ACK
1316
         * in states where we will not receive any more data. It is useless.
1317
         */
1318
        if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) {
1319
                __u32 rcv_window_now = tcp_receive_window(tp);
1320
 
1321
                /* Optimize, __tcp_select_window() is not cheap. */
1322
                if (2*rcv_window_now <= tp->window_clamp) {
1323
                        __u32 new_window = __tcp_select_window(sk);
1324
 
1325
                        /* Send ACK now, if this read freed lots of space
1326
                         * in our buffer. Here, new_window is the newly computed window.
1327
                         * We can advertise it now, if it is not less than current one.
1328
                         * "Lots" means "at least twice" here.
1329
                         */
1330
                        if(new_window && new_window >= 2*rcv_window_now)
1331
                                time_to_ack = 1;
1332
                }
1333
        }
1334
        if (time_to_ack)
1335
                tcp_send_ack(sk);
1336
}
1337
 
1338
/* Now socket state including sk->err is changed only under lock,
1339
 * hence we may omit checks after joining wait queue.
1340
 * We check the receive queue before schedule() only as an optimization;
1341
 * it is very likely that release_sock() added new data.
1342
 */
1343
 
1344
static long tcp_data_wait(struct sock *sk, long timeo)
1345
{
1346
        DECLARE_WAITQUEUE(wait, current);
1347
 
1348
        add_wait_queue(sk->sleep, &wait);
1349
 
1350
        __set_current_state(TASK_INTERRUPTIBLE);
1351
 
1352
        set_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1353
        release_sock(sk);
1354
 
1355
        if (skb_queue_empty(&sk->receive_queue))
1356
                timeo = schedule_timeout(timeo);
1357
 
1358
        lock_sock(sk);
1359
        clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1360
 
1361
        remove_wait_queue(sk->sleep, &wait);
1362
        __set_current_state(TASK_RUNNING);
1363
        return timeo;
1364
}
1365
 
1366
static void tcp_prequeue_process(struct sock *sk)
1367
{
1368
        struct sk_buff *skb;
1369
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1370
 
1371
        net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue);
1372
 
1373
        /* RX process wants to run with disabled BHs, though it is not necessary */
1374
        local_bh_disable();
1375
        while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1376
                sk->backlog_rcv(sk, skb);
1377
        local_bh_enable();
1378
 
1379
        /* Clear memory counter. */
1380
        tp->ucopy.memory = 0;
1381
}
1382
 
1383
static inline
1384
struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1385
{
1386
        struct sk_buff *skb;
1387
        u32 offset;
1388
 
1389
        skb_queue_walk(&sk->receive_queue, skb) {
1390
                offset = seq - TCP_SKB_CB(skb)->seq;
1391
                if (skb->h.th->syn)
1392
                        offset--;
1393
                if (offset < skb->len || skb->h.th->fin) {
1394
                        *off = offset;
1395
                        return skb;
1396
                }
1397
        }
1398
        return NULL;
1399
}
1400
 
1401
/*
1402
 * This routine provides an alternative to tcp_recvmsg() for routines
1403
 * that would like to handle copying from skbuffs directly in 'sendfile'
1404
 * fashion.
1405
 * Note:
1406
 *      - It is assumed that the socket was locked by the caller.
1407
 *      - The routine does not block.
1408
 *      - At present, there is no support for reading OOB data
1409
 *        or for 'peeking' the socket using this routine
1410
 *        (although both would be easy to implement).
1411
 */
1412
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1413
                  sk_read_actor_t recv_actor)
1414
{
1415
        struct sk_buff *skb;
1416
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1417
        u32 seq = tp->copied_seq;
1418
        u32 offset;
1419
        int copied = 0;
1420
 
1421
        if (sk->state == TCP_LISTEN)
1422
                return -ENOTCONN;
1423
        while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1424
                if (offset < skb->len) {
1425
                        size_t used, len;
1426
 
1427
                        len = skb->len - offset;
1428
                        /* Stop reading if we hit a patch of urgent data */
1429
                        if (tp->urg_data) {
1430
                                u32 urg_offset = tp->urg_seq - seq;
1431
                                if (urg_offset < len)
1432
                                        len = urg_offset;
1433
                                if (!len)
1434
                                        break;
1435
                        }
1436
                        used = recv_actor(desc, skb, offset, len);
1437
                        if (used <= len) {
1438
                                seq += used;
1439
                                copied += used;
1440
                                offset += used;
1441
                        }
1442
                        if (offset != skb->len)
1443
                                break;
1444
                }
1445
                if (skb->h.th->fin) {
1446
                        tcp_eat_skb(sk, skb);
1447
                        ++seq;
1448
                        break;
1449
                }
1450
                tcp_eat_skb(sk, skb);
1451
                if (!desc->count)
1452
                        break;
1453
        }
1454
        tp->copied_seq = seq;
1455
        /* Clean up data we have read: This will do ACK frames. */
1456
        if (copied)
1457
                cleanup_rbuf(sk, copied);
1458
        return copied;
1459
}
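
/*
 * Illustrative only, guarded out: a minimal callback of the kind
 * tcp_read_sock() expects.  The signature is assumed to match the
 * recv_actor(desc, skb, offset, len) call above; the function name
 * example_read_actor() is hypothetical.
 */
#if 0
static int example_read_actor(read_descriptor_t *desc, struct sk_buff *skb,
                              unsigned int offset, size_t len)
{
        /* Consume up to desc->count bytes without copying them anywhere;
         * tcp_read_sock() advances tp->copied_seq by the value returned. */
        size_t used = min_t(size_t, len, desc->count);

        desc->count -= used;
        return used;
}
#endif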
1460
 
1461
/*
1462
 *      This routine copies from a sock struct into the user buffer.
1463
 *
1464
 *      Technical note: in 2.3 we work on _locked_ socket, so that
1465
 *      tricks with *seq access order and skb->users are not required.
1466
 *      Probably, code can be easily improved even more.
1467
 */
1468
 
1469
int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
1470
                int len, int nonblock, int flags, int *addr_len)
1471
{
1472
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1473
        int copied = 0;
1474
        u32 peek_seq;
1475
        u32 *seq;
1476
        unsigned long used;
1477
        int err;
1478
        int target;             /* Read at least this many bytes */
1479
        long timeo;
1480
        struct task_struct *user_recv = NULL;
1481
 
1482
        lock_sock(sk);
1483
 
1484
        TCP_CHECK_TIMER(sk);
1485
 
1486
        err = -ENOTCONN;
1487
        if (sk->state == TCP_LISTEN)
1488
                goto out;
1489
 
1490
        timeo = sock_rcvtimeo(sk, nonblock);
1491
 
1492
        /* Urgent data needs to be handled specially. */
1493
        if (flags & MSG_OOB)
1494
                goto recv_urg;
1495
 
1496
        seq = &tp->copied_seq;
1497
        if (flags & MSG_PEEK) {
1498
                peek_seq = tp->copied_seq;
1499
                seq = &peek_seq;
1500
        }
1501
 
1502
        target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1503
 
1504
        do {
1505
                struct sk_buff * skb;
1506
                u32 offset;
1507
 
1508
                /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1509
                if (tp->urg_data && tp->urg_seq == *seq) {
1510
                        if (copied)
1511
                                break;
1512
                        if (signal_pending(current)) {
1513
                                copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1514
                                break;
1515
                        }
1516
                }
1517
 
1518
                /* Next get a buffer. */
1519
 
1520
                skb = skb_peek(&sk->receive_queue);
1521
                do {
1522
                        if (!skb)
1523
                                break;
1524
 
1525
                        /* Now that we have two receive queues this
1526
                         * shouldn't happen.
1527
                         */
1528
                        if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1529
                                printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
1530
                                       *seq, TCP_SKB_CB(skb)->seq);
1531
                                break;
1532
                        }
1533
                        offset = *seq - TCP_SKB_CB(skb)->seq;
1534
                        if (skb->h.th->syn)
1535
                                offset--;
1536
                        if (offset < skb->len)
1537
                                goto found_ok_skb;
1538
                        if (skb->h.th->fin)
1539
                                goto found_fin_ok;
1540
                        BUG_TRAP(flags&MSG_PEEK);
1541
                        skb = skb->next;
1542
                } while (skb != (struct sk_buff *)&sk->receive_queue);
1543
 
1544
                /* Well, if we have backlog, try to process it now. */
1545
 
1546
                if (copied >= target && sk->backlog.tail == NULL)
1547
                        break;
1548
 
1549
                if (copied) {
1550
                        if (sk->err ||
1551
                            sk->state == TCP_CLOSE ||
1552
                            (sk->shutdown & RCV_SHUTDOWN) ||
1553
                            !timeo ||
1554
                            signal_pending(current) ||
1555
                            (flags & MSG_PEEK))
1556
                                break;
1557
                } else {
1558
                        if (sk->done)
1559
                                break;
1560
 
1561
                        if (sk->err) {
1562
                                copied = sock_error(sk);
1563
                                break;
1564
                        }
1565
 
1566
                        if (sk->shutdown & RCV_SHUTDOWN)
1567
                                break;
1568
 
1569
                        if (sk->state == TCP_CLOSE) {
1570
                                if (!sk->done) {
1571
                                         * This occurs when the user tries to read
1573
                                         * from a never-connected socket.
1573
                                         */
1574
                                        copied = -ENOTCONN;
1575
                                        break;
1576
                                }
1577
                                break;
1578
                        }
1579
 
1580
                        if (!timeo) {
1581
                                copied = -EAGAIN;
1582
                                break;
1583
                        }
1584
 
1585
                        if (signal_pending(current)) {
1586
                                copied = sock_intr_errno(timeo);
1587
                                break;
1588
                        }
1589
                }
1590
 
1591
                cleanup_rbuf(sk, copied);
1592
 
1593
                if (tp->ucopy.task == user_recv) {
1594
                        /* Install new reader */
1595
                        if (user_recv == NULL && !(flags&(MSG_TRUNC|MSG_PEEK))) {
1596
                                user_recv = current;
1597
                                tp->ucopy.task = user_recv;
1598
                                tp->ucopy.iov = msg->msg_iov;
1599
                        }
1600
 
1601
                        tp->ucopy.len = len;
1602
 
1603
                        BUG_TRAP(tp->copied_seq == tp->rcv_nxt || (flags&(MSG_PEEK|MSG_TRUNC)));
1604
 
1605
                        /* Ugly... If prequeue is not empty, we have to
1606
                         * process it before releasing socket, otherwise
1607
                         * order will be broken at second iteration.
1608
                         * More elegant solution is required!!!
1609
                         *
1610
                         * Look: we have the following (pseudo)queues:
1611
                         *
1612
                         * 1. packets in flight
1613
                         * 2. backlog
1614
                         * 3. prequeue
1615
                         * 4. receive_queue
1616
                         *
1617
                         * Each queue can be processed only if the next ones
1618
                         * are empty. At this point we have an empty receive_queue.
1619
                         * But the prequeue _can_ be non-empty after the second iteration,
1620
                         * when we jumped to start of loop because backlog
1621
                         * processing added something to receive_queue.
1622
                         * We cannot release_sock(), because backlog contains
1623
                         * packets arrived _after_ prequeued ones.
1624
                         *
1625
                         * In short, the algorithm is clear --- process all
1627
                         * the queues in order. We could do it more directly,
1628
                         * requeueing packets from backlog to prequeue, if it
1628
                         * is not empty. It is more elegant, but eats cycles,
1629
                         * unfortunately.
1630
                         */
1631
                        if (skb_queue_len(&tp->ucopy.prequeue))
1632
                                goto do_prequeue;
1633
 
1634
                        /* __ Set realtime policy in scheduler __ */
1635
                }
1636
 
1637
                if (copied >= target) {
1638
                        /* Do not sleep, just process backlog. */
1639
                        release_sock(sk);
1640
                        lock_sock(sk);
1641
                } else {
1642
                        timeo = tcp_data_wait(sk, timeo);
1643
                }
1644
 
1645
                if (user_recv) {
1646
                        int chunk;
1647
 
1648
                        /* __ Restore normal policy in scheduler __ */
1649
 
1650
                        if ((chunk = len - tp->ucopy.len) != 0) {
1651
                                net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk;
1652
                                len -= chunk;
1653
                                copied += chunk;
1654
                        }
1655
 
1656
                        if (tp->rcv_nxt == tp->copied_seq &&
1657
                            skb_queue_len(&tp->ucopy.prequeue)) {
1658
do_prequeue:
1659
                                tcp_prequeue_process(sk);
1660
 
1661
                                if ((chunk = len - tp->ucopy.len) != 0) {
1662
                                        net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1663
                                        len -= chunk;
1664
                                        copied += chunk;
1665
                                }
1666
                        }
1667
                }
1668
                if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1669
                        if (net_ratelimit())
1670
                                printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1671
                                       current->comm, current->pid);
1672
                        peek_seq = tp->copied_seq;
1673
                }
1674
                continue;
1675
 
1676
        found_ok_skb:
1677
                /* Ok so how much can we use? */
1678
                used = skb->len - offset;
1679
                if (len < used)
1680
                        used = len;
1681
 
1682
                /* Do we have urgent data here? */
1683
                if (tp->urg_data) {
1684
                        u32 urg_offset = tp->urg_seq - *seq;
1685
                        if (urg_offset < used) {
1686
                                if (!urg_offset) {
1687
                                        if (!sk->urginline) {
1688
                                                ++*seq;
1689
                                                offset++;
1690
                                                used--;
1691
                                                if (!used)
1692
                                                        goto skip_copy;
1693
                                        }
1694
                                } else
1695
                                        used = urg_offset;
1696
                        }
1697
                }
1698
 
1699
                if (!(flags&MSG_TRUNC)) {
1700
                        err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, used);
1701
                        if (err) {
1702
                                /* Exception. Bailout! */
1703
                                if (!copied)
1704
                                        copied = -EFAULT;
1705
                                break;
1706
                        }
1707
                }
1708
 
1709
                *seq += used;
1710
                copied += used;
1711
                len -= used;
1712
 
1713
skip_copy:
1714
                if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
1715
                        tp->urg_data = 0;
1716
                        tcp_fast_path_check(sk, tp);
1717
                }
1718
                if (used + offset < skb->len)
1719
                        continue;
1720
 
1721
                if (skb->h.th->fin)
1722
                        goto found_fin_ok;
1723
                if (!(flags & MSG_PEEK))
1724
                        tcp_eat_skb(sk, skb);
1725
                continue;
1726
 
1727
        found_fin_ok:
1728
                /* Process the FIN. */
1729
                ++*seq;
1730
                if (!(flags & MSG_PEEK))
1731
                        tcp_eat_skb(sk, skb);
1732
                break;
1733
        } while (len > 0);
1734
 
1735
        if (user_recv) {
1736
                if (skb_queue_len(&tp->ucopy.prequeue)) {
1737
                        int chunk;
1738
 
1739
                        tp->ucopy.len = copied > 0 ? len : 0;
1740
 
1741
                        tcp_prequeue_process(sk);
1742
 
1743
                        if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1744
                                net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1745
                                len -= chunk;
1746
                                copied += chunk;
1747
                        }
1748
                }
1749
 
1750
                tp->ucopy.task = NULL;
1751
                tp->ucopy.len = 0;
1752
        }
1753
 
1754
        /* According to UNIX98, msg_name/msg_namelen are ignored
1755
         * on a connected socket. I was just happy when I found this 8) --ANK
1756
         */
1757
 
1758
        /* Clean up data we have read: This will do ACK frames. */
1759
        cleanup_rbuf(sk, copied);
1760
 
1761
        TCP_CHECK_TIMER(sk);
1762
        release_sock(sk);
1763
        return copied;
1764
 
1765
out:
1766
        TCP_CHECK_TIMER(sk);
1767
        release_sock(sk);
1768
        return err;
1769
 
1770
recv_urg:
1771
        err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1772
        goto out;
1773
}
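
/*
 * Illustrative only, guarded out: the return-value convention an
 * application sees from the receive path above -- a positive byte count,
 * 0 once the FIN has been consumed, or -1 with EAGAIN when a non-blocking
 * read finds nothing queued.  drain_socket() is a hypothetical name.
 */
#if 0
#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>

static ssize_t drain_socket(int fd, char *buf, size_t len)
{
        for (;;) {
                ssize_t n = recv(fd, buf, len, MSG_DONTWAIT);

                if (n > 0)
                        return n;       /* data copied to the user iovec */
                if (n == 0)
                        return 0;       /* FIN consumed (found_fin_ok) */
                if (errno == EINTR)
                        continue;       /* interrupted, retry */
                return -1;              /* e.g. EAGAIN from the !timeo case */
        }
}
#endif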
1774
 
1775
/*
1776
 *      State processing on a close. This implements the state shift for
1777
 *      sending our FIN frame. Note that we only send a FIN for some
1778
 *      states. A shutdown() may have already sent the FIN, or we may be
1779
 *      closed.
1780
 */
1781
 
1782
static unsigned char new_state[16] = {
1783
  /* current state:        new state:      action:      */
1784
  /* (Invalid)          */ TCP_CLOSE,
1785
  /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1786
  /* TCP_SYN_SENT       */ TCP_CLOSE,
1787
  /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1788
  /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1789
  /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1790
  /* TCP_TIME_WAIT      */ TCP_CLOSE,
1791
  /* TCP_CLOSE          */ TCP_CLOSE,
1792
  /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1793
  /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1794
  /* TCP_LISTEN         */ TCP_CLOSE,
1795
  /* TCP_CLOSING        */ TCP_CLOSING,
1796
};
1797
 
1798
static int tcp_close_state(struct sock *sk)
1799
{
1800
        int next = (int) new_state[sk->state];
1801
        int ns = (next & TCP_STATE_MASK);
1802
 
1803
        tcp_set_state(sk, ns);
1804
 
1805
        return (next & TCP_ACTION_FIN);
1806
}
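
/*
 * Illustration of the new_state[] encoding, guarded out: each entry packs
 * the next state (TCP_STATE_MASK bits) together with an optional
 * TCP_ACTION_FIN flag, which tcp_close_state() splits apart.  The function
 * name new_state_example() is hypothetical.
 */
#if 0
static int new_state_example(void)
{
        int next = new_state[TCP_ESTABLISHED];

        /* TCP_ESTABLISHED maps to TCP_FIN_WAIT1 with TCP_ACTION_FIN set,
         * so tcp_close_state() both moves the socket to FIN_WAIT1 and
         * tells the caller that a FIN must be transmitted. */
        return (next & TCP_STATE_MASK) == TCP_FIN_WAIT1 &&
               (next & TCP_ACTION_FIN) != 0;    /* evaluates to 1 */
}
#endif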
1807
 
1808
/*
1809
 *      Shutdown the sending side of a connection. Much like close except
1810
 *      that we don't shut down the receive side or set sk->dead.
1811
 */
1812
 
1813
void tcp_shutdown(struct sock *sk, int how)
1814
{
1815
        /*      We need to grab some memory, and put together a FIN,
1816
         *      and then put it into the queue to be sent.
1817
         *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1818
         */
1819
        if (!(how & SEND_SHUTDOWN))
1820
                return;
1821
 
1822
        /* If we've already sent a FIN, or it's a closed state, skip this. */
1823
        if ((1 << sk->state) &
1824
            (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
1825
                /* Clear out any half completed packets.  FIN if needed. */
1826
                if (tcp_close_state(sk))
1827
                        tcp_send_fin(sk);
1828
        }
1829
}
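
/*
 * Illustrative only, guarded out: shutdown(fd, SHUT_WR) is what arrives
 * here with SEND_SHUTDOWN set, queueing a FIN while the receive side
 * stays open.  half_close() is a hypothetical helper name.
 */
#if 0
#include <sys/socket.h>

static int half_close(int fd)
{
        /* After this the peer sees EOF, but we can still read its data. */
        return shutdown(fd, SHUT_WR);
}
#endif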
1830
 
1831
 
1832
/*
1833
 *      Return 1 if we still have things to send in our buffers.
1834
 */
1835
 
1836
static inline int closing(struct sock * sk)
1837
{
1838
        return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
1839
}
1840
 
1841
static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1842
{
1843
        /* First the read buffer. */
1844
        __skb_queue_purge(&sk->receive_queue);
1845
 
1846
        /* Next, the error queue. */
1847
        __skb_queue_purge(&sk->error_queue);
1848
 
1849
        /* Next, the write queue. */
1850
        BUG_TRAP(skb_queue_empty(&sk->write_queue));
1851
 
1852
        /* Account for returned memory. */
1853
        tcp_mem_reclaim(sk);
1854
 
1855
        BUG_TRAP(sk->wmem_queued == 0);
1856
        BUG_TRAP(sk->forward_alloc == 0);
1857
 
1858
        /* It is _impossible_ for the backlog to contain anything
1859
         * when we get here.  All user references to this socket
1860
         * have gone away; only the net layer can still touch it.
1861
         */
1862
}
1863
 
1864
/*
1865
 * At this point, there should be no process reference to this
1866
 * socket, and thus no user references at all.  Therefore we
1867
 * can assume the socket waitqueue is inactive and nobody will
1868
 * try to jump onto it.
1869
 */
1870
void tcp_destroy_sock(struct sock *sk)
1871
{
1872
        BUG_TRAP(sk->state==TCP_CLOSE);
1873
        BUG_TRAP(sk->dead);
1874
 
1875
        /* It cannot be in hash table! */
1876
        BUG_TRAP(sk->pprev==NULL);
1877
 
1878
        /* If sk->num is non-zero, it must be bound. */
1879
        BUG_TRAP(!sk->num || sk->prev!=NULL);
1880
 
1881
#ifdef TCP_DEBUG
1882
        if (sk->zapped) {
1883
                printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1884
                sock_hold(sk);
1885
        }
1886
        sk->zapped = 1;
1887
#endif
1888
 
1889
        sk->prot->destroy(sk);
1890
 
1891
        tcp_kill_sk_queues(sk);
1892
 
1893
#ifdef INET_REFCNT_DEBUG
1894
        if (atomic_read(&sk->refcnt) != 1) {
1895
                printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
1896
        }
1897
#endif
1898
 
1899
        atomic_dec(&tcp_orphan_count);
1900
        sock_put(sk);
1901
}
1902
 
1903
void tcp_close(struct sock *sk, long timeout)
1904
{
1905
        struct sk_buff *skb;
1906
        int data_was_unread = 0;
1907
 
1908
        lock_sock(sk);
1909
        sk->shutdown = SHUTDOWN_MASK;
1910
 
1911
        if(sk->state == TCP_LISTEN) {
1912
                tcp_set_state(sk, TCP_CLOSE);
1913
 
1914
                /* Special case. */
1915
                tcp_listen_stop(sk);
1916
 
1917
                goto adjudge_to_death;
1918
        }
1919
 
1920
        /*  We need to flush the recv. buffs.  We do this only on the
1921
         *  descriptor close, not protocol-sourced closes, because the
1922
         *  reader process may not have drained the data yet!
1923
         */
1924
        while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
1925
                u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
1926
                data_was_unread += len;
1927
                __kfree_skb(skb);
1928
        }
1929
 
1930
        tcp_mem_reclaim(sk);
1931
 
1932
        /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1933
         * 3.10, we send a RST here because data was lost.  To
1934
         * witness the awful effects of the old behavior of always
1935
         * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1936
         * a bulk GET in an FTP client, suspend the process, wait
1937
         * for the client to advertise a zero window, then kill -9
1938
         * the FTP client, wheee...  Note: timeout is always zero
1939
         * in such a case.
1940
         */
1941
        if(data_was_unread != 0) {
1942
                /* Unread data was tossed, zap the connection. */
1943
                NET_INC_STATS_USER(TCPAbortOnClose);
1944
                tcp_set_state(sk, TCP_CLOSE);
1945
                tcp_send_active_reset(sk, GFP_KERNEL);
1946
        } else if (sk->linger && sk->lingertime==0) {
1947
                /* Check zero linger _after_ checking for unread data. */
1948
                sk->prot->disconnect(sk, 0);
1949
                NET_INC_STATS_USER(TCPAbortOnData);
1950
        } else if (tcp_close_state(sk)) {
1951
                /* We FIN if the application ate all the data before
1952
                 * zapping the connection.
1953
                 */
1954
 
1955
                /* RED-PEN. Formally speaking, we have broken TCP state
1956
                 * machine. State transitions:
1957
                 *
1958
                 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1959
                 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1960
                 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1961
                 *
1962
                 * are legal only when FIN has been sent (i.e. in window),
1963
                 * rather than queued out of window. Purists may blame us.
1964
                 *
1965
                 * F.e. "RFC state" is ESTABLISHED,
1966
                 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1967
                 *
1968
                 * The visible deviations are that sometimes
1969
                 * we enter the time-wait state when it is not really required
1970
                 * (harmless), and do not send active resets when they are
1971
                 * required by the specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1972
                 * they look like CLOSING or LAST_ACK to Linux).
1973
                 * Probably, I have missed some more small holes.
1974
                 *                                              --ANK
1975
                 */
1976
                tcp_send_fin(sk);
1977
        }
1978
 
1979
        if (timeout) {
1980
                struct task_struct *tsk = current;
1981
                DECLARE_WAITQUEUE(wait, current);
1982
 
1983
                add_wait_queue(sk->sleep, &wait);
1984
 
1985
                do {
1986
                        set_current_state(TASK_INTERRUPTIBLE);
1987
                        if (!closing(sk))
1988
                                break;
1989
                        release_sock(sk);
1990
                        timeout = schedule_timeout(timeout);
1991
                        lock_sock(sk);
1992
                } while (!signal_pending(tsk) && timeout);
1993
 
1994
                tsk->state = TASK_RUNNING;
1995
                remove_wait_queue(sk->sleep, &wait);
1996
        }
1997
 
1998
adjudge_to_death:
1999
        /* This is the last release_sock() in this socket's life. It will process the backlog. */
2000
        release_sock(sk);
2001
 
2002
 
2003
        /* Now socket is owned by kernel and we acquire BH lock
2004
           to finish close. No need to check for user refs.
2005
         */
2006
        local_bh_disable();
2007
        bh_lock_sock(sk);
2008
        BUG_TRAP(sk->lock.users==0);
2009
 
2010
        sock_hold(sk);
2011
        sock_orphan(sk);
2012
 
2013
        /*      This is a (useful) BSD-style violation of the RFC. There is a
2014
         *      problem with TCP as specified in that the other end could
2015
         *      keep a socket open forever with no application left at this end.
2016
         *      We use a 3 minute timeout (about the same as BSD) then kill
2017
         *      our end. If they send after that then tough - BUT: long enough
2018
         *      that we won't make the old 4*rto = almost no time - whoops
2019
         *      reset mistake.
2020
         *
2021
         *      Nope, it was not a mistake. It is really the desired behaviour,
2022
         *      e.g. on HTTP servers, where such sockets are useless, but
2023
         *      consume significant resources. Let's do it with special
2024
         *      linger2 option.                                 --ANK
2025
         */
2026
 
2027
        if (sk->state == TCP_FIN_WAIT2) {
2028
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2029
                if (tp->linger2 < 0) {
2030
                        tcp_set_state(sk, TCP_CLOSE);
2031
                        tcp_send_active_reset(sk, GFP_ATOMIC);
2032
                        NET_INC_STATS_BH(TCPAbortOnLinger);
2033
                } else {
2034
                        int tmo = tcp_fin_time(tp);
2035
 
2036
                        if (tmo > TCP_TIMEWAIT_LEN) {
2037
                                tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2038
                        } else {
2039
                                atomic_inc(&tcp_orphan_count);
2040
                                tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2041
                                goto out;
2042
                        }
2043
                }
2044
        }
2045
        if (sk->state != TCP_CLOSE) {
2046
                tcp_mem_reclaim(sk);
2047
                if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2048
                    (sk->wmem_queued > SOCK_MIN_SNDBUF &&
2049
                     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2050
                        if (net_ratelimit())
2051
                                printk(KERN_INFO "TCP: too many of orphaned sockets\n");
2052
                        tcp_set_state(sk, TCP_CLOSE);
2053
                        tcp_send_active_reset(sk, GFP_ATOMIC);
2054
                        NET_INC_STATS_BH(TCPAbortOnMemory);
2055
                }
2056
        }
2057
        atomic_inc(&tcp_orphan_count);
2058
 
2059
        if (sk->state == TCP_CLOSE)
2060
                tcp_destroy_sock(sk);
2061
        /* Otherwise, socket is reprieved until protocol close. */
2062
 
2063
out:
2064
        bh_unlock_sock(sk);
2065
        local_bh_enable();
2066
        sock_put(sk);
2067
}
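
/*
 * Illustrative only, guarded out: a user-space view of the zero-linger
 * branch above (sk->linger && sk->lingertime == 0).  Enabling SO_LINGER
 * with a zero timeout makes close() abort the connection with a RST
 * instead of the normal FIN handshake.  abortive_close() is a
 * hypothetical name.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static int abortive_close(int fd)
{
        struct linger lin = { 1, 0 };   /* l_onoff = 1, l_linger = 0 */

        if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin)) < 0)
                return -1;
        return close(fd);       /* takes the sk->prot->disconnect() path */
}
#endif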
2068
 
2069
/* These states need RST on ABORT according to RFC793 */
2070
 
2071
static inline int tcp_need_reset(int state)
2072
{
2073
        return ((1 << state) &
2074
                (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
2075
                 TCPF_FIN_WAIT2|TCPF_SYN_RECV));
2076
}
2077
 
2078
int tcp_disconnect(struct sock *sk, int flags)
2079
{
2080
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2081
        int old_state;
2082
        int err = 0;
2083
 
2084
        old_state = sk->state;
2085
        if (old_state != TCP_CLOSE)
2086
                tcp_set_state(sk, TCP_CLOSE);
2087
 
2088
        /* ABORT function of RFC793 */
2089
        if (old_state == TCP_LISTEN) {
2090
                tcp_listen_stop(sk);
2091
        } else if (tcp_need_reset(old_state) ||
2092
                   (tp->snd_nxt != tp->write_seq &&
2093
                    (1<<old_state)&(TCPF_CLOSING|TCPF_LAST_ACK))) {
2094
                /* The last check adjusts for the discrepancy between Linux and the RFC
2095
                 * states
2096
                 */
2097
                tcp_send_active_reset(sk, gfp_any());
2098
                sk->err = ECONNRESET;
2099
        } else if (old_state == TCP_SYN_SENT)
2100
                sk->err = ECONNRESET;
2101
 
2102
        tcp_clear_xmit_timers(sk);
2103
        __skb_queue_purge(&sk->receive_queue);
2104
        tcp_writequeue_purge(sk);
2105
        __skb_queue_purge(&tp->out_of_order_queue);
2106
 
2107
        sk->dport = 0;
2108
 
2109
        if (!(sk->userlocks&SOCK_BINDADDR_LOCK)) {
2110
                sk->rcv_saddr = 0;
2111
                sk->saddr = 0;
2112
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
2113
                memset(&sk->net_pinfo.af_inet6.saddr, 0, 16);
2114
                memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
2115
#endif
2116
        }
2117
 
2118
        sk->shutdown = 0;
2119
        sk->done = 0;
2120
        tp->srtt = 0;
2121
        if ((tp->write_seq += tp->max_window+2) == 0)
2122
                tp->write_seq = 1;
2123
        tp->backoff = 0;
2124
        tp->snd_cwnd = 2;
2125
        tp->probes_out = 0;
2126
        tp->packets_out = 0;
2127
        tp->snd_ssthresh = 0x7fffffff;
2128
        tp->snd_cwnd_cnt = 0;
2129
        tp->ca_state = TCP_CA_Open;
2130
        tcp_clear_retrans(tp);
2131
        tcp_delack_init(tp);
2132
        tp->send_head = NULL;
2133
        tp->saw_tstamp = 0;
2134
        tcp_sack_reset(tp);
2135
        __sk_dst_reset(sk);
2136
 
2137
        BUG_TRAP(!sk->num || sk->prev);
2138
 
2139
        sk->error_report(sk);
2140
        return err;
2141
}
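
/*
 * Illustrative only, guarded out: one common way for an application to
 * reach tcp_disconnect() is connect() with an AF_UNSPEC address, which
 * the inet layer is assumed here to translate into
 * sk->prot->disconnect().  tcp_unconnect() is a hypothetical helper name.
 */
#if 0
#include <string.h>
#include <sys/socket.h>

static int tcp_unconnect(int fd)
{
        struct sockaddr sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_family = AF_UNSPEC;
        return connect(fd, &sa, sizeof(sa));
}
#endif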
2142
 
2143
/*
2144
 *      Wait for an incoming connection, avoid race
2145
 *      conditions. This must be called with the socket locked.
2146
 */
2147
static int wait_for_connect(struct sock * sk, long timeo)
2148
{
2149
        DECLARE_WAITQUEUE(wait, current);
2150
        int err;
2151
 
2152
        /*
2153
         * True wake-one mechanism for incoming connections: only
2154
         * one process gets woken up, not the 'whole herd'.
2155
         * Since we do not 'race & poll' for established sockets
2156
         * anymore, the common case will execute the loop only once.
2157
         *
2158
         * Subtle issue: "add_wait_queue_exclusive()" will be added
2159
         * after any current non-exclusive waiters, and we know that
2160
         * it will always _stay_ after any new non-exclusive waiters
2161
         * because all non-exclusive waiters are added at the
2162
         * beginning of the wait-queue. As such, it's ok to "drop"
2163
         * our exclusiveness temporarily when we get woken up without
2164
         * having to remove and re-insert us on the wait queue.
2165
         */
2166
        add_wait_queue_exclusive(sk->sleep, &wait);
2167
        for (;;) {
2168
                current->state = TASK_INTERRUPTIBLE;
2169
                release_sock(sk);
2170
                if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
2171
                        timeo = schedule_timeout(timeo);
2172
                lock_sock(sk);
2173
                err = 0;
2174
                if (sk->tp_pinfo.af_tcp.accept_queue)
2175
                        break;
2176
                err = -EINVAL;
2177
                if (sk->state != TCP_LISTEN)
2178
                        break;
2179
                err = sock_intr_errno(timeo);
2180
                if (signal_pending(current))
2181
                        break;
2182
                err = -EAGAIN;
2183
                if (!timeo)
2184
                        break;
2185
        }
2186
        current->state = TASK_RUNNING;
2187
        remove_wait_queue(sk->sleep, &wait);
2188
        return err;
2189
}
2190
 
2191
/*
2192
 *      This will accept the next outstanding connection.
2193
 */
2194
 
2195
struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2196
{
2197
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2198
        struct open_request *req;
2199
        struct sock *newsk;
2200
        int error;
2201
 
2202
        lock_sock(sk);
2203
 
2204
        /* We need to make sure that this socket is listening,
2205
         * and that it has something pending.
2206
         */
2207
        error = -EINVAL;
2208
        if (sk->state != TCP_LISTEN)
2209
                goto out;
2210
 
2211
        /* Find already established connection */
2212
        if (!tp->accept_queue) {
2213
                long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2214
 
2215
                /* If this is a non-blocking socket, don't sleep. */
2216
                error = -EAGAIN;
2217
                if (!timeo)
2218
                        goto out;
2219
 
2220
                error = wait_for_connect(sk, timeo);
2221
                if (error)
2222
                        goto out;
2223
        }
2224
 
2225
        req = tp->accept_queue;
2226
        if ((tp->accept_queue = req->dl_next) == NULL)
2227
                tp->accept_queue_tail = NULL;
2228
 
2229
        newsk = req->sk;
2230
        tcp_acceptq_removed(sk);
2231
        tcp_openreq_fastfree(req);
2232
        BUG_TRAP(newsk->state != TCP_SYN_RECV);
2233
        release_sock(sk);
2234
        return newsk;
2235
 
2236
out:
2237
        release_sock(sk);
2238
        *err = error;
2239
        return NULL;
2240
}
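
/*
 * Illustrative only, guarded out: with O_NONBLOCK set, the timeo above is
 * zero and tcp_accept() fails with -EAGAIN instead of sleeping in
 * wait_for_connect().  try_accept() is a hypothetical name.
 */
#if 0
#include <errno.h>
#include <sys/socket.h>

static int try_accept(int listen_fd)
{
        int fd = accept(listen_fd, NULL, NULL);

        if (fd < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
                return -1;      /* nothing on tp->accept_queue yet */
        return fd;
}
#endif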
2241
 
2242
/*
2243
 *      Socket option code for TCP.
2244
 */
2245
 
2246
int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
2247
                   int optlen)
2248
{
2249
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2250
        int val;
2251
        int err = 0;
2252
 
2253
        if (level != SOL_TCP)
2254
                return tp->af_specific->setsockopt(sk, level, optname,
2255
                                                   optval, optlen);
2256
 
2257
        if(optlen<sizeof(int))
2258
                return -EINVAL;
2259
 
2260
        if (get_user(val, (int *)optval))
2261
                return -EFAULT;
2262
 
2263
        lock_sock(sk);
2264
 
2265
        switch(optname) {
2266
        case TCP_MAXSEG:
2267
                /* Values greater than the interface MTU won't take effect.  However, at
2268
                 * the point when this call is done we typically don't yet know
2269
                 * which interface is going to be used
2270
                 */
2271
                if(val < 8 || val > MAX_TCP_WINDOW) {
2272
                        err = -EINVAL;
2273
                        break;
2274
                }
2275
                tp->user_mss = val;
2276
                break;
2277
 
2278
        case TCP_NODELAY:
2279
                /* You cannot try to use this and TCP_CORK in
2280
                 * tandem, so let the user know.
2281
                 */
2282
                if (tp->nonagle == 2) {
2283
                        err = -EINVAL;
2284
                        break;
2285
                }
2286
                tp->nonagle = (val == 0) ? 0 : 1;
2287
                if (val)
2288
                        tcp_push_pending_frames(sk, tp);
2289
                break;
2290
 
2291
        case TCP_CORK:
2292
                /* When set indicates to always queue non-full frames.
2293
                 * Later the user clears this option and we transmit
2294
                 * any pending partial frames in the queue.  This is
2295
                 * meant to be used alongside sendfile() to get properly
2296
                 * filled frames when the user (for example) must write
2297
                 * out headers with a write() call first and then use
2298
                 * sendfile to send out the data parts.
2299
                 *
2300
                 * You cannot try to use TCP_NODELAY and this mechanism
2301
                 * at the same time, so let the user know.
2302
                 */
2303
                if (tp->nonagle == 1) {
2304
                        err = -EINVAL;
2305
                        break;
2306
                }
2307
                if (val != 0) {
2308
                        tp->nonagle = 2;
2309
                } else {
2310
                        tp->nonagle = 0;
2311
 
2312
                        tcp_push_pending_frames(sk, tp);
2313
                }
2314
                break;
2315
 
2316
        case TCP_KEEPIDLE:
2317
                if (val < 1 || val > MAX_TCP_KEEPIDLE)
2318
                        err = -EINVAL;
2319
                else {
2320
                        tp->keepalive_time = val * HZ;
2321
                        if (sk->keepopen && !((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) {
2322
                                __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2323
                                if (tp->keepalive_time > elapsed)
2324
                                        elapsed = tp->keepalive_time - elapsed;
2325
                                else
2326
                                        elapsed = 0;
2327
                                tcp_reset_keepalive_timer(sk, elapsed);
2328
                        }
2329
                }
2330
                break;
2331
        case TCP_KEEPINTVL:
2332
                if (val < 1 || val > MAX_TCP_KEEPINTVL)
2333
                        err = -EINVAL;
2334
                else
2335
                        tp->keepalive_intvl = val * HZ;
2336
                break;
2337
        case TCP_KEEPCNT:
2338
                if (val < 1 || val > MAX_TCP_KEEPCNT)
2339
                        err = -EINVAL;
2340
                else
2341
                        tp->keepalive_probes = val;
2342
                break;
2343
        case TCP_SYNCNT:
2344
                if (val < 1 || val > MAX_TCP_SYNCNT)
2345
                        err = -EINVAL;
2346
                else
2347
                        tp->syn_retries = val;
2348
                break;
2349
 
2350
        case TCP_LINGER2:
2351
                if (val < 0)
2352
                        tp->linger2 = -1;
2353
                else if (val > sysctl_tcp_fin_timeout/HZ)
2354
                        tp->linger2 = 0;
2355
                else
2356
                        tp->linger2 = val*HZ;
2357
                break;
2358
 
2359
        case TCP_DEFER_ACCEPT:
2360
                tp->defer_accept = 0;
2361
                if (val > 0) {
2362
                        /* Translate value in seconds to number of retransmits */
2363
                        while (tp->defer_accept < 32 && val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept))
2364
                                tp->defer_accept++;
2365
                        tp->defer_accept++;
2366
                }
2367
                break;
2368
 
2369
        case TCP_WINDOW_CLAMP:
2370
                if (val==0) {
2371
                        if (sk->state != TCP_CLOSE) {
2372
                                err = -EINVAL;
2373
                                break;
2374
                        }
2375
                        tp->window_clamp = 0;
2376
                } else {
2377
                        tp->window_clamp = val<SOCK_MIN_RCVBUF/2 ?
2378
                                SOCK_MIN_RCVBUF/2 : val;
2379
                }
2380
                break;
2381
 
2382
        case TCP_QUICKACK:
2383
                if (!val) {
2384
                        tp->ack.pingpong = 1;
2385
                } else {
2386
                        tp->ack.pingpong = 0;
2387
                        if ((1<<sk->state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT) &&
2388
                            tcp_ack_scheduled(tp)) {
2389
                                tp->ack.pending |= TCP_ACK_PUSHED;
2390
                                cleanup_rbuf(sk, 1);
2391
                                if (!(val & 1))
2392
                                        tp->ack.pingpong = 1;
2393
                        }
2394
                }
2395
                break;
2396
 
2397
        default:
2398
                err = -ENOPROTOOPT;
2399
                break;
2400
        };
2401
        release_sock(sk);
2402
        return err;
2403
}
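
/*
 * Illustrative only, guarded out: the TCP_CORK pattern described in the
 * comment above -- cork, write the headers, sendfile() the body, then
 * uncork so that tcp_push_pending_frames() flushes the last partial
 * frame.  send_with_header() and its parameters are hypothetical.
 */
#if 0
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/sendfile.h>
#include <sys/socket.h>
#include <unistd.h>

static int send_with_header(int sock, const char *hdr, size_t hdr_len,
                            int file_fd, size_t file_len)
{
        int on = 1, off = 0;

        setsockopt(sock, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
        if (write(sock, hdr, hdr_len) < 0)
                return -1;
        if (sendfile(sock, file_fd, NULL, file_len) < 0)
                return -1;
        /* Clearing TCP_CORK transmits any pending partial frame. */
        return setsockopt(sock, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
}
#endif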
2404
 
2405
int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
2406
                   int *optlen)
2407
{
2408
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2409
        int val, len;
2410
 
2411
        if(level != SOL_TCP)
2412
                return tp->af_specific->getsockopt(sk, level, optname,
2413
                                                   optval, optlen);
2414
 
2415
        if(get_user(len,optlen))
2416
                return -EFAULT;
2417
 
2418
        len = min_t(unsigned int, len, sizeof(int));
2419
 
2420
        if(len < 0)
2421
                return -EINVAL;
2422
 
2423
        switch(optname) {
2424
        case TCP_MAXSEG:
2425
                val = tp->mss_cache;
2426
                if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)))
2427
                        val = tp->user_mss;
2428
                break;
2429
        case TCP_NODELAY:
2430
                val = (tp->nonagle == 1);
2431
                break;
2432
        case TCP_CORK:
2433
                val = (tp->nonagle == 2);
2434
                break;
2435
        case TCP_KEEPIDLE:
2436
                val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ;
2437
                break;
2438
        case TCP_KEEPINTVL:
2439
                val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ;
2440
                break;
2441
        case TCP_KEEPCNT:
2442
                val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2443
                break;
2444
        case TCP_SYNCNT:
2445
                val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2446
                break;
2447
        case TCP_LINGER2:
2448
                val = tp->linger2;
2449
                if (val >= 0)
2450
                        val = (val ? : sysctl_tcp_fin_timeout)/HZ;
2451
                break;
2452
        case TCP_DEFER_ACCEPT:
2453
                val = tp->defer_accept == 0 ? 0 : ((TCP_TIMEOUT_INIT/HZ)<<(tp->defer_accept-1));
2454
                break;
2455
        case TCP_WINDOW_CLAMP:
2456
                val = tp->window_clamp;
2457
                break;
2458
        case TCP_INFO:
2459
        {
2460
                struct tcp_info info;
2461
                u32 now = tcp_time_stamp;
2462
 
2463
                if(get_user(len,optlen))
2464
                        return -EFAULT;
2465
                info.tcpi_state = sk->state;
2466
                info.tcpi_ca_state = tp->ca_state;
2467
                info.tcpi_retransmits = tp->retransmits;
2468
                info.tcpi_probes = tp->probes_out;
2469
                info.tcpi_backoff = tp->backoff;
2470
                info.tcpi_options = 0;
2471
                if (tp->tstamp_ok)
2472
                        info.tcpi_options |= TCPI_OPT_TIMESTAMPS;
2473
                if (tp->sack_ok)
2474
                        info.tcpi_options |= TCPI_OPT_SACK;
2475
                if (tp->wscale_ok) {
2476
                        info.tcpi_options |= TCPI_OPT_WSCALE;
2477
                        info.tcpi_snd_wscale = tp->snd_wscale;
2478
                        info.tcpi_rcv_wscale = tp->rcv_wscale;
2479
                } else {
2480
                        info.tcpi_snd_wscale = 0;
2481
                        info.tcpi_rcv_wscale = 0;
2482
                }
2483
                if (tp->ecn_flags&TCP_ECN_OK)
2484
                        info.tcpi_options |= TCPI_OPT_ECN;
2485
 
2486
                info.tcpi_rto = (1000000*tp->rto)/HZ;
2487
                info.tcpi_ato = (1000000*tp->ack.ato)/HZ;
2488
                info.tcpi_snd_mss = tp->mss_cache;
2489
                info.tcpi_rcv_mss = tp->ack.rcv_mss;
2490
 
2491
                info.tcpi_unacked = tp->packets_out;
2492
                info.tcpi_sacked = tp->sacked_out;
2493
                info.tcpi_lost = tp->lost_out;
2494
                info.tcpi_retrans = tp->retrans_out;
2495
                info.tcpi_fackets = tp->fackets_out;
2496
 
2497
                info.tcpi_last_data_sent = ((now - tp->lsndtime)*1000)/HZ;
2498
                info.tcpi_last_ack_sent = 0;
2499
                info.tcpi_last_data_recv = ((now - tp->ack.lrcvtime)*1000)/HZ;
2500
                info.tcpi_last_ack_recv = ((now - tp->rcv_tstamp)*1000)/HZ;
2501
 
2502
                info.tcpi_pmtu = tp->pmtu_cookie;
2503
                info.tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2504
                info.tcpi_rtt = ((1000000*tp->srtt)/HZ)>>3;
2505
                info.tcpi_rttvar = ((1000000*tp->mdev)/HZ)>>2;
2506
                info.tcpi_snd_ssthresh = tp->snd_ssthresh;
2507
                info.tcpi_snd_cwnd = tp->snd_cwnd;
2508
                info.tcpi_advmss = tp->advmss;
2509
                info.tcpi_reordering = tp->reordering;
2510
 
2511
                len = min_t(unsigned int, len, sizeof(info));
2512
                if(put_user(len, optlen))
2513
                        return -EFAULT;
2514
                if(copy_to_user(optval, &info,len))
2515
                        return -EFAULT;
2516
                return 0;
2517
        }
2518
        case TCP_QUICKACK:
2519
                val = !tp->ack.pingpong;
2520
                break;
2521
        default:
2522
                return -ENOPROTOOPT;
2523
        };
2524
 
2525
        if(put_user(len, optlen))
2526
                return -EFAULT;
2527
        if(copy_to_user(optval, &val,len))
2528
                return -EFAULT;
2529
        return 0;
2530
}
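
/*
 * Illustrative only, guarded out: querying the TCP_INFO block filled in
 * above from user space.  It assumes struct tcp_info and TCP_INFO are
 * visible through <netinet/tcp.h> (or <linux/tcp.h>); print_rtt() is a
 * hypothetical name.
 */
#if 0
#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static void print_rtt(int fd)
{
        struct tcp_info info;
        socklen_t len = sizeof(info);

        if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
                printf("srtt=%u us rttvar=%u us cwnd=%u\n",
                       info.tcpi_rtt, info.tcpi_rttvar, info.tcpi_snd_cwnd);
}
#endif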
2531
 
2532
 
2533
extern void __skb_cb_too_small_for_tcp(int, int);
2534
extern void tcpdiag_init(void);
2535
 
2536
void __init tcp_init(void)
2537
{
2538
        struct sk_buff *skb = NULL;
2539
        unsigned long goal;
2540
        int order, i;
2541
 
2542
        if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2543
                __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2544
                                           sizeof(skb->cb));
2545
 
2546
        tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2547
                                                   sizeof(struct open_request),
2548
                                               0, SLAB_HWCACHE_ALIGN,
2549
                                               NULL, NULL);
2550
        if(!tcp_openreq_cachep)
2551
                panic("tcp_init: Cannot alloc open_request cache.");
2552
 
2553
        tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2554
                                              sizeof(struct tcp_bind_bucket),
2555
                                              0, SLAB_HWCACHE_ALIGN,
2556
                                              NULL, NULL);
2557
        if(!tcp_bucket_cachep)
2558
                panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2559
 
2560
        tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2561
                                                sizeof(struct tcp_tw_bucket),
2562
                                                0, SLAB_HWCACHE_ALIGN,
2563
                                                NULL, NULL);
2564
        if(!tcp_timewait_cachep)
2565
                panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2566
 
2567
        /* Size and allocate the main established and bind bucket
2568
         * hash tables.
2569
         *
2570
         * The methodology is similar to that of the buffer cache.
2571
         */
2572
        if (num_physpages >= (128 * 1024))
2573
                goal = num_physpages >> (21 - PAGE_SHIFT);
2574
        else
2575
                goal = num_physpages >> (23 - PAGE_SHIFT);
2576
 
2577
        for(order = 0; (1UL << order) < goal; order++)
2578
                ;
2579
        do {
2580
                tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2581
                        sizeof(struct tcp_ehash_bucket);
2582
                tcp_ehash_size >>= 1;
2583
                while (tcp_ehash_size & (tcp_ehash_size-1))
2584
                        tcp_ehash_size--;
2585
                tcp_ehash = (struct tcp_ehash_bucket *)
2586
                        __get_free_pages(GFP_ATOMIC, order);
2587
        } while (tcp_ehash == NULL && --order > 0);
2588
 
2589
        if (!tcp_ehash)
2590
                panic("Failed to allocate TCP established hash table\n");
2591
        for (i = 0; i < (tcp_ehash_size<<1); i++) {
2592
                tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2593
                tcp_ehash[i].chain = NULL;
2594
        }
2595
 
2596
        do {
2597
                tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2598
                        sizeof(struct tcp_bind_hashbucket);
2599
                if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2600
                        continue;
2601
                tcp_bhash = (struct tcp_bind_hashbucket *)
2602
                        __get_free_pages(GFP_ATOMIC, order);
2603
        } while (tcp_bhash == NULL && --order >= 0);
2604
 
2605
        if (!tcp_bhash)
2606
                panic("Failed to allocate TCP bind hash table\n");
2607
        for (i = 0; i < tcp_bhash_size; i++) {
2608
                tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2609
                tcp_bhash[i].chain = NULL;
2610
        }
2611
 
2612
        /* Try to be a bit smarter and adjust defaults depending
2613
         * on available memory.
2614
         */
2615
        if (order > 4) {
2616
                sysctl_local_port_range[0] = 32768;
2617
                sysctl_local_port_range[1] = 61000;
2618
                sysctl_tcp_max_tw_buckets = 180000;
2619
                sysctl_tcp_max_orphans = 4096<<(order-4);
2620
                sysctl_max_syn_backlog = 1024;
2621
        } else if (order < 3) {
2622
                sysctl_local_port_range[0] = 1024*(3-order);
2623
                sysctl_tcp_max_tw_buckets >>= (3-order);
2624
                sysctl_tcp_max_orphans >>= (3-order);
2625
                sysctl_max_syn_backlog = 128;
2626
        }
2627
        tcp_port_rover = sysctl_local_port_range[0] - 1;
2628
 
2629
        sysctl_tcp_mem[0] = 768<<order;
2630
        sysctl_tcp_mem[1] = 1024<<order;
2631
        sysctl_tcp_mem[2] = 1536<<order;
2632
        if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 512)
2633
                sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 512;
2634
        if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 512)
2635
                sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 512;
2636
 
2637
        if (order < 3) {
2638
                sysctl_tcp_wmem[2] = 64*1024;
2639
                sysctl_tcp_rmem[0] = PAGE_SIZE;
2640
                sysctl_tcp_rmem[1] = 43689;
2641
                sysctl_tcp_rmem[2] = 2*43689;
2642
        }
2643
 
2644
        printk(KERN_INFO "TCP: Hash tables configured (established %d bind %d)\n",
2645
               tcp_ehash_size<<1, tcp_bhash_size);
2646
 
2647
        (void) tcp_mib_init();
2648
        tcpdiag_init();
2649
}
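
/*
 * Illustrative only, guarded out: the established-hash sizing above,
 * worked through for one assumed configuration -- 512 MB of RAM with
 * 4 KB pages (PAGE_SHIFT == 12), i.e. num_physpages == 131072.  Since
 * that is >= 128*1024, goal becomes 131072 >> (21 - 12) = 256 pages, so
 * the order loop settles on order = 8.  example_ehash_order() is a
 * hypothetical name.
 */
#if 0
static int example_ehash_order(unsigned long pages)
{
        unsigned long goal;
        int order;

        if (pages >= (128 * 1024))
                goal = pages >> (21 - PAGE_SHIFT);
        else
                goal = pages >> (23 - PAGE_SHIFT);

        for (order = 0; (1UL << order) < goal; order++)
                ;
        return order;   /* 8 for the 512 MB / 4 KB-page example */
}
#endif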
