/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:     @(#)tcp.c       1.0.16  05/25/93
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Matthew Dillon, <dillon@apollo.west.oic.com>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Fixes:
 *              Alan Cox        :       Numerous verify_area() calls
 *              Alan Cox        :       Set the ACK bit on a reset
 *              Alan Cox        :       Stopped it crashing if it closed while
 *                                      sk->inuse=1 and was trying to connect
 *                                      (tcp_err()).
 *              Alan Cox        :       All icmp error handling was broken
 *                                      pointers passed were wrong and the
 *                                      socket was looked up backwards. Nobody
 *                                      tested any icmp error code obviously.
 *              Alan Cox        :       tcp_err() now handled properly. It
 *                                      wakes people on errors. select
 *                                      behaves and the icmp error race
 *                                      has gone by moving it into sock.c
 *              Alan Cox        :       tcp_send_reset() fixed to work for
 *                                      everything not just packets for
 *                                      unknown sockets.
 *              Alan Cox        :       tcp option processing.
 *              Alan Cox        :       Reset tweaked (still not 100%) [Had
 *                                      syn rule wrong]
 *              Herp Rosmanith  :       More reset fixes
 *              Alan Cox        :       No longer acks invalid rst frames.
 *                                      Acking any kind of RST is right out.
 *              Alan Cox        :       Sets an ignore me flag on an rst
 *                                      receive otherwise odd bits of prattle
 *                                      escape still
 *              Alan Cox        :       Fixed another acking RST frame bug.
 *                                      Should stop LAN workplace lockups.
 *              Alan Cox        :       Some tidyups using the new skb list
 *                                      facilities
 *              Alan Cox        :       sk->keepopen now seems to work
 *              Alan Cox        :       Pulls options out correctly on accepts
 *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
 *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
 *                                      bit to skb ops.
 *              Alan Cox        :       Tidied tcp_data to avoid a potential
 *                                      nasty.
 *              Alan Cox        :       Added some better commenting, as the
 *                                      tcp is hard to follow
 *              Alan Cox        :       Removed incorrect check for 20 * psh
 *      Michael O'Reilly        :       ack < copied bug fix.
 *      Johannes Stille         :       Misc tcp fixes (not all in yet).
 *              Alan Cox        :       FIN with no memory -> CRASH
 *              Alan Cox        :       Added socket option proto entries.
 *                                      Also added awareness of them to accept.
 *              Alan Cox        :       Added TCP options (SOL_TCP)
 *              Alan Cox        :       Switched wakeup calls to callbacks,
 *                                      so the kernel can layer network
 *                                      sockets.
 *              Alan Cox        :       Use ip_tos/ip_ttl settings.
 *              Alan Cox        :       Handle FIN (more) properly (we hope).
 *              Alan Cox        :       RST frames sent on unsynchronised
 *                                      state ack error.
 *              Alan Cox        :       Put in missing check for SYN bit.
 *              Alan Cox        :       Added tcp_select_window() aka NET2E
 *                                      window non shrink trick.
 *              Alan Cox        :       Added a couple of small NET2E timer
 *                                      fixes
 *              Charles Hedrick :       TCP fixes
 *              Toomas Tamm     :       TCP window fixes
 *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
 *              Charles Hedrick :       Rewrote most of it to actually work
 *              Linus           :       Rewrote tcp_read() and URG handling
 *                                      completely
 *              Gerhard Koerting:       Fixed some missing timer handling
 *              Matthew Dillon  :       Reworked TCP machine states as per RFC
 *              Gerhard Koerting:       PC/TCP workarounds
 *              Adam Caldwell   :       Assorted timer/timing errors
 *              Matthew Dillon  :       Fixed another RST bug
 *              Alan Cox        :       Move to kernel side addressing changes.
 *              Alan Cox        :       Beginning work on TCP fastpathing
 *                                      (not yet usable)
 *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
 *              Alan Cox        :       TCP fast path debugging
 *              Alan Cox        :       Window clamping
 *              Michael Riepe   :       Bug in tcp_check()
 *              Matt Dillon     :       More TCP improvements and RST bug fixes
 *              Matt Dillon     :       Yet more small nasties removed from the
 *                                      TCP code (Be very nice to this man if
 *                                      tcp finally works 100%) 8)
 *              Alan Cox        :       BSD accept semantics.
 *              Alan Cox        :       Reset on closedown bug.
 *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 *              Michael Pall    :       Handle select() after URG properly in
 *                                      all cases.
 *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 *                                      (multi URG PUSH broke rlogin).
 *              Michael Pall    :       Fix the multi URG PUSH problem in
 *                                      tcp_readable(), select() after URG
 *                                      works now.
 *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 *                                      BSD api.
 *              Alan Cox        :       Changed the semantics of sk->socket to
 *                                      fix a race and a signal problem with
 *                                      accept() and async I/O.
 *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 *                                      clients/servers which listen in on
 *                                      fixed ports.
 *              Alan Cox        :       Cleaned the above up and shrank it to
 *                                      a sensible code size.
 *              Alan Cox        :       Self connect lockup fix.
 *              Alan Cox        :       No connect to multicast.
 *              Ross Biro       :       Close unaccepted children on master
 *                                      socket close.
 *              Alan Cox        :       Reset tracing code.
 *              Alan Cox        :       Spurious resets on shutdown.
 *              Alan Cox        :       Giant 15 minute/60 second timer error
 *              Alan Cox        :       Small whoops in selecting before an
 *                                      accept.
 *              Alan Cox        :       Kept the state trace facility since
 *                                      it's handy for debugging.
 *              Alan Cox        :       More reset handler fixes.
 *              Alan Cox        :       Started rewriting the code based on
 *                                      the RFC's for other useful protocol
 *                                      references see: Comer, KA9Q NOS, and
 *                                      for a reference on the difference
 *                                      between specifications and how BSD
 *                                      works see the 4.4lite source.
 *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 *                                      close.
 *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 *              Alan Cox        :       Reimplemented timers as per the RFC
 *                                      and using multiple timers for sanity.
 *              Alan Cox        :       Small bug fixes, and a lot of new
 *                                      comments.
 *              Alan Cox        :       Fixed dual reader crash by locking
 *                                      the buffers (much like datagram.c)
 *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 *                                      now gets fed up of retrying without
 *                                      (even a no space) answer.
 *              Alan Cox        :       Extracted closing code better
 *              Alan Cox        :       Fixed the closing state machine to
 *                                      resemble the RFC.
 *              Alan Cox        :       More 'per spec' fixes.
 *              Jorge Cwik      :       Even faster checksumming.
 *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 *                                      only frames. At least one pc tcp stack
 *                                      generates them.
 *              Alan Cox        :       Cache last socket.
 *              Alan Cox        :       Per route irtt.
 *              Matt Day        :       Select() match BSD precisely on error
 *              Alan Cox        :       New buffers
 *              Marc Tamsky     :       Various sk->prot->retransmits and
 *                                      sk->retransmits misupdating fixed.
 *                                      Fixed tcp_write_timeout: stuck close,
 *                                      and TCP syn retries gets used now.
 *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 *                                      ack if state is TCP_CLOSED.
 *              Alan Cox        :       Look up device on a retransmit - routes may
 *                                      change. Doesn't yet cope with MSS shrink right
 *                                      but it's a start!
 *              Marc Tamsky     :       Closing in closing fixes.
 *              Mike Shaver     :       RFC1122 verifications.
 *              Alan Cox        :       rcv_saddr errors.
 *              Alan Cox        :       Block double connect().
 *              Alan Cox        :       Small hooks for enSKIP.
 *              Alexey Kuznetsov:       Path MTU discovery.
 *              Alan Cox        :       Support soft errors.
 *              Alan Cox        :       Fix MTU discovery pathological case
 *                                      when the remote claims no mtu!
 *              Marc Tamsky     :       TCP_CLOSE fix.
 *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 *                                      window but wrong (fixes NT lpd problems)
 *              Pedro Roque     :       Better TCP window handling, delayed ack.
 *              Joerg Reuter    :       No modification of locked buffers in
 *                                      tcp_do_retransmit()
 *              Eric Schenk     :       Changed receiver side silly window
 *                                      avoidance algorithm to BSD style
 *                                      algorithm. This doubles throughput
 *                                      against machines running Solaris,
 *                                      and seems to result in general
 *                                      improvement.
 *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 *      Willy Konynenberg       :       Transparent proxying support.
 *              Theodore Ts'o   :       Do secure TCP sequence numbers.
 *              David S. Miller :       New socket lookup architecture for ISS.
 *                                      This code is dedicated to John Dyson.
 *              Elliot Poger    :       Added support for SO_BINDTODEVICE.
 *
 * To Fix:
 *              Fast path the code. Two things here - fix the window calculation
 *              so it doesn't iterate over the queue, also spot packets with no funny
 *              options arriving in order and process directly.
 *
 *              Rewrite output state machine to use a single queue.
 *              Speed up input assembly algorithm.
 *              RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 *              could do with it working on IPv4
 *              User settable/learned rtt/max window/mtu
 *
 *              Change the fundamental structure to a single send queue maintained
 *              by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 *              active routes too]). Cut the queue off in tcp_retransmit/
 *              tcp_transmit.
 *              Change the receive queue to assemble as it goes. This lets us
 *              dispose of most of tcp_sequence, half of tcp_ack and chunks of
 *              tcp_data/tcp_read as well as the window shrink crud.
 *              Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 *              tcp_queue_skb seem obvious routines to extract.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Description of States:
 *
 *      TCP_SYN_SENT            sent a connection request, waiting for ack
 *
 *      TCP_SYN_RECV            received a connection request, sent ack,
 *                              waiting for final ack in three-way handshake.
 *
 *      TCP_ESTABLISHED         connection established
 *
 *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 *                              transmission of remaining buffered data
 *
 *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 *                              to shutdown
 *
 *      TCP_CLOSING             both sides have shutdown but we still have
 *                              data we have to finish sending
 *
 *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 *                              closed, can only be entered from FIN_WAIT2
 *                              or CLOSING.  Required because the other end
 *                              may not have gotten our last ACK causing it
 *                              to retransmit the data packet (which we ignore)
 *
 *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 *                              us to finish writing our data and to shutdown
 *                              (we have to close() to move on to LAST_ACK)
 *
 *      TCP_LAST_ACK            our side has shutdown after remote has
 *                              shutdown.  There may still be data in our
 *                              buffer that we have to finish sending
 *
 *      TCP_CLOSE               socket is finished
 */
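
/*
 * Editor's sketch (not part of the original file): the two close paths
 * implied by the state descriptions above, written out as self-contained
 * tables.  The EX_* names are hypothetical stand-ins for the TCP_* state
 * constants, chosen so this example cannot collide with <net/tcp.h>.
 */
enum ex_tcp_state {
        EX_ESTABLISHED, EX_FIN_WAIT1, EX_FIN_WAIT2, EX_CLOSING,
        EX_TIME_WAIT, EX_CLOSE_WAIT, EX_LAST_ACK, EX_CLOSE
};

/* Active close: we shut down first.  (FIN_WAIT1 moves to CLOSING rather
 * than FIN_WAIT2 when both ends shut down simultaneously.)
 */
static const enum ex_tcp_state ex_active_close[] = {
        EX_ESTABLISHED, EX_FIN_WAIT1, EX_FIN_WAIT2, EX_TIME_WAIT, EX_CLOSE
};

/* Passive close: the remote end shuts down first and we close() later. */
static const enum ex_tcp_state ex_passive_close[] = {
        EX_ESTABLISHED, EX_CLOSE_WAIT, EX_LAST_ACK, EX_CLOSE
};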
 
/*
 * RFC1122 status:
 * NOTE: I'm not going to be doing comments in the code for this one except
 * for violations and the like.  tcp.c is just too big... If I say something
 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 * with Alan. -- MS 950903
 *
 * Use of PSH (4.2.2.2)
 *   MAY aggregate data sent without the PSH flag. (does)
 *   MAY queue data received without the PSH flag. (does)
 *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 *   MAY implement PSH on send calls. (doesn't, thus:)
 *     MUST NOT buffer data indefinitely (doesn't [1 second])
 *     MUST set PSH on last segment (does)
 *   MAY pass received PSH to application layer (doesn't)
 *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 *
 * Window Size (4.2.2.3, 4.2.2.16)
 *   MUST treat window size as an unsigned number (does)
 *   SHOULD treat window size as a 32-bit number (does not)
 *   MUST NOT shrink window once it is offered (does not normally)
 *
 * Urgent Pointer (4.2.2.4)
 * **MUST point urgent pointer to last byte of urgent data (not right
 *     after). (doesn't, to be like BSD)
 *   MUST inform application layer asynchronously of incoming urgent
 *     data. (does)
 *   MUST provide application with means of determining the amount of
 *     urgent data pending. (does)
 * **MUST support urgent data sequence of arbitrary length. (doesn't, but
 *   it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 *      [Follows BSD 1 byte of urgent data]
 *
 * TCP Options (4.2.2.5)
 *   MUST be able to receive TCP options in any segment. (does)
 *   MUST ignore unsupported options (does)
 *
 * Maximum Segment Size Option (4.2.2.6)
 *   MUST implement both sending and receiving MSS. (does)
 *   SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 *     it always). (does, even when MSS == 536, which is legal)
 *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 *   MUST calculate "effective send MSS" correctly:
 *     min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 *     (does - but allows operator override; see the worked example after
 *     this comment block)
 *
 * TCP Checksum (4.2.2.7)
 *   MUST generate and check TCP checksum. (does)
 *
 * Initial Sequence Number Selection (4.2.2.8)
 *   MUST use the RFC 793 clock selection mechanism.  (doesn't, but it's
 *     OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 *     necessary for 10Mbps networks - and harder than BSD to spoof!)
 *
 * Simultaneous Open Attempts (4.2.2.10)
 *   MUST support simultaneous open attempts (does)
 *
 * Recovery from Old Duplicate SYN (4.2.2.11)
 *   MUST keep track of active vs. passive open (does)
 *
 * RST segment (4.2.2.12)
 *   SHOULD allow an RST segment to contain data (does, but doesn't do
 *     anything with it, which is standard)
 *
 * Closing a Connection (4.2.2.13)
 *   MUST inform application of whether connection was closed by RST or
 *     normal close. (does)
 *   MAY allow "half-duplex" close (treat connection as closed for the
 *     local app, even before handshake is done). (does)
 *   MUST linger in TIME_WAIT for 2 * MSL (does)
 *
 * Retransmission Timeout (4.2.2.15)
 *   MUST implement Jacobson's slow start and congestion avoidance
 *     stuff. (does)
 *
 * Probing Zero Windows (4.2.2.17)
 *   MUST support probing of zero windows. (does)
 *   MAY keep offered window closed indefinitely. (does)
 *   MUST allow remote window to stay closed indefinitely. (does)
 *
 * Passive Open Calls (4.2.2.18)
 *   MUST NOT let new passive open affect other connections. (doesn't)
 *   MUST support passive opens (LISTENs) concurrently. (does)
 *
 * Time to Live (4.2.2.19)
 *   MUST make TCP TTL configurable. (does - IP_TTL option)
 *
 * Event Processing (4.2.2.20)
 *   SHOULD queue out-of-order segments. (does)
 *   MUST aggregate ACK segments whenever possible. (does but badly)
 *
 * Retransmission Timeout Calculation (4.2.3.1)
 *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 *     calculation. (does, or at least explains them in the comments 8*b)
 *   SHOULD initialize RTO to 0 and RTT to 3. (does)
 *
 * When to Send an ACK Segment (4.2.3.2)
 *   SHOULD implement delayed ACK. (does)
 *   MUST keep ACK delay < 0.5 sec. (does)
 *
 * When to Send a Window Update (4.2.3.3)
 *   MUST implement receiver-side SWS. (does)
 *
 * When to Send Data (4.2.3.4)
 *   MUST implement sender-side SWS. (does)
 *   SHOULD implement Nagle algorithm. (does)
 *
 * TCP Connection Failures (4.2.3.5)
 *   MUST handle excessive retransmissions "properly" (see the RFC). (does)
 *   SHOULD inform application layer of soft errors. (does)
 *
 * TCP Keep-Alives (4.2.3.6)
 *   MAY provide keep-alives. (does)
 *   MUST make keep-alives configurable on a per-connection basis. (does)
 *   MUST default to no keep-alives. (does)
 * **MUST make keep-alive interval configurable. (doesn't)
 * **MUST make default keep-alive interval > 2 hours. (doesn't)
 *   MUST NOT interpret failure to ACK keep-alive packet as dead
 *     connection. (doesn't)
 *   SHOULD send keep-alive with no data. (does)
 *
 * TCP Multihoming (4.2.3.7)
 *   MUST get source address from IP layer before sending first
 *     SYN. (does)
 *   MUST use same local address for all segments of a connection. (does)
 *
 * IP Options (4.2.3.8)
 *   MUST ignore unsupported IP options. (does)
 *   MAY support Time Stamp and Record Route. (does)
 *   MUST allow application to specify a source route. (does)
 *   MUST allow received Source Route option to set route for all future
 *     segments on this connection. (does not (security issues))
 *
 * ICMP messages (4.2.3.9)
 *   MUST act on ICMP errors. (does)
 *   MUST slow transmission upon receipt of a Source Quench. (does)
 *   MUST NOT abort connection upon receipt of soft Destination
 *     Unreachables (0, 1, 5), Time Exceededs and Parameter
 *     Problems. (doesn't)
 *   SHOULD report soft Destination Unreachables etc. to the
 *     application. (does)
 *   SHOULD abort connection upon receipt of hard Destination Unreachable
 *     messages (2, 3, 4). (does)
 *
 * Remote Address Validation (4.2.3.10)
 *   MUST reject as an error OPEN for invalid remote IP address. (does)
 *   MUST ignore SYN with invalid source address. (does)
 *   MUST silently discard incoming SYN for broadcast/multicast
 *     address. (does)
 *
 * Asynchronous Reports (4.2.4.1)
 *   MUST provide mechanism for reporting soft errors to application
 *     layer. (does)
 *
 * Type of Service (4.2.4.2)
 *   MUST allow application layer to set Type of Service. (does IP_TOS)
 *
 * (Whew. -- MS 950903)
 **/
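
/*
 * Editor's worked example (not in the original): the "effective send MSS"
 * rule from 4.2.2.6 quoted above, as a self-contained sketch.  The 20s
 * are the option-less header sizes the rule assumes; the parameter names
 * are hypothetical.
 */
static int example_effective_send_mss(int physical_mtu, int remote_mss,
                                      int ip_option_len)
{
        /* min(physical_MTU, remote_MSS + 20) ... */
        int bound = remote_mss + 20;
        if (physical_mtu < bound)
                bound = physical_mtu;
        /* ... minus the 20-byte TCP header and any IP options.
         * E.g. on Ethernet: min(1500, 1460 + 20) - 20 - 0 = 1460.
         */
        return bound - 20 - ip_option_len;
}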
 
#include <linux/config.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/random.h>

#include <net/icmp.h>
#include <net/tcp.h>

#include <asm/segment.h>

unsigned long seq_offset;
struct tcp_mib  tcp_statistics;

/* This is for sockets with full identity only.  Sockets here will always
 * be without wildcards and will have the following invariant:
 *          TCP_ESTABLISHED <= sk->state < TCP_CLOSE
 */
struct sock *tcp_established_hash[TCP_HTABLE_SIZE];

/* All sockets in TCP_LISTEN state will be in here.  This is the only table
 * where wildcard'd TCP sockets can exist.  Hash function here is just local
 * port number.  XXX Fix or we'll lose with thousands of IP aliases...
 */
struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];

/* Ok, let's try this, I give up, we do need a local binding
 * TCP hash as well as the others for fast bind/connect.
 */
struct sock *tcp_bound_hash[TCP_BHTABLE_SIZE];

extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport);

static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
{
        struct sock *sk2;
        int retval = 0, sk_reuse = sk->reuse;

        SOCKHASH_LOCK();
        sk2 = tcp_bound_hash[tcp_bhashfn(snum)];
        for(; sk2 != NULL; sk2 = sk2->bind_next) {
                if((sk2->num == snum) && (sk2 != sk)) {
                        unsigned char state = sk2->state;
                        int sk2_reuse = sk2->reuse;

                        /* Two sockets can be bound to the same port if they're
                         * bound to different interfaces... */
                        if (sk->bound_device != sk2->bound_device)
                                continue;

                        if(!sk2->rcv_saddr || !sk->rcv_saddr) {
                                if((!sk2_reuse)                 ||
                                   (!sk_reuse)                  ||
                                   (state == TCP_LISTEN)) {
                                        retval = 1;
                                        break;
                                }
                        } else if(sk2->rcv_saddr == sk->rcv_saddr) {
                                if((!sk_reuse)                  ||
                                   (!sk2_reuse)                 ||
                                   (state == TCP_LISTEN)) {
                                        retval = 1;
                                        break;
                                }
                        }
                }
        }
        SOCKHASH_UNLOCK();

        return retval;
}

static __inline__ int tcp_lport_inuse(int num)
{
        struct sock *sk = tcp_bound_hash[tcp_bhashfn(num)];

        for(; sk != NULL; sk = sk->bind_next) {
                if(sk->num == num)
                        return 1;
        }
        return 0;
}

/* Find a "good" local port; this is family independent.
 * There are several strategies working in unison here to
 * get the best possible performance.  The current socket
 * load is kept track of; if it is zero there is a strong
 * likelihood that there is a zero length chain we will
 * find with a small amount of searching, else the load is
 * what we shoot for when the chains all have at least
 * one entry.  The base helps us walk the chains in an
 * order such that a good chain is found as quickly as possible.  -DaveM
 */
unsigned short tcp_good_socknum(void)
{
        static int start = PROT_SOCK;
        static int binding_contour = 0;
        int best = 0;
        int size = 32767; /* a big num. */
        int retval = 0, i, end, bc;

        SOCKHASH_LOCK();
        i = tcp_bhashfn(start);
        end = i + TCP_BHTABLE_SIZE;
        bc = binding_contour;
        do {
                struct sock *sk = tcp_bound_hash[i&(TCP_BHTABLE_SIZE-1)];
                if(!sk) {
                        /* find the smallest value no smaller than start
                         * that has this hash value.
                         */
                        retval = tcp_bhashnext(start-1,i&(TCP_BHTABLE_SIZE-1));

                        /* Check for decreasing load. */
                        if (bc != 0)
                                binding_contour = 0;
                        goto done;
                } else {
                        int j = 0;
                        do { sk = sk->bind_next; } while (++j < size && sk);
                        if (j < size) {
                                best = i&(TCP_BHTABLE_SIZE-1);
                                size = j;
                                if (bc && size <= bc) {
                                        i = best;
                                        goto verify;
                                }
                        }
                }
        } while(++i != end);
        i = best;

        /* Socket load is increasing, adjust our load average. */
        binding_contour = size;
verify:
        if (size < binding_contour)
                binding_contour = size;

        retval = tcp_bhashnext(start-1,i);

        best = retval;  /* mark the starting point to avoid infinite loops */
        while(tcp_lport_inuse(retval)) {
                retval = tcp_bhashnext(retval,i);
                if (retval > 32767)     /* Upper bound */
                        retval = tcp_bhashnext(PROT_SOCK,i);
                if (retval == best) {
                        /* This hash chain is full. No answer. */
                        retval = 0;
                        break;
                }
        }

done:
        start = (retval + 1);
        if (start > 32767 || start < PROT_SOCK)
                start = PROT_SOCK;
        SOCKHASH_UNLOCK();

        return retval;
}

void tcp_v4_hash(struct sock *sk)
{
        unsigned char state;

        SOCKHASH_LOCK();
        state = sk->state;
        if(state != TCP_CLOSE || !sk->dead) {
                struct sock **skp;

                if(state == TCP_LISTEN)
                        skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
                else
                        skp = &tcp_established_hash[tcp_sk_hashfn(sk)];

                if((sk->next = *skp) != NULL)
                        (*skp)->pprev = &sk->next;
                *skp = sk;
                sk->pprev = skp;
                tcp_sk_bindify(sk);
        }
        SOCKHASH_UNLOCK();
}

void tcp_v4_unhash(struct sock *sk)
{
        SOCKHASH_LOCK();
        if(sk->pprev) {
                if(sk->next)
                        sk->next->pprev = sk->pprev;
                *sk->pprev = sk->next;
                sk->pprev = NULL;
                tcp_sk_unbindify(sk);
        }
        SOCKHASH_UNLOCK();
}

void tcp_v4_rehash(struct sock *sk)
{
        unsigned char state;

        SOCKHASH_LOCK();
        state = sk->state;
        if(sk->pprev) {
                if(sk->next)
                        sk->next->pprev = sk->pprev;
                *sk->pprev = sk->next;
                sk->pprev = NULL;
                tcp_sk_unbindify(sk);
        }
        if(state != TCP_CLOSE || !sk->dead) {
                struct sock **skp;

                if(state == TCP_LISTEN)
                        skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
                else
                        skp = &tcp_established_hash[tcp_sk_hashfn(sk)];

                if((sk->next = *skp) != NULL)
                        (*skp)->pprev = &sk->next;
                *skp = sk;
                sk->pprev = skp;
                tcp_sk_bindify(sk);
        }
        SOCKHASH_UNLOCK();
}

static void tcp_close(struct sock *sk, unsigned long timeout);

/*
 *      Find someone to 'accept'. Must be called with
 *      the socket locked or with interrupts disabled
 */

static struct sk_buff *tcp_find_established(struct sock *s)
{
        struct sk_buff *p=skb_peek(&s->receive_queue);
        if(p==NULL)
                return NULL;
        do
        {
                if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
                        return p;
                p=p->next;
        }
        while(p!=(struct sk_buff *)&s->receive_queue);
        return NULL;
}

/*
 *      This routine closes sockets which have been at least partially
 *      opened, but not yet accepted. Currently it is only called by
 *      tcp_close, and timeout mirrors the value there.
 */

static void tcp_close_pending (struct sock *sk)
{
        struct sk_buff *skb;

        while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
        {
                tcp_close(skb->sk, 0);
                kfree_skb(skb, FREE_READ);
        }
        return;
}

/*
 *      Enter the time wait state.
 */

void tcp_time_wait(struct sock *sk)
{
        tcp_set_state(sk,TCP_TIME_WAIT);
        sk->shutdown = SHUTDOWN_MASK;
        if (!sk->dead)
                sk->state_change(sk);
        tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}


/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 */

void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
        __u32 saddr, struct inet_protocol *protocol, int len)
{
        struct tcphdr *th = (struct tcphdr *)header;
        struct sock *sk;

        /*
         *      This one is _WRONG_. FIXME urgently.
         */
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
        struct iphdr *iph=(struct iphdr *)(header-sizeof(struct iphdr));
#endif
        th =(struct tcphdr *)header;

        if (len < 8)    /* NOT sizeof(struct tcphdr) */
                return;

        sk = tcp_v4_lookup(daddr, th->dest, saddr, th->source);
        if (sk == NULL)
                return;

        if (type == ICMP_SOURCE_QUENCH)
        {
                /* Current practice says these frames are bad, plus the drops
                   will account right anyway. If we act on this we stall doubly */
                return;
        }

        if (type == ICMP_PARAMETERPROB)
        {
                sk->err=EPROTO;
                sk->error_report(sk);
        }

#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
        if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
        {
                struct rtable * rt;
                /*
                 * Ugly trick to pass MTU to protocol layer.
                 * Really we should add argument "info" to error handler.
                 */
                unsigned short new_mtu = ntohs(iph->id);

                if ((rt = sk->ip_route_cache) != NULL)
                        if (rt->rt_mtu > new_mtu)
                                rt->rt_mtu = new_mtu;

                /*
                 *      FIXME::
                 *      Not the nicest of fixes: Lose an MTU update if the socket is
                 *      locked this instant. Not the right answer but will be best
                 *      for the production fix. Make 2.1 work right!
                 */

                if (sk->mtu > new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr)
                        && new_mtu > sizeof(struct iphdr)+sizeof(struct tcphdr) && !sk->users)
                        sk->mtu = new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);

                return;
        }
#endif

        /*
         * If we've already connected we will keep trying
         * until we time out, or the user gives up.
         */

        if(code<=NR_ICMP_UNREACH)
        {
                if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
                {
                        sk->err = icmp_err_convert[code].errno;
                        if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
                        {
                                tcp_statistics.TcpAttemptFails++;
                                tcp_set_state(sk,TCP_CLOSE);
                                sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
                        }
                }
                else    /* Only an error on timeout */
                        sk->err_soft = icmp_err_convert[code].errno;
        }
}
 
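/*
 * Editor's sketch (not part of the original): unpacking the positive
 * error encoding described in the comment above tcp_err(), where err > 0
 * carries (icmp type << 8) | icmp code.  Hypothetical helpers.
 */
static __inline__ int example_icmp_err_type(int err)
{
        return (err >> 8) & 0xff;       /* high byte: ICMP type */
}

static __inline__ int example_icmp_err_code(int err)
{
        return err & 0xff;              /* low byte: ICMP code */
}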
 
/*
 *      Walk down the receive queue counting readable data until we hit the end or we find a gap
 *      in the received data queue (i.e. a frame missing that needs sending to us). Not
 *      sorting using two queues as data arrives makes life so much harder.
 */

static int tcp_readable(struct sock *sk)
{
        unsigned long counted;
        unsigned long amount;
        struct sk_buff *skb;
        int sum;
        unsigned long flags;

        if(sk && sk->debug)
                printk("tcp_readable: %p - ",sk);

        save_flags(flags);
        cli();
        if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
        {
                restore_flags(flags);
                if(sk && sk->debug)
                        printk("empty\n");
                return(0);
        }

        counted = sk->copied_seq;       /* Where we are at the moment */
        amount = 0;

        /*
         *      Do until a push or until we are out of data.
         */

        do
        {
                if (before(counted, skb->seq))          /* Found a hole so stop here */
                        break;
                sum = skb->len - (counted - skb->seq);  /* Length - header but start from where we are up to (avoid overlaps) */
                if (skb->h.th->syn)
                        sum++;
                if (sum > 0)
                {                                       /* Add it up, move on */
                        amount += sum;
                        if (skb->h.th->syn)
                                amount--;
                        counted += sum;
                }
                /*
                 * Don't count urg data ... but do it in the right place!
                 * Consider: "old_data (ptr is here) URG PUSH data"
                 * The old code would stop at the first push because
                 * it counted the urg (amount==1) and then does amount--
                 * *after* the loop.  This means tcp_readable() always
                 * returned zero if any URG PUSH was in the queue, even
                 * though there was normal data available. If we subtract
                 * the urg data right here, we even get it to work for more
                 * than one URG PUSH skb without normal data.
                 * This means that select() finally works now with urg data
                 * in the queue.  Note that rlogin was never affected
                 * because it doesn't use select(); it uses two processes
                 * and a blocking read().  And the queue scan in tcp_read()
                 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
                 */
                if (skb->h.th->urg)
                        amount--;       /* don't count urg data */
/*              if (amount && skb->h.th->psh) break;*/
                skb = skb->next;
        }
        while(skb != (struct sk_buff *)&sk->receive_queue);

        restore_flags(flags);
        if(sk->debug)
                printk("got %lu bytes.\n",amount);
        return(amount);
}

/*
 * LISTEN is a special case for select..
 */
static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
{
        if (sel_type == SEL_IN) {
                struct sk_buff * skb;

                lock_sock(sk);
                skb = tcp_find_established(sk);
                release_sock(sk);
                if (skb)
                        return 1;
                select_wait(sk->sleep,wait);
                return 0;
        }
        return 0;
}


/*
 *      Wait for a TCP event.
 *
 *      Note that we don't need to lock the socket, as the upper select layers
 *      take care of normal races (between the test and the event) and we don't
 *      go look at any of the socket buffers directly.
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
        if (sk->state == TCP_LISTEN)
                return tcp_listen_select(sk, sel_type, wait);

        switch(sel_type) {
        case SEL_IN:
                if (sk->err)
                        return 1;
                if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
                        break;

                if (sk->shutdown & RCV_SHUTDOWN)
                        return 1;

                if (sk->acked_seq == sk->copied_seq)
                        break;

                if (sk->urg_seq != sk->copied_seq ||
                    sk->acked_seq != sk->copied_seq+1 ||
                    sk->urginline || !sk->urg_data)
                        return 1;
                break;

        case SEL_OUT:
                if (sk->err)
                        return 1;
                if (sk->shutdown & SEND_SHUTDOWN)
                        return 0;
                if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
                        break;
                if (sk->wmem_alloc*2 > sk->sndbuf)
                        break;
                return 1;

        case SEL_EX:
                if (sk->urg_data & URG_VALID)
                        return 1;
                break;
        }
        select_wait(sk->sleep, wait);
        return 0;
}

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
        int err;
        switch(cmd)
        {

                case TIOCINQ:
#ifdef FIXME    /* FIXME: */
                case FIONREAD:
#endif
                {
                        unsigned long amount;

                        if (sk->state == TCP_LISTEN)
                                return(-EINVAL);

                        lock_sock(sk);
                        amount = tcp_readable(sk);
                        release_sock(sk);
                        err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
                        if(err)
                                return err;
                        put_user(amount, (int *)arg);
                        return(0);
                }
                case SIOCATMARK:
                {
                        int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;

                        err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
                        if (err)
                                return err;
                        put_user(answ,(int *) arg);
                        return(0);
                }
                case TIOCOUTQ:
                {
                        unsigned long amount;

                        if (sk->state == TCP_LISTEN) return(-EINVAL);
                        amount = sock_wspace(sk);
                        err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
                        if(err)
                                return err;
                        put_user(amount, (int *)arg);
                        return(0);
                }
                default:
                        return(-EINVAL);
        }
}
 
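/*
 * Editor's usage sketch (not part of the original file): how an
 * application might exercise the ioctls handled above.  This is
 * userspace code, so it is compiled out behind a hypothetical guard;
 * TIOCINQ, TIOCOUTQ and SIOCATMARK are the standard request names.
 */
#ifdef EXAMPLE_USERSPACE_USAGE          /* hypothetical, never defined */
#include <sys/ioctl.h>

static void example_query_tcp_socket(int fd)
{
        int pending, wspace, at_mark;

        ioctl(fd, TIOCINQ, &pending);    /* readable bytes (tcp_readable) */
        ioctl(fd, TIOCOUTQ, &wspace);    /* free send space (sock_wspace) */
        ioctl(fd, SIOCATMARK, &at_mark); /* 1 if at the urgent mark */
}
#endif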
 
/*
 *      This routine computes a TCP checksum.
 *
 *      Modified January 1995 from a go-faster DOS routine by
 *      Jorge Cwik <jorge@laser.satlink.net>
 */
#undef DEBUG_TCP_CHECK
void tcp_send_check(struct tcphdr *th, unsigned long saddr,
                unsigned long daddr, int len, struct sk_buff *skb)
{
#ifdef DEBUG_TCP_CHECK
        u16 check;
#endif
        th->check = 0;
        th->check = tcp_check(th, len, saddr, daddr,
                csum_partial((char *)th,sizeof(*th),skb->csum));

#ifdef DEBUG_TCP_CHECK
        check = th->check;
        th->check = 0;
        th->check = tcp_check(th, len, saddr, daddr,
                csum_partial((char *)th,len,0));
        if (check != th->check) {
                static int count = 0;
                if (++count < 10) {
                        printk("Checksum %x (%x) from %p\n", th->check, check,
                                (&th)[-1]);
                        printk("TCP=<off:%d a:%d s:%d f:%d>\n", th->doff*4, th->ack, th->syn, th->fin);
                }
        }
#endif
}

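/*
 * Editor's background sketch (not in the original): the 16-bit one's
 * complement sum that tcp_check()/csum_partial() compute incrementally
 * above, written out naively per RFC 1071.  For illustration only.
 */
static unsigned short example_inet_checksum(const unsigned char *buf, int len)
{
        unsigned long sum = 0;

        while (len > 1) {                /* sum 16-bit big-endian words */
                sum += (buf[0] << 8) | buf[1];
                buf += 2;
                len -= 2;
        }
        if (len)                         /* pad a trailing odd byte */
                sum += buf[0] << 8;
        while (sum >> 16)                /* fold the carries back in */
                sum = (sum & 0xffff) + (sum >> 16);
        return (unsigned short)~sum;
}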
 
/*
 *      This routine builds a generic TCP header.
 */

static inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
{
        memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
        th->psh = (push == 0) ? 1 : 0;
        th->seq = htonl(sk->write_seq);
        th->ack_seq = htonl(sk->acked_seq);
        th->window = htons(tcp_select_window(sk));

        return(sizeof(*th));
}
 
/*
 *      Wait for a socket to get into the connected state
 */
static void wait_for_tcp_connect(struct sock * sk)
{
        release_sock(sk);
        cli();
        if (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT && sk->err == 0)
        {
                interruptible_sleep_on(sk->sleep);
        }
        sti();
        lock_sock(sk);
}

static inline int tcp_memory_free(struct sock *sk)
{
        return sk->wmem_alloc < sk->sndbuf;
}

/*
 *      Wait for more memory for a socket
 */
static void wait_for_tcp_memory(struct sock * sk)
{
        release_sock(sk);
        if (!tcp_memory_free(sk)) {
                struct wait_queue wait = { current, NULL };

                sk->socket->flags &= ~SO_NOSPACE;
                add_wait_queue(sk->sleep, &wait);
                for (;;) {
                        if (current->signal & ~current->blocked)
                                break;
                        current->state = TASK_INTERRUPTIBLE;
                        if (tcp_memory_free(sk))
                                break;
                        if (sk->shutdown & SEND_SHUTDOWN)
                                break;
                        if (sk->err)
                                break;
                        schedule();
                }
                current->state = TASK_RUNNING;
                remove_wait_queue(sk->sleep, &wait);
        }
        lock_sock(sk);
}

 
/*
 *      This routine copies from a user buffer into a socket,
 *      and starts the transmit system.
 */

static int do_tcp_sendmsg(struct sock *sk,
        int iovlen, struct iovec *iov,
        int len, int nonblock, int flags)
{
        int copied = 0;
        struct device *dev = NULL;

        /*
         *      Wait for a connection to finish.
         */
        while (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
        {
                if (sk->err)
                        return sock_error(sk);

                if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
                {
                        if (sk->keepopen)
                                send_sig(SIGPIPE, current, 0);
                        return -EPIPE;
                }

                if (nonblock)
                        return -EAGAIN;

                if (current->signal & ~current->blocked)
                        return -ERESTARTSYS;

                wait_for_tcp_connect(sk);
        }

        /*
         *      Ok commence sending
         */

        while (--iovlen >= 0)
        {
                int seglen=iov->iov_len;
                unsigned char * from=iov->iov_base;
                iov++;

                while(seglen > 0)
                {
                        int copy, delay;
                        int tmp;
                        struct sk_buff *skb;

                        /*
                         * Stop on errors
                         */
                        if (sk->err)
                        {
                                if (copied)
                                        return copied;
                                return sock_error(sk);
                        }

                        /*
                         *      Make sure that we are established.
                         */
                        if (sk->shutdown & SEND_SHUTDOWN)
                        {
                                if (copied)
                                        return copied;
                                send_sig(SIGPIPE,current,0);
                                return -EPIPE;
                        }

                        /*
                         * The following code can result in copy <= 0 if sk->mss is ever
                         * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
                         * sk->mtu is constant once SYN processing is finished.  I.e. we
                         * had better not get here until we've seen his SYN and at least one
                         * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
                         * But ESTABLISHED should guarantee that.  sk->max_window is by definition
                         * non-decreasing.  Note that any ioctl to set user_mss must be done
                         * before the exchange of SYN's.  If the initial ack from the other
                         * end has a window of 0, max_window and thus mss will both be 0.
                         */
 
                        /*
                         *      Now we need to check if we have a half built packet.
                         */
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
                        /*
                         *      Really, we should rebuild all the queues...
                         *      It's difficult. Temporary hack is to send all
                         *      queued segments with allowed fragmentation.
                         */
                        {
                                /*
                                 *      new_mss may be zero. That indicates
                                 *      we don't have a window estimate for
                                 *      the remote box yet.
                                 *              -- AC
                                 */

                                int new_mss = min(sk->mtu, sk->max_window);
                                if (new_mss && new_mss < sk->mss)
                                {
                                        tcp_send_partial(sk);
                                        sk->mss = new_mss;
                                }
                        }
#endif

                        /*
                         *      If there is a partly filled frame we can fill
                         *      out.
                         */
                        if ((skb = tcp_dequeue_partial(sk)) != NULL)
                        {
                                int tcp_size;

                                tcp_size = skb->tail - (unsigned char *)(skb->h.th + 1);

                                /* Add more stuff to the end of skb->len */
                                if (!(flags & MSG_OOB))
                                {
                                        copy = min(sk->mss - tcp_size, seglen);

                                        /*
                                         *      Now we may find the frame is as big, or too
                                         *      big for our MSS. That's all fine. It means the
                                         *      MSS shrank (from an ICMP) after we allocated
                                         *      this frame.
                                         */
 
                                        if (tcp_size >= sk->mss)
                                        {
                                                /*
                                                 *      Send the now forced complete frame out.
                                                 *
                                                 *      Note for 2.1: The MSS reduce code ought to
                                                 *      flush any frames in partial that are now
                                                 *      full sized. Not serious, potential tiny
                                                 *      performance hit.
                                                 */
                                                tcp_send_skb(sk,skb);
                                                /*
                                                 *      Get a new buffer and try again.
                                                 */
                                                continue;
                                        }
                                        /*
                                         *      Otherwise continue to fill the buffer.
                                         */
                                        tcp_size += copy;
                                        memcpy_fromfs(skb_put(skb,copy), from, copy);
                                        skb->csum = csum_partial(skb->tail - tcp_size, tcp_size, 0);
                                        from += copy;
                                        copied += copy;
                                        len -= copy;
                                        sk->write_seq += copy;
                                        seglen -= copy;
                                }
                                /* If we have a full packet or a new OOB
                                 * message, we have to force this packet out.
                                 */
                                if (tcp_size >= sk->mss || (flags & MSG_OOB))
                                        tcp_send_skb(sk, skb);
                                else
                                        tcp_enqueue_partial(skb, sk);
                                continue;
                        }

                /*
1278
                 * We also need to worry about the window.
1279
                 * If window < 1/2 the maximum window we've seen from this
1280
                 *   host, don't use it.  This is sender side
1281
                 *   silly window prevention, as specified in RFC1122.
1282
                 *   (Note that this is different than earlier versions of
1283
                 *   SWS prevention, e.g. RFC813.).  What we actually do is
1284
                 *   use the whole MSS.  Since the results in the right
1285
                 *   edge of the packet being outside the window, it will
1286
                 *   be queued for later rather than sent.
1287
                 */
1288
 
1289
                        copy = sk->window_seq - sk->write_seq;
1290
                        if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1291
                                copy = sk->mss;
1292
                        if (copy > seglen)
1293
                                copy = seglen;
1294
                        if (copy <= 0)
1295
                        {
1296
                                printk(KERN_CRIT "TCP: **bug**: copy=%d, sk->mss=%d\n", copy, sk->mss);
1297
                                return -EFAULT;
1298
                        }
1299
 
1300
                        /*
1301
                         *      We should really check the window here also.
1302
                         */
1303
 
1304
                        delay = 0;
1305
                        tmp = copy + sk->prot->max_header + 15;
1306
                        /* If won't fill the current packet, and it's not an OOB message,
1307
                         * then we might want to delay to allow data in the later parts
1308
                         * of iov to fill this packet out. Note that if we aren't
1309
                         * Nagling or there are no packets currently out then the top
1310
                         * level code in tcp_sendmsg() will force any partial packets out
1311
                         * after we finish building the largest packets this write allows.
1312
                         */
1313
                        if (copy < sk->mss && !(flags & MSG_OOB)) {
1314
                                tmp = tmp - copy + sk->mtu + 128;
1315
                                delay = 1;
1316
                        }
1317
                        skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);
1318
 
1319
                        /*
1320
                         *      If we didn't get any memory, we need to sleep.
1321
                         */
1322
 
1323
                        if (skb == NULL)
1324
                        {
1325
                                sk->socket->flags |= SO_NOSPACE;
1326
                                if (nonblock)
1327
                                {
1328
                                        if (copied)
1329
                                                return copied;
1330
                                        return -EAGAIN;
1331
                                }
1332
 
1333
                                if (current->signal & ~current->blocked)
1334
                                {
1335
                                        if (copied)
1336
                                                return copied;
1337
                                        return -ERESTARTSYS;
1338
                                }
1339
 
1340
                                wait_for_tcp_memory(sk);
1341
                                continue;
1342
                        }
1343
 
1344
                        skb->sk = sk;
1345
                        skb->free = 0;
1346
                        skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1347
 
1348
                        /*
1349
                         * FIXME: we need to optimize this.
1350
                         * Perhaps some hints here would be good.
1351
                         */
1352
 
1353
                        tmp = sk->prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1354
                                 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1355
                        if (tmp < 0 )
1356
                        {
1357
                                sock_wfree(sk, skb);
1358
                                if (copied)
1359
                                        return(copied);
1360
                                return(tmp);
1361
                        }
1362
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
1363
                        skb->ip_hdr->frag_off |= htons(IP_DF);
1364
#endif
1365
                        skb->dev = dev;
1366
                        skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
1367
                        tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
1368
                        if (tmp < 0)
1369
                        {
1370
                                sock_wfree(sk, skb);
1371
                                if (copied)
1372
                                        return(copied);
1373
                                return(tmp);
1374
                        }
1375
 
1376
                        if (flags & MSG_OOB)
1377
                        {
1378
                                skb->h.th->urg = 1;
1379
                                skb->h.th->urg_ptr = ntohs(copy);
1380
                        }
1381
 
1382
                        skb->csum = csum_partial_copy_fromuser(from,
1383
                                skb_put(skb,copy), copy, 0);
1384
 
1385
                        from += copy;
1386
                        copied += copy;
1387
                        len -= copy;
1388
                        seglen -= copy;
1389
                        skb->free = 0;
1390
                        sk->write_seq += copy;
1391
 
1392
                        if (delay)
1393
                        {
1394
                                tcp_enqueue_partial(skb, sk);
1395
                                continue;
1396
                        }
1397
                        tcp_send_skb(sk, skb);
1398
                }
1399
        }
1400
        sk->err = 0;
1401
 
1402
        return copied;
1403
}
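
/*
 * A minimal sketch (not part of the original driver, compiled out below):
 * the sender-side SWS avoidance used above, restated as a hypothetical
 * standalone helper.  Plain parameters stand in for sk->window_seq,
 * sk->write_seq, sk->max_window and sk->mss.  If the usable window is
 * closed, smaller than half the largest window the peer ever offered,
 * or larger than one MSS, we fall back to a full MSS; a segment whose
 * right edge then lands outside the window is queued rather than sent.
 */
#if 0
static int sws_chunk_size(unsigned long window_seq, unsigned long write_seq,
                          int max_window, int mss, int seglen)
{
        long usable = (long)(window_seq - write_seq);

        if (usable <= 0 || usable < (max_window >> 1) || usable > mss)
                usable = mss;           /* don't nibble at a silly window */
        if (usable > seglen)
                usable = seglen;        /* never more than this write holds */
        return (int)usable;
}
#endif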


static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
          int len, int nonblock, int flags)
{
        int retval = -EINVAL;

        /*
         *      Do sanity checking for sendmsg/sendto/send
         */

        if (flags & ~(MSG_OOB|MSG_DONTROUTE))
                goto out;
        if (msg->msg_name) {
                struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;

                if (msg->msg_namelen < sizeof(*addr))
                        goto out;
                if (addr->sin_family && addr->sin_family != AF_INET)
                        goto out;
                retval = -ENOTCONN;
                if(sk->state == TCP_CLOSE)
                        goto out;
                retval = -EISCONN;
                if (addr->sin_port != sk->dummy_th.dest)
                        goto out;
                if (addr->sin_addr.s_addr != sk->daddr)
                        goto out;
        }

        lock_sock(sk);
        retval = do_tcp_sendmsg(sk, msg->msg_iovlen, msg->msg_iov, len, nonblock, flags);

/*
 *      Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *      interactive fast network servers. It's meant to be on and
 *      it really improves the throughput though not the echo time
 *      on my slow slip link - Alan
 *
 *      If not nagling we can send on the before case too..
 */

        if (sk->partial) {
                if (!sk->packets_out ||
                    (sk->nonagle && before(sk->write_seq , sk->window_seq))) {
                        tcp_send_partial(sk);
                }
        }

        release_sock(sk);

out:
        return retval;
}
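
/*
 * A minimal sketch (not part of the original driver, compiled out below):
 * the Nagle push decision made at the end of tcp_sendmsg() above, as a
 * hypothetical predicate over plain values.  A queued partial segment is
 * flushed when nothing is in flight, or when Nagle is off (TCP_NODELAY)
 * and the send window is still open; otherwise it waits for an ACK.
 */
#if 0
static int should_push_partial(int packets_out, int nonagle,
                               unsigned long write_seq,
                               unsigned long window_seq)
{
        if (!packets_out)
                return 1;       /* nothing in flight: Nagle permits a send */
        if (nonagle && (long)(write_seq - window_seq) < 0)
                return 1;       /* TCP_NODELAY set and window still open */
        return 0;               /* hold the partial segment back */
}
#endif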


/*
 *      Send an ack if one is backlogged at this point.
 */

void tcp_read_wakeup(struct sock *sk)
{
        if (!sk->ack_backlog)
                return;

        /*
         * If we're closed, don't send an ack, or we'll get a RST
         * from the closed destination.
         */
        if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
                return;

        tcp_send_ack(sk);
}


/*
 *      Handle reading urgent data. BSD has very simple semantics for
 *      this, no blocking and very strange errors 8)
 */

static int tcp_recv_urg(struct sock * sk, int nonblock,
             struct msghdr *msg, int len, int flags, int *addr_len)
{
        /*
         *      No URG data to read
         */
        if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
                return -EINVAL; /* Yes this is right ! */

        if (sk->err)
                return sock_error(sk);

        if (sk->state == TCP_CLOSE || sk->done)
        {
                if (!sk->done)
                {
                        sk->done = 1;
                        return 0;
                }
                return -ENOTCONN;
        }

        if (sk->shutdown & RCV_SHUTDOWN)
        {
                sk->done = 1;
                return 0;
        }
        lock_sock(sk);
        if (sk->urg_data & URG_VALID)
        {
                char c = sk->urg_data;
                if (!(flags & MSG_PEEK))
                        sk->urg_data = URG_READ;
                memcpy_toiovec(msg->msg_iov, &c, 1);
                if(msg->msg_name)
                {
                        struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
                        sin->sin_family=AF_INET;
                        sin->sin_addr.s_addr=sk->daddr;
                        sin->sin_port=sk->dummy_th.dest;
                }
                if(addr_len)
                        *addr_len=sizeof(struct sockaddr_in);
                release_sock(sk);
                return 1;
        }
        release_sock(sk);

        /*
         * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
         * the available implementations agree in this case:
         * this call should never block, independent of the
         * blocking state of the socket.
         * Mike <pall@rz.uni-karlsruhe.de>
         */
        return -EAGAIN;
}
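
/*
 * A minimal sketch (not part of the original driver, compiled out below):
 * the BSD urgent-data semantics above as seen from user space.  The call
 * never blocks: it yields exactly one byte when urgent data is pending,
 * fails with EINVAL when there is none (or SO_OOBINLINE is set), and with
 * EAGAIN when the urgent mark is known but the byte has not arrived yet.
 * 'fd' is assumed to be a connected TCP socket.
 */
#if 0
#include <sys/socket.h>
#include <errno.h>

static int read_oob_byte(int fd, unsigned char *out)
{
        long n = recv(fd, out, 1, MSG_OOB);

        if (n == 1)
                return 0;       /* got the urgent byte */
        if (n < 0 && (errno == EINVAL || errno == EAGAIN))
                return -1;      /* no urgent byte available right now */
        return -1;              /* connection error or EOF */
}
#endif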

/*
 *      Release a skb if it is no longer needed. This routine
 *      must be called with interrupts disabled or with the
 *      socket locked so that the sk_buff queue operation is ok.
 */

static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
{
        skb->sk = sk;
        __skb_unlink(skb, &sk->receive_queue);
        kfree_skb(skb, FREE_READ);
}

/*
 *      FIXME:
 *      This routine frees used buffers.
 *      It should consider sending an ACK to let the
 *      other end know we now have a bigger window.
 */

static void cleanup_rbuf(struct sock *sk)
{
        /*
         * NOTE! The socket must be locked, so that we don't get
         * a messed-up receive queue.
         */
        while (!skb_queue_empty(&sk->receive_queue)) {
                struct sk_buff *skb = sk->receive_queue.next;
                if (!skb->used || skb->users)
                        break;
                tcp_eat_skb(sk, skb);
        }

        /*
         * Tell the world if we raised the window.
         */
        if (tcp_raise_window(sk))
                tcp_send_ack(sk);
}


/*
 *      This routine copies from a sock struct into the user buffer.
 */

static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
        int len, int nonblock, int flags, int *addr_len)
{
        struct wait_queue wait = { current, NULL };
        int copied = 0;
        u32 peek_seq;
        volatile u32 *seq;      /* So gcc doesn't overoptimise */
        unsigned long used;

        /*
         *      This error should be checked.
         */

        if (sk->state == TCP_LISTEN)
                return -ENOTCONN;

        /*
         *      Urgent data needs to be handled specially.
         */

        if (flags & MSG_OOB)
                return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

        /*
         *      Copying sequence to update. This is volatile to handle
         *      the multi-reader case neatly (memcpy_to/fromfs might be
         *      inline and thus not flush cached variables otherwise).
         */

        peek_seq = sk->copied_seq;
        seq = &sk->copied_seq;
        if (flags & MSG_PEEK)
                seq = &peek_seq;

        add_wait_queue(sk->sleep, &wait);
        lock_sock(sk);
        while (len > 0)
        {
                struct sk_buff * skb;
                u32 offset;

                /*
                 * Are we at urgent data? Stop if we have read anything.
                 */

                if (copied && sk->urg_data && sk->urg_seq == *seq)
                        break;

                /*
                 * We need to check signals first, to get correct SIGURG
                 * handling.
                 */
                if (current->signal & ~current->blocked) {
                        if (copied)
                                break;
                        copied = -ERESTARTSYS;
                        if (nonblock)
                                copied = -EAGAIN;
                        break;
                }

                /*
                 *      Next get a buffer.
                 */

                current->state = TASK_INTERRUPTIBLE;

                skb = sk->receive_queue.next;
                while (skb != (struct sk_buff *)&sk->receive_queue)
                {
                        if (before(*seq, skb->seq))
                                break;
                        offset = *seq - skb->seq;
                        if (skb->h.th->syn)
                                offset--;
                        if (offset < skb->len)
                                goto found_ok_skb;
                        if (skb->h.th->fin)
                                goto found_fin_ok;
                        if (!(flags & MSG_PEEK))
                                skb->used = 1;
                        skb = skb->next;
                }

                if (copied)
                        break;

                if (sk->err && !(flags&MSG_PEEK))
                {
                        copied = sock_error(sk);
                        break;
                }

                if (sk->state == TCP_CLOSE)
                {
                        if (!sk->done)
                        {
                                sk->done = 1;
                                break;
                        }
                        copied = -ENOTCONN;
                        break;
                }

                if (sk->shutdown & RCV_SHUTDOWN)
                {
                        sk->done = 1;
                        break;
                }

                if (nonblock)
                {
                        copied = -EAGAIN;
                        break;
                }

                cleanup_rbuf(sk);
                release_sock(sk);
                sk->socket->flags |= SO_WAITDATA;
                schedule();
                sk->socket->flags &= ~SO_WAITDATA;
                lock_sock(sk);
                continue;

        found_ok_skb:
                /*
                 *      Lock the buffer. We can be fairly relaxed as
                 *      an interrupt will never steal a buffer we are
                 *      using unless I've missed something serious in
                 *      tcp_data.
                 */

                skb->users++;

                /*
                 *      Ok so how much can we use ?
                 */

                used = skb->len - offset;
                if (len < used)
                        used = len;
                /*
                 *      Do we have urgent data here?
                 */

                if (sk->urg_data)
                {
                        u32 urg_offset = sk->urg_seq - *seq;
                        if (urg_offset < used)
                        {
                                if (!urg_offset)
                                {
                                        if (!sk->urginline)
                                        {
                                                ++*seq;
                                                offset++;
                                                used--;
                                        }
                                }
                                else
                                        used = urg_offset;
                        }
                }

                /*
                 *      Copy it - We _MUST_ update *seq first so that we
                 *      don't ever double read when we have dual readers
                 */

                *seq += used;

                /*
                 *      This memcpy_tofs can sleep. If it sleeps and we
                 *      do a second read it relies on the skb->users to avoid
                 *      a crash when cleanup_rbuf() gets called.
                 */

                memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
                        skb->h.th->doff*4 + offset, used);
                copied += used;
                len -= used;

                /*
                 *      We now will not sleep again until we are finished
                 *      with skb. Sorry if you are doing the SMP port
                 *      but you'll just have to fix it neatly ;)
                 */

                skb->users--;

                if (after(sk->copied_seq,sk->urg_seq))
                        sk->urg_data = 0;
                if (used + offset < skb->len)
                        continue;

                /*
                 *      Process the FIN.
                 */

                if (skb->h.th->fin)
                        goto found_fin_ok;
                if (flags & MSG_PEEK)
                        continue;
                skb->used = 1;
                if (!skb->users)
                        tcp_eat_skb(sk, skb);
                continue;

        found_fin_ok:
                ++*seq;
                if (flags & MSG_PEEK)
                        break;

                /*
                 *      All is done
                 */

                skb->used = 1;
                sk->shutdown |= RCV_SHUTDOWN;
                break;

        }

        if(copied>0 && msg->msg_name)
        {
                struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
                sin->sin_family=AF_INET;
                sin->sin_addr.s_addr=sk->daddr;
                sin->sin_port=sk->dummy_th.dest;
        }
        if(addr_len)
                *addr_len=sizeof(struct sockaddr_in);

        remove_wait_queue(sk->sleep, &wait);
        current->state = TASK_RUNNING;

        /* Clean up data we have read: This will do ACK frames */
        cleanup_rbuf(sk);
        release_sock(sk);
        return copied;
}
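
/*
 * A minimal sketch (not part of the original driver, compiled out below):
 * the urgent-mark clipping done inside the receive loop above, isolated
 * into a hypothetical helper.  'used' is how many bytes we were about to
 * copy and 'urg_offset' the distance from the read pointer to the urgent
 * byte.  If the mark falls inside the copy we either stop just short of
 * it, or, when sitting exactly on it with OOB data not inline, skip the
 * urgent byte (the caller then advances its sequence pointer by one,
 * as the loop above does with ++*seq).
 */
#if 0
static unsigned long clip_at_urgent_mark(unsigned long used,
                                         unsigned long urg_offset,
                                         int urginline, int *skip_urg_byte)
{
        *skip_urg_byte = 0;
        if (urg_offset >= used)
                return used;            /* mark beyond this copy: unchanged */
        if (urg_offset == 0) {
                if (!urginline) {
                        *skip_urg_byte = 1;     /* step over the mark... */
                        used--;                 /* ...and copy one byte less */
                }
                return used;
        }
        return urg_offset;              /* copy only up to the mark */
}
#endif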



/*
 *      State processing on a close. This implements the state shift for
 *      sending our FIN frame. Note that we only send a FIN for some
 *      states. A shutdown() may have already sent the FIN, or we may be
 *      closed.
 */

static int tcp_close_state(struct sock *sk, int dead)
{
        int ns=TCP_CLOSE;
        int send_fin=0;
        switch(sk->state)
        {
                case TCP_SYN_SENT:      /* No SYN back, no FIN needed */
                        break;
                case TCP_SYN_RECV:
                case TCP_ESTABLISHED:   /* Closedown begin */
                        ns=TCP_FIN_WAIT1;
                        send_fin=1;
                        break;
                case TCP_FIN_WAIT1:     /* Already closing, or FIN sent: no change */
                case TCP_FIN_WAIT2:
                case TCP_CLOSING:
                        ns=sk->state;
                        break;
                case TCP_CLOSE:
                case TCP_LISTEN:
                        break;
                case TCP_LAST_ACK:      /* Could have shutdown() then close().
                                           Be careful not to send a double FIN. */
                        ns=TCP_LAST_ACK;
                        break;
                case TCP_CLOSE_WAIT:    /* They have FIN'd us. We send our FIN and
                                           wait only for the ACK */
                        ns=TCP_LAST_ACK;
                        send_fin=1;
        }

        tcp_set_state(sk,ns);

        /*
         *      This is a (useful) BSD violation of the RFC. There is a
         *      problem with TCP as specified in that the other end could
         *      keep a socket open forever with no application left this end.
         *      We use a 3 minute timeout (about the same as BSD) then kill
         *      our end. If they send after that then tough - BUT: long enough
         *      that we won't make the old 4*rto = almost no time - whoops
         *      reset mistake.
         */
        if(dead && ns==TCP_FIN_WAIT2)
        {
                int timer_active=del_timer(&sk->timer);
                if(timer_active)
                        add_timer(&sk->timer);
                else
                        tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
        }

        return send_fin;
}
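
/*
 * A minimal sketch (not part of the original driver, compiled out below):
 * the close-time state shift of the switch above written out as a table.
 * Each row maps the current state to (next state, whether a FIN must be
 * sent).  TCP_SYN_SENT, TCP_CLOSE and TCP_LISTEN drop straight to
 * TCP_CLOSE with no FIN on the wire.
 */
#if 0
struct close_shift {
        int from;       /* current sk->state */
        int to;         /* state entered on close()/shutdown() */
        int send_fin;   /* 1 if a FIN segment must be transmitted */
};

static const struct close_shift close_table[] = {
        { TCP_SYN_SENT,    TCP_CLOSE,     0 },  /* no SYN back, no FIN needed */
        { TCP_SYN_RECV,    TCP_FIN_WAIT1, 1 },  /* closedown begins */
        { TCP_ESTABLISHED, TCP_FIN_WAIT1, 1 },
        { TCP_FIN_WAIT1,   TCP_FIN_WAIT1, 0 },  /* FIN already sent: no change */
        { TCP_FIN_WAIT2,   TCP_FIN_WAIT2, 0 },
        { TCP_CLOSING,     TCP_CLOSING,   0 },
        { TCP_LAST_ACK,    TCP_LAST_ACK,  0 },  /* never send a double FIN */
        { TCP_CLOSE_WAIT,  TCP_LAST_ACK,  1 },  /* they FIN'd us: FIN, await ACK */
        { TCP_CLOSE,       TCP_CLOSE,     0 },
        { TCP_LISTEN,      TCP_CLOSE,     0 },
};
#endif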

/*
 *      Shutdown the sending side of a connection. Much like close except
 *      that we don't shut down the receiving side or set sk->dead.
 */

void tcp_shutdown(struct sock *sk, int how)
{
        /*
         *      We need to grab some memory, and put together a FIN,
         *      and then put it into the queue to be sent.
         *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
         */

        if (!(how & SEND_SHUTDOWN))
                return;

        /*
         *      If we've already sent a FIN, or we're in a closed state,
         *      there is nothing to do.
         */

        if (sk->state == TCP_FIN_WAIT1 ||
            sk->state == TCP_FIN_WAIT2 ||
            sk->state == TCP_CLOSING ||
            sk->state == TCP_LAST_ACK ||
            sk->state == TCP_TIME_WAIT ||
            sk->state == TCP_CLOSE ||
            sk->state == TCP_LISTEN
          )
        {
                return;
        }
        lock_sock(sk);

        /*
         * Flag that the sender has shut down.
         */

        sk->shutdown |= SEND_SHUTDOWN;

        /*
         *  Clear out any half-completed packets.
         */

        if (sk->partial)
                tcp_send_partial(sk);

        /*
         *      FIN if needed
         */

        if (tcp_close_state(sk,0))
                tcp_send_fin(sk);

        release_sock(sk);
}


/*
 *      Return 1 if we still have things to send in our buffers.
 */

static inline int closing(struct sock * sk)
{
        switch (sk->state) {
                case TCP_FIN_WAIT1:
                case TCP_CLOSING:
                case TCP_LAST_ACK:
                        return 1;
        }
        return 0;
}


static void tcp_close(struct sock *sk, unsigned long timeout)
{
        struct sk_buff *skb;

        /*
         * We need to grab some memory, and put together a FIN,
         * and then put it into the queue to be sent.
         */

        lock_sock(sk);

        if(sk->state == TCP_LISTEN)
        {
                /*
                 *      Special case
                 */
                tcp_set_state(sk, TCP_CLOSE);
                /*
                 *      Our children must die before we do now that
                 *      sk->listening exists. It was right anyway but
                 *      don't break this assumption.
                 */
                tcp_close_pending(sk);
                release_sock(sk);
                sk->dead = 1;
                tcp_v4_unhash(sk);
                return;
        }

        sk->keepopen = 1;
        sk->shutdown = SHUTDOWN_MASK;

        if (!sk->dead)
                sk->state_change(sk);

        /*
         *  We need to flush the recv. buffs.  We do this only on the
         *  descriptor close, not protocol-sourced closes, because the
         *  reader process may not have drained the data yet!
         */

        while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
                kfree_skb(skb, FREE_READ);

        /*
         *      Get rid of any half-completed packets.
         */

        if (sk->partial)
                tcp_send_partial(sk);

        /*
         *      Timeout is not the same thing - however the code likes
         *      to send both the same way (sigh).
         */

        if (tcp_close_state(sk,1)==1)
        {
                tcp_send_fin(sk);
        }

        if (timeout) {
                cli();
                release_sock(sk);
                current->timeout = timeout;
                while(closing(sk) && current->timeout)
                {
                        interruptible_sleep_on(sk->sleep);
                        if (current->signal & ~current->blocked)
                        {
                                break;
                        }
                }
                current->timeout=0;
                lock_sock(sk);
                sti();
        }

        /* Now that the socket is dead, if we are in the FIN_WAIT2 state
         * we may need to set up a timer.
         */
        if (sk->state==TCP_FIN_WAIT2)
        {
                int timer_active=del_timer(&sk->timer);
                if(timer_active)
                        add_timer(&sk->timer);
                else
                        tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
        }

        release_sock(sk);

        if(sk->state == TCP_CLOSE)
                tcp_v4_unhash(sk);

        sk->dead = 1;
}


/*
 * Wait for an incoming connection, avoid race
 * conditions. This must be called with the socket
 * locked.
 */
static struct sk_buff * wait_for_connect(struct sock * sk)
{
        struct wait_queue wait = { current, NULL };
        struct sk_buff * skb = NULL;

        add_wait_queue(sk->sleep, &wait);
        for (;;) {
                current->state = TASK_INTERRUPTIBLE;
                end_bh_atomic();
                release_sock(sk);
                schedule();
                lock_sock(sk);
                start_bh_atomic();
                skb = tcp_find_established(sk);
                if (skb)
                        break;
                if (current->signal & ~current->blocked)
                        break;
        }
        remove_wait_queue(sk->sleep, &wait);
        return skb;
}

/*
 *      This will accept the next outstanding connection.
 *
 *      Be careful about race conditions here - this is subtle.
 */

static struct sock *tcp_accept(struct sock *sk, int flags)
{
        int error;
        struct sk_buff *skb;
        struct sock *newsk = NULL;

        /*
         * We need to make sure that this socket is listening,
         * and that it has something pending.
         */

        error = EINVAL;
        if (sk->state != TCP_LISTEN)
                goto no_listen;

        lock_sock(sk);
        start_bh_atomic();

        skb = tcp_find_established(sk);
        if (skb) {
got_new_connect:
                __skb_unlink(skb, &sk->receive_queue);
                newsk = skb->sk;
                kfree_skb(skb, FREE_READ);
                sk->ack_backlog--;
                error = 0;
out:
                end_bh_atomic();
                release_sock(sk);
no_listen:
                sk->err = error;
                return newsk;
        }

        error = EAGAIN;
        if (flags & O_NONBLOCK)
                goto out;
        skb = wait_for_connect(sk);
        if (skb)
                goto got_new_connect;
        error = ERESTARTSYS;
        goto out;
}

/*
 * Check that a TCP address is unique, don't allow multiple
 * connects to/from the same address
 */
static int tcp_unique_address(u32 saddr, u16 snum, u32 daddr, u16 dnum)
{
        int retval = 1, hashent = tcp_hashfn(saddr, snum, daddr, dnum);
        struct sock * sk;

        /* Make sure we are allowed to connect here.
         * But freeze the hash while we snoop around.
         */
        SOCKHASH_LOCK();
        sk = tcp_established_hash[hashent];
        for (; sk != NULL; sk = sk->next) {
                if(sk->daddr            == daddr                && /* remote address */
                   sk->dummy_th.dest    == dnum                 && /* remote port */
                   sk->num              == snum                 && /* local port */
                   sk->saddr            == saddr) {                /* local address */
                        retval = 0;
                        break;
                }
        }
        SOCKHASH_UNLOCK();
        return retval;
}


/*
 *      This will initiate an outgoing connection.
 */

static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
        struct sk_buff *buff;
        struct device *dev=NULL;
        unsigned char *ptr;
        int tmp;
        int atype;
        struct tcphdr *t1;
        struct rtable *rt;

        if (sk->state != TCP_CLOSE)
                return(-EISCONN);

        /*
         *      Don't allow a double connect.
         */

        if(sk->daddr)
                return -EINVAL;

        if (addr_len < 8)
                return(-EINVAL);

        if (usin->sin_family && usin->sin_family != AF_INET)
                return(-EAFNOSUPPORT);

        /*
         *      connect() to INADDR_ANY means loopback (BSD'ism).
         */

        if (usin->sin_addr.s_addr==INADDR_ANY)
                usin->sin_addr.s_addr=ip_my_addr();

        /*
         *      Don't want a TCP connection going to a broadcast address
         */

        if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
                return -ENETUNREACH;

        if (!tcp_unique_address(sk->saddr, sk->num, usin->sin_addr.s_addr, usin->sin_port))
                return -EADDRNOTAVAIL;

        lock_sock(sk);
        sk->daddr = usin->sin_addr.s_addr;

        sk->rcv_ack_cnt = 1;
        sk->err = 0;
        sk->dummy_th.dest = usin->sin_port;

        buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
        if (buff == NULL)
        {
                release_sock(sk);
                return(-ENOMEM);
        }
        buff->sk = sk;
        buff->free = 0;
        buff->localroute = sk->localroute;

        /* If this socket is bound to a particular device, make sure we use it. */
        dev = sk->bound_device;

        /*
         *      Put in the IP header and routing stuff.
         */

        tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
                IPPROTO_TCP, sk->opt, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
        if (tmp < 0)
        {
                sock_wfree(sk, buff);
                release_sock(sk);
                return(-ENETUNREACH);
        }
        if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
                sk->saddr = rt->rt_src;
        sk->rcv_saddr = sk->saddr;

        /*
         * Set up our outgoing TCP sequence number
         */
        sk->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
                                                   sk->dummy_th.source,
                                                   usin->sin_port);
        sk->window_seq = sk->write_seq;
        sk->rcv_ack_seq = sk->write_seq -1;

        t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));

        memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
        buff->seq = sk->write_seq++;
        t1->seq = htonl(buff->seq);
        sk->sent_seq = sk->write_seq;
        buff->end_seq = sk->write_seq;
        t1->ack = 0;
        t1->window = 2;
        t1->syn = 1;
        t1->doff = 6;
        /* use 512 or whatever user asked for */

        if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
                sk->window_clamp=rt->rt_window;
        else
                sk->window_clamp=0;

        if (sk->user_mss)
                sk->mtu = sk->user_mss;
        else if (rt)
                sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
        else
                sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

        /*
         *      but not bigger than device MTU
         */

        sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

        /* Must check it here, just to be absolutely safe.  If we end up
         * with an sk->mtu of zero, we can thus end up with an sk->mss
         * of zero, which causes us to bomb out in tcp_do_sendmsg. -DaveM
         */
        if(sk->mtu < 32)
                sk->mtu = 32;   /* Sanity limit */

        /*
         *      Put in the TCP options to say MTU.
         */

        ptr = skb_put(buff,4);
        ptr[0] = 2;
        ptr[1] = 4;
        ptr[2] = (sk->mtu) >> 8;
        ptr[3] = (sk->mtu) & 0xff;
        buff->csum = csum_partial(ptr, 4, 0);
        tcp_send_check(t1, sk->saddr, sk->daddr,
                  sizeof(struct tcphdr) + 4, buff);

        tcp_set_state(sk,TCP_SYN_SENT);

        /* Socket identity change complete, no longer
         * in TCP_CLOSE, so rehash.
         */
        tcp_v4_rehash(sk);

        if(rt&&rt->rt_flags&RTF_IRTT)
                sk->rto = rt->rt_irtt;
        else
                sk->rto = TCP_TIMEOUT_INIT;
        sk->delack_timer.function = tcp_delack_timer;
        sk->delack_timer.data = (unsigned long) sk;
        sk->retransmit_timer.function = tcp_retransmit_timer;
        sk->retransmit_timer.data = (unsigned long)sk;
        sk->retransmits = 0;
        sk->prot->queue_xmit(sk, dev, buff, 0);
        tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
        tcp_statistics.TcpActiveOpens++;
        tcp_statistics.TcpOutSegs++;

        release_sock(sk);
        return(0);
}
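
/*
 * A minimal sketch (not part of the original driver, compiled out below):
 * the four option bytes appended to the SYN above, produced by a
 * hypothetical standalone encoder.  The MSS option is kind 2, length 4,
 * followed by the 16-bit MSS in network byte order -- exactly the
 * ptr[0..3] stores in tcp_connect().
 */
#if 0
static void encode_mss_option(unsigned char *ptr, unsigned short mss)
{
        ptr[0] = 2;                     /* option kind: MSS */
        ptr[1] = 4;                     /* option length: kind+len+value */
        ptr[2] = (mss >> 8) & 0xff;     /* high byte first (network order) */
        ptr[3] = mss & 0xff;            /* low byte */
}
#endif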

/*
 *      Socket option code for TCP.
 */

int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
{
        int val,err;

        if(level!=SOL_TCP)
                return ip_setsockopt(sk,level,optname,optval,optlen);

        if (optval == NULL)
                return(-EINVAL);

        err=verify_area(VERIFY_READ, optval, sizeof(int));
        if(err)
                return err;

        val = get_user((int *)optval);

        switch(optname)
        {
                case TCP_MAXSEG:
/*
 * Values greater than the interface MTU won't take effect.  However, at
 * the point when this call is made we typically don't yet know
 * which interface is going to be used.
 */
                        if(val<1||val>MAX_WINDOW)
                                return -EINVAL;
                        sk->user_mss=val;
                        return 0;
                case TCP_NODELAY:
                        sk->nonagle=(val==0)?0:1;
                        return 0;
                default:
                        return(-ENOPROTOOPT);
        }
}

int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
{
        int val,err;

        if(level!=SOL_TCP)
                return ip_getsockopt(sk,level,optname,optval,optlen);

        switch(optname)
        {
                case TCP_MAXSEG:
                        val=sk->user_mss;
                        break;
                case TCP_NODELAY:
                        val=sk->nonagle;
                        break;
                default:
                        return(-ENOPROTOOPT);
        }
        err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
        if(err)
                return err;
        put_user(sizeof(int),(int *) optlen);

        err=verify_area(VERIFY_WRITE, optval, sizeof(int));
        if(err)
                return err;
        put_user(val,(int *)optval);

        return(0);
}
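
/*
 * A minimal sketch (not part of the original driver, compiled out below):
 * exercising the two TCP-level options handled above from user space.
 * TCP_NODELAY flips sk->nonagle; TCP_MAXSEG sets sk->user_mss and, per
 * the comments earlier in this file, must be issued before connect() to
 * influence the MSS option sent on the SYN.  'fd' is assumed to be a
 * TCP socket that has not yet been connected.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static int tune_tcp_socket(int fd)
{
        int one = 1;
        int mss = 512;

        /* values outside 1..MAX_WINDOW are rejected by tcp_setsockopt() */
        if (setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) < 0)
                return -1;
        if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)) < 0)
                return -1;
        return 0;
}
#endif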


struct proto tcp_prot = {
        (struct sock *)&tcp_prot,       /* sklist_next */
        (struct sock *)&tcp_prot,       /* sklist_prev */
        tcp_close,                      /* close */
        ip_build_header,                /* build_header */
        tcp_connect,                    /* connect */
        tcp_accept,                     /* accept */
        ip_queue_xmit,                  /* queue_xmit */
        tcp_retransmit,                 /* retransmit */
        tcp_write_wakeup,               /* write_wakeup */
        tcp_read_wakeup,                /* read_wakeup */
        tcp_rcv,                        /* rcv */
        tcp_select,                     /* select */
        tcp_ioctl,                      /* ioctl */
        NULL,                           /* init */
        tcp_shutdown,                   /* shutdown */
        tcp_setsockopt,                 /* setsockopt */
        tcp_getsockopt,                 /* getsockopt */
        tcp_sendmsg,                    /* sendmsg */
        tcp_recvmsg,                    /* recvmsg */
        NULL,                           /* bind */
        tcp_v4_hash,                    /* hash */
        tcp_v4_unhash,                  /* unhash */
        tcp_v4_rehash,                  /* rehash */
        tcp_good_socknum,               /* good_socknum */
        tcp_v4_verify_bind,             /* verify_bind */
        128,                            /* max_header */
        0,                              /* retransmits */
        "TCP",                          /* name */
        0,                              /* inuse */

};
