OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories: or1k
Path: or1k/trunk/linux/linux-2.4/net/ipv4/tcp_ipv4.c (rev 1774; every line last modified in rev 1275 by phoenix)
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:     $Id: tcp_ipv4.c,v 1.1.1.1 2004-04-15 01:13:38 phoenix Exp $
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      open_request handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
 *                                      a single port at the same time.
 */

#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>

#include <net/icmp.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/inet_common.h>

#include <linux/inet.h>
#include <linux/stddef.h>
#include <linux/ipsec.h>

extern int sysctl_ip_dynaddr;
extern int sysctl_ip_default_ttl;
int sysctl_tcp_tw_reuse = 0;
int sysctl_tcp_low_latency = 0;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct inode tcp_inode;
static struct socket *tcp_socket=&tcp_inode.u.socket_i;

void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
                       struct sk_buff *skb);

/*
 * ALL members must be initialised to prevent gcc-2.7.2.3 miscompilation
 */
struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
        __tcp_ehash:          NULL,
        __tcp_bhash:          NULL,
        __tcp_bhash_size:     0,
        __tcp_ehash_size:     0,
        __tcp_listening_hash: { NULL, },
        __tcp_lhash_lock:     RW_LOCK_UNLOCKED,
        __tcp_lhash_users:    ATOMIC_INIT(0),
        __tcp_lhash_wait:
          __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
        __tcp_portalloc_lock: SPIN_LOCK_UNLOCKED
};

/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
int tcp_port_rover = (1024 - 1);

static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
                                 __u32 faddr, __u16 fport)
{
        int h = ((laddr ^ lport) ^ (faddr ^ fport));
        h ^= h>>16;
        h ^= h>>8;
        return h & (tcp_ehash_size - 1);
}

static __inline__ int tcp_sk_hashfn(struct sock *sk)
{
        __u32 laddr = sk->rcv_saddr;
        __u16 lport = sk->num;
        __u32 faddr = sk->daddr;
        __u16 fport = sk->dport;

        return tcp_hashfn(laddr, lport, faddr, fport);
}

/* Allocate and initialize a new TCP local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
                                          unsigned short snum)
{
        struct tcp_bind_bucket *tb;

        tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
        if(tb != NULL) {
                tb->port = snum;
                tb->fastreuse = 0;
                tb->owners = NULL;
                if((tb->next = head->chain) != NULL)
                        tb->next->pprev = &tb->next;
                head->chain = tb;
                tb->pprev = &head->chain;
        }
        return tb;
}

/* Caller must disable local BH processing. */
static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
{
        struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)];
        struct tcp_bind_bucket *tb;

        spin_lock(&head->lock);
        tb = (struct tcp_bind_bucket *)sk->prev;
        if ((child->bind_next = tb->owners) != NULL)
                tb->owners->bind_pprev = &child->bind_next;
        tb->owners = child;
        child->bind_pprev = &tb->owners;
        child->prev = (struct sock *) tb;
        spin_unlock(&head->lock);
}

inline void tcp_inherit_port(struct sock *sk, struct sock *child)
{
        local_bh_disable();
        __tcp_inherit_port(sk, child);
        local_bh_enable();
}

static inline void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, unsigned short snum)
{
        sk->num = snum;
        if ((sk->bind_next = tb->owners) != NULL)
                tb->owners->bind_pprev = &sk->bind_next;
        tb->owners = sk;
        sk->bind_pprev = &tb->owners;
        sk->prev = (struct sock *) tb;
}

static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
{
        struct sock *sk2 = tb->owners;
        int sk_reuse = sk->reuse;

        for( ; sk2 != NULL; sk2 = sk2->bind_next) {
                if (sk != sk2 &&
                    sk2->reuse <= 1 &&
                    !ipv6_only_sock(sk2) &&
                    (!sk->bound_dev_if ||
                     !sk2->bound_dev_if ||
                     sk->bound_dev_if == sk2->bound_dev_if)) {
                        if (!sk_reuse   ||
                            !sk2->reuse ||
                            sk2->state == TCP_LISTEN) {
                                if (!sk2->rcv_saddr     ||
                                    !sk->rcv_saddr      ||
                                    (sk2->rcv_saddr == sk->rcv_saddr))
                                        break;
                        }
                }
        }
        return sk2 != NULL;
}

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
        struct tcp_bind_hashbucket *head;
        struct tcp_bind_bucket *tb;
        int ret;

        local_bh_disable();
        if (snum == 0) {
                int low = sysctl_local_port_range[0];
                int high = sysctl_local_port_range[1];
                int remaining = (high - low) + 1;
                int rover;

                spin_lock(&tcp_portalloc_lock);
                rover = tcp_port_rover;
                do {    rover++;
                        if ((rover < low) || (rover > high))
                                rover = low;
                        head = &tcp_bhash[tcp_bhashfn(rover)];
                        spin_lock(&head->lock);
                        for (tb = head->chain; tb; tb = tb->next)
                                if (tb->port == rover)
                                        goto next;
                        break;
                next:
                        spin_unlock(&head->lock);
                } while (--remaining > 0);
                tcp_port_rover = rover;
                spin_unlock(&tcp_portalloc_lock);

                /* Exhausted local port range during search? */
                ret = 1;
                if (remaining <= 0)
                        goto fail;

                /* OK, here is the one we will use.  HEAD is
                 * non-NULL and we hold its mutex.
                 */
                snum = rover;
                tb = NULL;
        } else {
                head = &tcp_bhash[tcp_bhashfn(snum)];
                spin_lock(&head->lock);
                for (tb = head->chain; tb != NULL; tb = tb->next)
                        if (tb->port == snum)
                                break;
        }
        if (tb != NULL && tb->owners != NULL) {
                if (sk->reuse > 1)
                        goto success;
                if (tb->fastreuse > 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
                        goto success;
                } else {
                        ret = 1;
                        if (tcp_bind_conflict(sk, tb))
                                goto fail_unlock;
                }
        }
        ret = 1;
        if (tb == NULL &&
            (tb = tcp_bucket_create(head, snum)) == NULL)
                        goto fail_unlock;
        if (tb->owners == NULL) {
                if (sk->reuse && sk->state != TCP_LISTEN)
                        tb->fastreuse = 1;
                else
                        tb->fastreuse = 0;
        } else if (tb->fastreuse &&
                   ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
                tb->fastreuse = 0;
success:
        if (sk->prev == NULL)
                tcp_bind_hash(sk, tb, snum);
        BUG_TRAP(sk->prev == (struct sock *) tb);
        ret = 0;

fail_unlock:
        spin_unlock(&head->lock);
fail:
        local_bh_enable();
        return ret;
}

/* Get rid of any references to a local port held by the
 * given sock.
 */
inline void __tcp_put_port(struct sock *sk)
{
        struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)];
        struct tcp_bind_bucket *tb;

        spin_lock(&head->lock);
        tb = (struct tcp_bind_bucket *) sk->prev;
        if (sk->bind_next)
                sk->bind_next->bind_pprev = sk->bind_pprev;
        *(sk->bind_pprev) = sk->bind_next;
        sk->prev = NULL;
        sk->num = 0;
        if (tb->owners == NULL) {
                if (tb->next)
                        tb->next->pprev = tb->pprev;
                *(tb->pprev) = tb->next;
                kmem_cache_free(tcp_bucket_cachep, tb);
        }
        spin_unlock(&head->lock);
}

void tcp_put_port(struct sock *sk)
{
        local_bh_disable();
        __tcp_put_port(sk);
        local_bh_enable();
}

/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 * Look, when several writers sleep and reader wakes them up, all but one
 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines (wake up each
 * exclusive lock release). It should be ifdefed really.
 */

void tcp_listen_wlock(void)
{
        write_lock(&tcp_lhash_lock);

        if (atomic_read(&tcp_lhash_users)) {
                DECLARE_WAITQUEUE(wait, current);

                add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
                for (;;) {
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        if (atomic_read(&tcp_lhash_users) == 0)
                                break;
                        write_unlock_bh(&tcp_lhash_lock);
                        schedule();
                        write_lock_bh(&tcp_lhash_lock);
                }

                __set_current_state(TASK_RUNNING);
                remove_wait_queue(&tcp_lhash_wait, &wait);
        }
}

static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
{
        struct sock **skp;
        rwlock_t *lock;

        BUG_TRAP(sk->pprev==NULL);
        if(listen_possible && sk->state == TCP_LISTEN) {
                skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
                lock = &tcp_lhash_lock;
                tcp_listen_wlock();
        } else {
                skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
                lock = &tcp_ehash[sk->hashent].lock;
                write_lock(lock);
        }
        if((sk->next = *skp) != NULL)
                (*skp)->pprev = &sk->next;
        *skp = sk;
        sk->pprev = skp;
        sock_prot_inc_use(sk->prot);
        write_unlock(lock);
        if (listen_possible && sk->state == TCP_LISTEN)
                wake_up(&tcp_lhash_wait);
}

static void tcp_v4_hash(struct sock *sk)
{
        if (sk->state != TCP_CLOSE) {
                local_bh_disable();
                __tcp_v4_hash(sk, 1);
                local_bh_enable();
        }
}

void tcp_unhash(struct sock *sk)
{
        rwlock_t *lock;

        if (!sk->pprev)
                goto ende;

        if (sk->state == TCP_LISTEN) {
                local_bh_disable();
                tcp_listen_wlock();
                lock = &tcp_lhash_lock;
        } else {
                struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
                lock = &head->lock;
                write_lock_bh(&head->lock);
        }

        if(sk->pprev) {
                if(sk->next)
                        sk->next->pprev = sk->pprev;
                *sk->pprev = sk->next;
                sk->pprev = NULL;
                sock_prot_dec_use(sk->prot);
        }
        write_unlock_bh(lock);

 ende:
        if (sk->state == TCP_LISTEN)
                wake_up(&tcp_lhash_wait);
}

/* Don't inline this cruft.  Here are some nice properties to
 * exploit here.  The BSD API does not allow a listening TCP
 * to specify the remote port nor the remote address for the
 * connection.  So always assume those are both wildcarded
 * during the search since they can never be otherwise.
 */
static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif)
{
        struct sock *result = NULL;
        int score, hiscore;

        hiscore=-1;
        for(; sk; sk = sk->next) {
                if(sk->num == hnum && !ipv6_only_sock(sk)) {
                        __u32 rcv_saddr = sk->rcv_saddr;

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
                        score = sk->family == PF_INET ? 1 : 0;
#else
                        score = 1;
#endif
                        if(rcv_saddr) {
                                if (rcv_saddr != daddr)
                                        continue;
                                score+=2;
                        }
                        if (sk->bound_dev_if) {
                                if (sk->bound_dev_if != dif)
                                        continue;
                                score+=2;
                        }
                        if (score == 5)
                                return sk;
                        if (score > hiscore) {
                                hiscore = score;
                                result = sk;
                        }
                }
        }
        return result;
}

/* Optimize the common listener case. */
inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
{
        struct sock *sk;

        read_lock(&tcp_lhash_lock);
        sk = tcp_listening_hash[tcp_lhashfn(hnum)];
        if (sk) {
                if (sk->num == hnum &&
                    sk->next == NULL &&
                    (!sk->rcv_saddr || sk->rcv_saddr == daddr) &&
                    (sk->family == PF_INET || !ipv6_only_sock(sk)) &&
                    !sk->bound_dev_if)
                        goto sherry_cache;
                sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
        }
        if (sk) {
sherry_cache:
                sock_hold(sk);
        }
        read_unlock(&tcp_lhash_lock);
        return sk;
}

/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * Local BH must be disabled here.
 */

static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
                                                       u32 daddr, u16 hnum, int dif)
{
        struct tcp_ehash_bucket *head;
        TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
        __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
        struct sock *sk;
        int hash;

        /* Optimize here for direct hit, only listening connections can
         * have wildcards anyways.
         */
        hash = tcp_hashfn(daddr, hnum, saddr, sport);
        head = &tcp_ehash[hash];
        read_lock(&head->lock);
        for(sk = head->chain; sk; sk = sk->next) {
                if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
                        goto hit; /* You sunk my battleship! */
        }

        /* Must check for a TIME_WAIT'er before going to listener hash. */
        for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
                if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
                        goto hit;
        read_unlock(&head->lock);

        return NULL;

hit:
        sock_hold(sk);
        read_unlock(&head->lock);
        return sk;
}

static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
                                           u32 daddr, u16 hnum, int dif)
{
        struct sock *sk;

        sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);

        if (sk)
                return sk;

        return tcp_v4_lookup_listener(daddr, hnum, dif);
}

inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
{
        struct sock *sk;

        local_bh_disable();
        sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
        local_bh_enable();

        return sk;
}

static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
        return secure_tcp_sequence_number(skb->nh.iph->daddr,
                                          skb->nh.iph->saddr,
                                          skb->h.th->dest,
                                          skb->h.th->source);
}

/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
                                      struct tcp_tw_bucket **twp)
{
        u32 daddr = sk->rcv_saddr;
        u32 saddr = sk->daddr;
        int dif = sk->bound_dev_if;
        TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
        __u32 ports = TCP_COMBINED_PORTS(sk->dport, lport);
        int hash = tcp_hashfn(daddr, lport, saddr, sk->dport);
        struct tcp_ehash_bucket *head = &tcp_ehash[hash];
        struct sock *sk2, **skp;
        struct tcp_tw_bucket *tw;

        write_lock(&head->lock);

        /* Check TIME-WAIT sockets first. */
        for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
            skp = &sk2->next) {
                tw = (struct tcp_tw_bucket*)sk2;

                if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
                        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

                        /* With PAWS, it is safe from the viewpoint
                           of data integrity. Even without PAWS it
                           is safe provided sequence spaces do not
                           overlap i.e. at data rates <= 80Mbit/sec.

                           Actually, the idea is close to VJ's one,
                           only timestamp cache is held not per host,
                           but per port pair and TW bucket is used
                           as state holder.

                           If TW bucket has been already destroyed we
                           fall back to VJ's scheme and use initial
                           timestamp retrieved from peer table.
                         */
                        if (tw->ts_recent_stamp &&
                            (!twp || (sysctl_tcp_tw_reuse &&
                                      xtime.tv_sec - tw->ts_recent_stamp > 1))) {
                                if ((tp->write_seq = tw->snd_nxt+65535+2) == 0)
                                        tp->write_seq = 1;
                                tp->ts_recent = tw->ts_recent;
                                tp->ts_recent_stamp = tw->ts_recent_stamp;
                                sock_hold(sk2);
                                skp = &head->chain;
                                goto unique;
                        } else
                                goto not_unique;
                }
        }
        tw = NULL;

        /* And established part... */
        for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
                if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
                        goto not_unique;
        }

unique:
        /* Must record num and sport now. Otherwise we will see
         * in hash table socket with a funny identity. */
        sk->num = lport;
        sk->sport = htons(lport);
        BUG_TRAP(sk->pprev==NULL);
        if ((sk->next = *skp) != NULL)
                (*skp)->pprev = &sk->next;

        *skp = sk;
        sk->pprev = skp;
        sk->hashent = hash;
        sock_prot_inc_use(sk->prot);
        write_unlock(&head->lock);

        if (twp) {
                *twp = tw;
                NET_INC_STATS_BH(TimeWaitRecycled);
        } else if (tw) {
                /* Silly. Should hash-dance instead... */
                tcp_tw_deschedule(tw);
                tcp_timewait_kill(tw);
                NET_INC_STATS_BH(TimeWaitRecycled);

                tcp_tw_put(tw);
        }

        return 0;

not_unique:
        write_unlock(&head->lock);
        return -EADDRNOTAVAIL;
}

/*
 * Bind a port for a connect operation and hash it.
 */
static int tcp_v4_hash_connect(struct sock *sk)
{
        unsigned short snum = sk->num;
        struct tcp_bind_hashbucket *head;
        struct tcp_bind_bucket *tb;

        if (snum == 0) {
                int rover;
                int low = sysctl_local_port_range[0];
                int high = sysctl_local_port_range[1];
                int remaining = (high - low) + 1;
                struct tcp_tw_bucket *tw = NULL;

                local_bh_disable();

                /* TODO. Actually it is not so bad idea to remove
                 * tcp_portalloc_lock before next submission to Linus.
                 * As soon as we touch this place at all it is time to think.
                 *
                 * Now it protects single _advisory_ variable tcp_port_rover,
                 * hence it is mostly useless.
                 * Code will work nicely if we just delete it, but
                 * I am afraid in contented case it will work not better or
                 * even worse: another cpu just will hit the same bucket
                 * and spin there.
                 * So some cpu salt could remove both contention and
                 * memory pingpong. Any ideas how to do this in a nice way?
                 */
                spin_lock(&tcp_portalloc_lock);
                rover = tcp_port_rover;

                do {
                        rover++;
                        if ((rover < low) || (rover > high))
                                rover = low;
                        head = &tcp_bhash[tcp_bhashfn(rover)];
                        spin_lock(&head->lock);

                        /* Does not bother with rcv_saddr checks,
                         * because the established check is already
                         * unique enough.
                         */
                        for (tb = head->chain; tb; tb = tb->next) {
                                if (tb->port == rover) {
                                        BUG_TRAP(tb->owners != NULL);
                                        if (tb->fastreuse >= 0)
                                                goto next_port;
                                        if (!__tcp_v4_check_established(sk, rover, &tw))
                                                goto ok;
                                        goto next_port;
                                }
                        }

                        tb = tcp_bucket_create(head, rover);
                        if (!tb) {
                                spin_unlock(&head->lock);
                                break;
                        }
                        tb->fastreuse = -1;
                        goto ok;

                next_port:
                        spin_unlock(&head->lock);
                } while (--remaining > 0);
                tcp_port_rover = rover;
                spin_unlock(&tcp_portalloc_lock);

                local_bh_enable();

                return -EADDRNOTAVAIL;

        ok:
                /* All locks still held and bhs disabled */
                tcp_port_rover = rover;
                spin_unlock(&tcp_portalloc_lock);

                tcp_bind_hash(sk, tb, rover);
                if (!sk->pprev) {
                        sk->sport = htons(rover);
                        __tcp_v4_hash(sk, 0);
                }
                spin_unlock(&head->lock);

                if (tw) {
                        tcp_tw_deschedule(tw);
                        tcp_timewait_kill(tw);
                        tcp_tw_put(tw);
                }

                local_bh_enable();
                return 0;
        }

        head  = &tcp_bhash[tcp_bhashfn(snum)];
        tb  = (struct tcp_bind_bucket *)sk->prev;
        spin_lock_bh(&head->lock);
        if (tb->owners == sk && sk->bind_next == NULL) {
                __tcp_v4_hash(sk, 0);
                spin_unlock_bh(&head->lock);
                return 0;
        } else {
                int ret;
                spin_unlock(&head->lock);
                /* No definite answer... Walk to established hash table */
                ret = __tcp_v4_check_established(sk, snum, NULL);
                local_bh_enable();
                return ret;
        }
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
        struct rtable *rt;
        u32 daddr, nexthop;
        int tmp;
        int err;

        if (addr_len < sizeof(struct sockaddr_in))
                return(-EINVAL);

        if (usin->sin_family != AF_INET)
                return(-EAFNOSUPPORT);

        nexthop = daddr = usin->sin_addr.s_addr;
        if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
                if (daddr == 0)
                        return -EINVAL;
                nexthop = sk->protinfo.af_inet.opt->faddr;
        }

        tmp = ip_route_connect(&rt, nexthop, sk->saddr,
                               RT_CONN_FLAGS(sk), sk->bound_dev_if);
        if (tmp < 0)
                return tmp;

        if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        __sk_dst_set(sk, &rt->u.dst);
        sk->route_caps = rt->u.dst.dev->features;

        if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
                daddr = rt->rt_dst;

        if (!sk->saddr)
                sk->saddr = rt->rt_src;
        sk->rcv_saddr = sk->saddr;

        if (tp->ts_recent_stamp && sk->daddr != daddr) {
                /* Reset inherited state */
                tp->ts_recent = 0;
                tp->ts_recent_stamp = 0;
                tp->write_seq = 0;
        }

        if (sysctl_tcp_tw_recycle &&
            !tp->ts_recent_stamp &&
            rt->rt_dst == daddr) {
                struct inet_peer *peer = rt_get_peer(rt);

                /* VJ's idea. We save last timestamp seen from
                 * the destination in peer table, when entering state TIME-WAIT
                 * and initialize ts_recent from it, when trying new connection.
                 */

                if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
                        tp->ts_recent_stamp = peer->tcp_ts_stamp;
                        tp->ts_recent = peer->tcp_ts;
                }
        }

        sk->dport = usin->sin_port;
        sk->daddr = daddr;

        tp->ext_header_len = 0;
        if (sk->protinfo.af_inet.opt)
                tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;

        tp->mss_clamp = 536;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and not releasing socket
         * lock select source port, enter ourselves into the hash tables and
         * complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = tcp_v4_hash_connect(sk);
        if (err)
                goto failure;

        if (!tp->write_seq)
                tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
                                                           sk->sport, usin->sin_port);

        sk->protinfo.af_inet.id = tp->write_seq^jiffies;

        err = tcp_connect(sk);
        if (err)
                goto failure;

        return 0;

failure:
        tcp_set_state(sk, TCP_CLOSE);
        __sk_dst_reset(sk);
        sk->route_caps = 0;
        sk->dport = 0;
        return err;
}

static __inline__ int tcp_v4_iif(struct sk_buff *skb)
{
        return ((struct rtable*)skb->dst)->rt_iif;
}

static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
{
        return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
}

static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
                                              struct open_request ***prevp,
                                              __u16 rport,
                                              __u32 raddr, __u32 laddr)
{
        struct tcp_listen_opt *lopt = tp->listen_opt;
        struct open_request *req, **prev;

        for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
             (req = *prev) != NULL;
             prev = &req->dl_next) {
                if (req->rmt_port == rport &&
                    req->af.v4_req.rmt_addr == raddr &&
                    req->af.v4_req.loc_addr == laddr &&
                    TCP_INET_FAMILY(req->class->family)) {
                        BUG_TRAP(req->sk == NULL);
                        *prevp = prev;
                        return req;
                }
        }

        return NULL;
}

static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        struct tcp_listen_opt *lopt = tp->listen_opt;
        u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);

        req->expires = jiffies + TCP_TIMEOUT_INIT;
        req->retrans = 0;
        req->sk = NULL;
        req->dl_next = lopt->syn_table[h];

        write_lock(&tp->syn_wait_lock);
        lopt->syn_table[h] = req;
        write_unlock(&tp->syn_wait_lock);

        tcp_synq_added(sk);
}


/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
{
        struct dst_entry *dst;
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
         * send out by Linux are always <576bytes so they should go through
         * unfragmented).
         */
        if (sk->state == TCP_LISTEN)
                return;

        /* We don't check in the destentry if pmtu discovery is forbidden
         * on this route. We just assume that no packet_to_big packets
         * are send back when pmtu discovery is not active.
         * There is a small race when the user changes this flag in the
         * route, but I think that's acceptable.
         */
        if ((dst = __sk_dst_check(sk, 0)) == NULL)
                return;

        ip_rt_update_pmtu(dst, mtu);

        /* Something is about to be wrong... Remember soft error
         * for the case, if this connection will not able to recover.
         */
        if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
                sk->err_soft = EMSGSIZE;

        if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
            tp->pmtu_cookie > dst->pmtu) {
                tcp_sync_mss(sk, dst->pmtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
        struct iphdr *iph = (struct iphdr*)skb->data;
        struct tcphdr *th = (struct tcphdr*)(skb->data+(iph->ihl<<2));
        struct tcp_opt *tp;
        int type = skb->h.icmph->type;
        int code = skb->h.icmph->code;
        struct sock *sk;
        __u32 seq;
        int err;

        if (skb->len < (iph->ihl << 2) + 8) {
                ICMP_INC_STATS_BH(IcmpInErrors);
                return;
        }

        sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
        if (sk == NULL) {
                ICMP_INC_STATS_BH(IcmpInErrors);
                return;
        }
        if (sk->state == TCP_TIME_WAIT) {
                tcp_tw_put((struct tcp_tw_bucket*)sk);
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        if (sk->lock.users != 0)
                NET_INC_STATS_BH(LockDroppedIcmps);

        if (sk->state == TCP_CLOSE)
                goto out;

        tp = &sk->tp_pinfo.af_tcp;
        seq = ntohl(th->seq);
        if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS(OutOfWindowIcmps);
                goto out;
        }

        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* This is deprecated, but if someone generated it,
                 * we have no reasons to ignore it.
                 */
                if (sk->lock.users == 0)
                        tcp_enter_cwr(tp);
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        if (sk->lock.users == 0)
                                do_pmtu_discovery(sk, iph, info);
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->state) {
                struct open_request *req, **prev;
        case TCP_LISTEN:
                if (sk->lock.users != 0)
                        goto out;

                req = tcp_v4_search_req(tp, &prev,
                                        th->dest,
                                        iph->daddr, iph->saddr);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get
                   an established socket here.
                 */
                BUG_TRAP(req->sk == NULL);

                if (seq != req->snt_isn) {
                        NET_INC_STATS_BH(OutOfWindowIcmps);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                tcp_synq_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen.
                               It can f.e. if SYNs crossed.
                             */
                if (sk->lock.users == 0) {
                        TCP_INC_STATS_BH(TcpAttemptFails);
                        sk->err = err;

                        sk->error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note, that in modern internet, where routing is unreliable
         * and in each dark corner broken firewalls sit, sending random
         * errors ordered by their masters even this two messages finally lose
         * their original sense (even Linux sends invalid PORT_UNREACHs)
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
                sk->err = err;
                sk->error_report(sk);
        } else  { /* Only an error on timeout */
                sk->err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
                       struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_HW) {
                th->check = ~tcp_v4_check(th, len, sk->saddr, sk->daddr, 0);
                skb->csum = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
                                         csum_partial((char *)th, th->doff<<2, skb->csum));
        }
}

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *                    for reset.
 *      Answer: if a packet caused RST, it is not for a socket
 *              existing in our system, if it is matched to a socket,
 *              it is just duplicate segment or bug in other side's TCP.
 *              So that we build reply only basing on parameters
 *              arrived with segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sk_buff *skb)
{
        struct tcphdr *th = skb->h.th;
        struct tcphdr rth;
        struct ip_reply_arg arg;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rth, 0, sizeof(struct tcphdr));
        rth.dest = th->source;
        rth.source = th->dest;
        rth.doff = sizeof(struct tcphdr)/4;
        rth.rst = 1;

        if (th->ack) {
                rth.seq = th->ack_seq;
        } else {
                rth.ack = 1;
                rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
                                    + skb->len - (th->doff<<2));
        }

        memset(&arg, 0, sizeof arg);
        arg.iov[0].iov_base = (unsigned char *)&rth;
        arg.iov[0].iov_len  = sizeof rth;
        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
                                      skb->nh.iph->saddr, /*XXX*/
                                      sizeof(struct tcphdr),
                                      IPPROTO_TCP,
                                      0);
        arg.n_iov = 1;
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;

        tcp_socket->sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl;
        ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

        TCP_INC_STATS_BH(TcpOutSegs);
        TCP_INC_STATS_BH(TcpOutRsts);
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
{
        struct tcphdr *th = skb->h.th;
        struct {
                struct tcphdr th;
                u32 tsopt[3];
        } rep;
        struct ip_reply_arg arg;

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof arg);

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        arg.n_iov = 1;
        if (ts) {
                rep.tsopt[0] = htonl((TCPOPT_NOP << 24) |
                                     (TCPOPT_NOP << 16) |
                                     (TCPOPT_TIMESTAMP << 8) |
                                     TCPOLEN_TIMESTAMP);
                rep.tsopt[1] = htonl(tcp_time_stamp);
                rep.tsopt[2] = htonl(ts);
                arg.iov[0].iov_len = sizeof(rep);
        }

        /* Swap the send and the receive. */
        rep.th.dest = th->source;
        rep.th.source = th->dest;
        rep.th.doff = arg.iov[0].iov_len/4;
        rep.th.seq = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack = 1;
        rep.th.window = htons(win);

        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
                                      skb->nh.iph->saddr, /*XXX*/
                                      arg.iov[0].iov_len,
                                      IPPROTO_TCP,
                                      0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;

        ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(TcpOutSegs);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;

        tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt,
                        tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent);

        tcp_tw_put(tw);
}

static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
{
        tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd,
                        req->ts_recent);
}

static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req)
{
        struct rtable *rt;
        struct ip_options *opt;

        opt = req->af.v4_req.opt;
        if(ip_route_output(&rt, ((opt && opt->srr) ?
                                 opt->faddr :
                                 req->af.v4_req.rmt_addr),
                           req->af.v4_req.loc_addr,
                           RT_CONN_FLAGS(sk), sk->bound_dev_if)) {
                IP_INC_STATS_BH(IpOutNoRoutes);
                return NULL;
        }
        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
                ip_rt_put(rt);
                IP_INC_STATS_BH(IpOutNoRoutes);
                return NULL;
        }
        return &rt->u.dst;
}

/*
 *      Send a SYN-ACK after having received an ACK.
 *      This still operates on a open_request only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
                              struct dst_entry *dst)
{
        int err = -1;
        struct sk_buff * skb;

        /* First, grab a route. */
        if (dst == NULL &&
            (dst = tcp_v4_route_req(sk, req)) == NULL)
                goto out;

        skb = tcp_make_synack(sk, dst, req);

        if (skb) {
                struct tcphdr *th = skb->h.th;

                th->check = tcp_v4_check(th, skb->len,
                                         req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
                                         csum_partial((char *)th, skb->len, skb->csum));

                err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
                                            req->af.v4_req.rmt_addr, req->af.v4_req.opt);
                if (err == NET_XMIT_CN)
                        err = 0;
        }

out:
        dst_release(dst);
        return err;
}

/*
 *      IPv4 open_request destructor.
 */
static void tcp_v4_or_free(struct open_request *req)
{
        if (req->af.v4_req.opt)
                kfree(req->af.v4_req.opt);
}

static inline void syn_flood_warning(struct sk_buff *skb)
{
        static unsigned long warntime;

        if (jiffies - warntime > HZ*60) {
                warntime = jiffies;
                printk(KERN_INFO
                       "possible SYN flooding on port %d. Sending cookies.\n",
                       ntohs(skb->h.th->dest));
        }
}

/*
 * Save and compile IPv4 options into the open_request if needed.
 */
static inline struct ip_options *
tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
{
        struct ip_options *opt = &(IPCB(skb)->opt);
        struct ip_options *dopt = NULL;

        if (opt && opt->optlen) {
                int opt_size = optlength(opt);
                dopt = kmalloc(opt_size, GFP_ATOMIC);
                if (dopt) {
                        if (ip_options_echo(dopt, skb)) {
                                kfree(dopt);
                                dopt = NULL;
                        }
                }
        }
        return dopt;
}

/*
 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
 * One SYN_RECV socket costs about 80bytes on a 32bit machine.
 * It would be better to replace it with a global counter for all sockets
 * but then some measure against one socket starving all other sockets
 * would be needed.
 *
 * It was 128 by default. Experiments with real servers show, that
 * it is absolutely not enough even at 100conn/sec. 256 cures most
 * of problems. This value is adjusted to 128 for very small machines
 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
 * Further increasing requires to change hash table size.
 */
int sysctl_max_syn_backlog = 256;

struct or_calltable or_ipv4 = {
        PF_INET,
        tcp_v4_send_synack,
        tcp_v4_or_send_ack,
        tcp_v4_or_free,
        tcp_v4_send_reset
};
1393
 
1394
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1395
{
1396
        struct tcp_opt tp;
1397
        struct open_request *req;
1398
        __u32 saddr = skb->nh.iph->saddr;
1399
        __u32 daddr = skb->nh.iph->daddr;
1400
        __u32 isn = TCP_SKB_CB(skb)->when;
1401
        struct dst_entry *dst = NULL;
1402
#ifdef CONFIG_SYN_COOKIES
1403
        int want_cookie = 0;
1404
#else
1405
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1406
#endif
1407
 
1408
        /* Never answer to SYNs send to broadcast or multicast */
1409
        if (((struct rtable *)skb->dst)->rt_flags &
1410
            (RTCF_BROADCAST|RTCF_MULTICAST))
1411
                goto drop;
1412
 
1413
        /* TW buckets are converted to open requests without
1414
         * limitations, they conserve resources and peer is
1415
         * evidently real one.
1416
         */
1417
        if (tcp_synq_is_full(sk) && !isn) {
1418
#ifdef CONFIG_SYN_COOKIES
1419
                if (sysctl_tcp_syncookies) {
1420
                        want_cookie = 1;
1421
                } else
1422
#endif
1423
                goto drop;
1424
        }
1425
 
1426
        /* Accept backlog is full. If we have already queued enough
1427
         * of warm entries in syn queue, drop request. It is better than
1428
         * clogging syn queue with openreqs with exponentially increasing
1429
         * timeout.
1430
         */
1431
        if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1432
                goto drop;
1433
 
1434
        req = tcp_openreq_alloc();
1435
        if (req == NULL)
1436
                goto drop;
1437
 
1438
        tcp_clear_options(&tp);
1439
        tp.mss_clamp = 536;
1440
        tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;
1441
 
1442
        tcp_parse_options(skb, &tp, 0);
1443
 
1444
        if (want_cookie) {
1445
                tcp_clear_options(&tp);
1446
                tp.saw_tstamp = 0;
1447
        }
1448
 
1449
        if (tp.saw_tstamp && tp.rcv_tsval == 0) {
1450
                /* Some OSes (unknown ones, but I see them on web server, which
1451
                 * contains information interesting only for windows'
1452
                 * users) do not send their stamp in SYN. It is easy case.
1453
                 * We simply do not advertise TS support.
1454
                 */
1455
                tp.saw_tstamp = 0;
1456
                tp.tstamp_ok = 0;
1457
        }
1458
        tp.tstamp_ok = tp.saw_tstamp;
1459
 
1460
        tcp_openreq_init(req, &tp, skb);
1461
 
1462
        req->af.v4_req.loc_addr = daddr;
1463
        req->af.v4_req.rmt_addr = saddr;
1464
        req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1465
        req->class = &or_ipv4;
1466
        if (!want_cookie)
1467
                TCP_ECN_create_request(req, skb->h.th);
1468
 
1469
        if (want_cookie) {
1470
#ifdef CONFIG_SYN_COOKIES
1471
                syn_flood_warning(skb);
1472
#endif
1473
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1474
        } else if (isn == 0) {
1475
                struct inet_peer *peer = NULL;
1476
 
1477
                /* VJ's idea. We save the last timestamp seen
1478
                 * from the destination in the peer table when entering
1479
                 * TIME-WAIT state, and check against it before
1480
                 * accepting a new connection request.
1481
                 *
1482
                 * If "isn" is not zero, this request hit a live
1483
                 * TIME-WAIT bucket, so all the necessary checks
1484
                 * are made in the function processing the TIME-WAIT state.
1485
                 */
1486
                if (tp.saw_tstamp &&
1487
                    sysctl_tcp_tw_recycle &&
1488
                    (dst = tcp_v4_route_req(sk, req)) != NULL &&
1489
                    (peer = rt_get_peer((struct rtable*)dst)) != NULL &&
1490
                    peer->v4daddr == saddr) {
1491
                        if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1492
                            (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) {
1493
                                NET_INC_STATS_BH(PAWSPassiveRejected);
1494
                                dst_release(dst);
1495
                                goto drop_and_free;
1496
                        }
1497
                }
1498
                /* Kill the following clause if you dislike this approach. */
1499
                else if (!sysctl_tcp_syncookies &&
1500
                         (sysctl_max_syn_backlog - tcp_synq_len(sk)
1501
                          < (sysctl_max_syn_backlog>>2)) &&
1502
                         (!peer || !peer->tcp_ts_stamp) &&
1503
                         (!dst || !dst->rtt)) {
1504
                        /* Without syncookies the last quarter of the
1505
                         * backlog is reserved for destinations proven to be alive:
1506
                         * we keep communicating only with destinations
1507
                         * already remembered at the moment
1508
                         * the SYN flood started.
1509
                         */
1510
                        NETDEBUG(if (net_ratelimit()) \
1511
                                printk(KERN_DEBUG "TCP: drop open request from %u.%u.%u.%u/%u\n", \
1512
                                        NIPQUAD(saddr), ntohs(skb->h.th->source)));
1513
                        dst_release(dst);
1514
                        goto drop_and_free;
1515
                }
1516
 
1517
                isn = tcp_v4_init_sequence(sk, skb);
1518
        }
1519
        req->snt_isn = isn;
1520
 
1521
        if (tcp_v4_send_synack(sk, req, dst))
1522
                goto drop_and_free;
1523
 
1524
        if (want_cookie) {
1525
                tcp_openreq_free(req);
1526
        } else {
1527
                tcp_v4_synq_add(sk, req);
1528
        }
1529
        return 0;
1530
 
1531
drop_and_free:
1532
        tcp_openreq_free(req);
1533
drop:
1534
        TCP_INC_STATS_BH(TcpAttemptFails);
1535
        return 0;
1536
}
1537
 
1538
 
1539
/*
1540
 * The three way handshake has completed - we got a valid synack -
1541
 * now create the new socket.
1542
 */
1543
struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1544
                                   struct open_request *req,
1545
                                   struct dst_entry *dst)
1546
{
1547
        struct tcp_opt *newtp;
1548
        struct sock *newsk;
1549
 
1550
        if (tcp_acceptq_is_full(sk))
1551
                goto exit_overflow;
1552
 
1553
        if (dst == NULL &&
1554
            (dst = tcp_v4_route_req(sk, req)) == NULL)
1555
                goto exit;
1556
 
1557
        newsk = tcp_create_openreq_child(sk, req, skb);
1558
        if (!newsk)
1559
                goto exit;
1560
 
1561
        newsk->dst_cache = dst;
1562
        newsk->route_caps = dst->dev->features;
1563
 
1564
        newtp = &(newsk->tp_pinfo.af_tcp);
1565
        newsk->daddr = req->af.v4_req.rmt_addr;
1566
        newsk->saddr = req->af.v4_req.loc_addr;
1567
        newsk->rcv_saddr = req->af.v4_req.loc_addr;
1568
        newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
1569
        req->af.v4_req.opt = NULL;
1570
        newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb);
1571
        newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
1572
        newtp->ext_header_len = 0;
1573
        if (newsk->protinfo.af_inet.opt)
1574
                newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;
1575
        newsk->protinfo.af_inet.id = newtp->write_seq^jiffies;
1576
 
1577
        tcp_sync_mss(newsk, dst->pmtu);
1578
        newtp->advmss = dst->advmss;
1579
        tcp_initialize_rcv_mss(newsk);
1580
 
1581
        __tcp_v4_hash(newsk, 0);
1582
        __tcp_inherit_port(sk, newsk);
1583
 
1584
        return newsk;
1585
 
1586
exit_overflow:
1587
        NET_INC_STATS_BH(ListenOverflows);
1588
exit:
1589
        NET_INC_STATS_BH(ListenDrops);
1590
        dst_release(dst);
1591
        return NULL;
1592
}
1593
 
1594
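/* Demultiplex a segment arriving on a listening socket: first look for a
 * matching open_request in the SYN queue, then for an established or
 * TIME_WAIT socket, and finally (with syncookies enabled) try to validate
 * a bare ACK against a cookie.
 */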
static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1595
{
1596
        struct open_request *req, **prev;
1597
        struct tcphdr *th = skb->h.th;
1598
        struct iphdr *iph = skb->nh.iph;
1599
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1600
        struct sock *nsk;
1601
 
1602
        /* Find possible connection requests. */
1603
        req = tcp_v4_search_req(tp, &prev,
1604
                                th->source,
1605
                                iph->saddr, iph->daddr);
1606
        if (req)
1607
                return tcp_check_req(sk, skb, req, prev);
1608
 
1609
        nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1610
                                          th->source,
1611
                                          skb->nh.iph->daddr,
1612
                                          ntohs(th->dest),
1613
                                          tcp_v4_iif(skb));
1614
 
1615
        if (nsk) {
1616
                if (nsk->state != TCP_TIME_WAIT) {
1617
                        bh_lock_sock(nsk);
1618
                        return nsk;
1619
                }
1620
                tcp_tw_put((struct tcp_tw_bucket*)nsk);
1621
                return NULL;
1622
        }
1623
 
1624
#ifdef CONFIG_SYN_COOKIES
1625
        if (!th->rst && !th->syn && th->ack)
1626
                sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1627
#endif
1628
        return sk;
1629
}
1630
 
1631
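/* Checksum setup on receive: trust a hardware checksum if it verifies,
 * fully verify short segments (<= 76 bytes) in software, and otherwise seed
 * skb->csum with the pseudo-header checksum so the data checksum can be
 * completed later (e.g. during the copy to user space).
 */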
static int tcp_v4_checksum_init(struct sk_buff *skb)
1632
{
1633
        if (skb->ip_summed == CHECKSUM_HW) {
1634
                skb->ip_summed = CHECKSUM_UNNECESSARY;
1635
                if (!tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1636
                                  skb->nh.iph->daddr,skb->csum))
1637
                        return 0;
1638
 
1639
                NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1640
                skb->ip_summed = CHECKSUM_NONE;
1641
        }
1642
        if (skb->len <= 76) {
1643
                if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1644
                                 skb->nh.iph->daddr,
1645
                                 skb_checksum(skb, 0, skb->len, 0)))
1646
                        return -1;
1647
                skb->ip_summed = CHECKSUM_UNNECESSARY;
1648
        } else {
1649
                skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1650
                                          skb->nh.iph->daddr,0);
1651
        }
1652
        return 0;
1653
}
1654
 
1655
 
1656
/* The socket must have its spinlock held when we get
1657
 * here.
1658
 *
1659
 * We have a potential double-lock case here, so even when
1660
 * doing backlog processing we use the BH locking scheme.
1661
 * This is because we cannot sleep with the original spinlock
1662
 * held.
1663
 */
1664
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1665
{
1666
        IP_INC_STATS_BH(IpInDelivers);
1667
 
1668
        if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1669
                TCP_CHECK_TIMER(sk);
1670
                if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1671
                        goto reset;
1672
                TCP_CHECK_TIMER(sk);
1673
                return 0;
1674
        }
1675
 
1676
        if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb))
1677
                goto csum_err;
1678
 
1679
        if (sk->state == TCP_LISTEN) {
1680
                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1681
                if (!nsk)
1682
                        goto discard;
1683
 
1684
                if (nsk != sk) {
1685
                        if (tcp_child_process(sk, nsk, skb))
1686
                                goto reset;
1687
                        return 0;
1688
                }
1689
        }
1690
 
1691
        TCP_CHECK_TIMER(sk);
1692
        if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1693
                goto reset;
1694
        TCP_CHECK_TIMER(sk);
1695
        return 0;
1696
 
1697
reset:
1698
        tcp_v4_send_reset(skb);
1699
discard:
1700
        kfree_skb(skb);
1701
        /* Be careful here. If this function gets more complicated and
1702
         * gcc suffers from register pressure on the x86, sk (in %ebx)
1703
         * might be destroyed here. This current version compiles correctly,
1704
         * but you have been warned.
1705
         */
1706
        return 0;
1707
 
1708
csum_err:
1709
        TCP_INC_STATS_BH(TcpInErrs);
1710
        goto discard;
1711
}
1712
 
1713
/*
1714
 *      From tcp_input.c
1715
 */
1716
 
1717
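/* Main receive entry point, called from the IP layer: validate the header
 * and checksum, fill in TCP_SKB_CB(), look up the owning socket, and either
 * process the segment directly, prequeue it, or append it to the socket
 * backlog. TIME_WAIT sockets are handled separately at the bottom.
 */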
int tcp_v4_rcv(struct sk_buff *skb)
1718
{
1719
        struct tcphdr *th;
1720
        struct sock *sk;
1721
        int ret;
1722
 
1723
        if (skb->pkt_type!=PACKET_HOST)
1724
                goto discard_it;
1725
 
1726
        /* Count it even if it's bad */
1727
        TCP_INC_STATS_BH(TcpInSegs);
1728
 
1729
        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1730
                goto discard_it;
1731
 
1732
        th = skb->h.th;
1733
 
1734
        if (th->doff < sizeof(struct tcphdr)/4)
1735
                goto bad_packet;
1736
        if (!pskb_may_pull(skb, th->doff*4))
1737
                goto discard_it;
1738
 
1739
        /* An explanation is required here, I think.
1740
         * Packet length and doff are validated by header prediction,
1741
         * provided the case of th->doff==0 is eliminated.
1742
         * So we defer the checks. */
1743
        if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1744
             tcp_v4_checksum_init(skb) < 0))
1745
                goto bad_packet;
1746
 
1747
        th = skb->h.th;
1748
        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1749
        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1750
                                    skb->len - th->doff*4);
1751
        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1752
        TCP_SKB_CB(skb)->when = 0;
1753
        TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1754
        TCP_SKB_CB(skb)->sacked = 0;
1755
 
1756
        sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1757
                             skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1758
 
1759
        if (!sk)
1760
                goto no_tcp_socket;
1761
 
1762
process:
1763
        if(!ipsec_sk_policy(sk,skb))
1764
                goto discard_and_relse;
1765
 
1766
        if (sk->state == TCP_TIME_WAIT)
1767
                goto do_time_wait;
1768
 
1769
        if (sk_filter(sk, skb, 0))
1770
                goto discard_and_relse;
1771
 
1772
        skb->dev = NULL;
1773
 
1774
        bh_lock_sock(sk);
1775
        ret = 0;
1776
        if (!sk->lock.users) {
1777
                if (!tcp_prequeue(sk, skb))
1778
                        ret = tcp_v4_do_rcv(sk, skb);
1779
        } else
1780
                sk_add_backlog(sk, skb);
1781
        bh_unlock_sock(sk);
1782
 
1783
        sock_put(sk);
1784
 
1785
        return ret;
1786
 
1787
no_tcp_socket:
1788
        if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1789
bad_packet:
1790
                TCP_INC_STATS_BH(TcpInErrs);
1791
        } else {
1792
                tcp_v4_send_reset(skb);
1793
        }
1794
 
1795
discard_it:
1796
        /* Discard frame. */
1797
        kfree_skb(skb);
1798
        return 0;
1799
 
1800
discard_and_relse:
1801
        sock_put(sk);
1802
        goto discard_it;
1803
 
1804
do_time_wait:
1805
        if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1806
                TCP_INC_STATS_BH(TcpInErrs);
1807
                tcp_tw_put((struct tcp_tw_bucket *) sk);
1808
                goto discard_it;
1809
        }
1810
        switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1811
                                          skb, th, skb->len)) {
1812
        case TCP_TW_SYN:
1813
        {
1814
                struct sock *sk2;
1815
 
1816
                sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1817
                if (sk2 != NULL) {
1818
                        tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1819
                        tcp_timewait_kill((struct tcp_tw_bucket *)sk);
1820
                        tcp_tw_put((struct tcp_tw_bucket *)sk);
1821
                        sk = sk2;
1822
                        goto process;
1823
                }
1824
                /* Fall through to ACK */
1825
        }
1826
        case TCP_TW_ACK:
1827
                tcp_v4_timewait_ack(sk, skb);
1828
                break;
1829
        case TCP_TW_RST:
1830
                goto no_tcp_socket;
1831
        case TCP_TW_SUCCESS:;
1832
        }
1833
        goto discard_it;
1834
}
1835
 
1836
/* With per-bucket locks this operation is not atomic, so
1837
 * this version is no worse.
1838
 */
1839
static void __tcp_v4_rehash(struct sock *sk)
1840
{
1841
        sk->prot->unhash(sk);
1842
        sk->prot->hash(sk);
1843
}
1844
 
1845
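/* Pick a new source address from a fresh route and rehash the socket
 * accordingly. Only reached from tcp_v4_rebuild_header() below, when
 * rerouting with the old source address failed and ip_dynaddr allows it.
 */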
static int tcp_v4_reselect_saddr(struct sock *sk)
1846
{
1847
        int err;
1848
        struct rtable *rt;
1849
        __u32 old_saddr = sk->saddr;
1850
        __u32 new_saddr;
1851
        __u32 daddr = sk->daddr;
1852
 
1853
        if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1854
                daddr = sk->protinfo.af_inet.opt->faddr;
1855
 
1856
        /* Query new route. */
1857
        err = ip_route_connect(&rt, daddr, 0,
1858
                               RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
1859
                               sk->bound_dev_if);
1860
        if (err)
1861
                return err;
1862
 
1863
        __sk_dst_set(sk, &rt->u.dst);
1864
        sk->route_caps = rt->u.dst.dev->features;
1865
 
1866
        new_saddr = rt->rt_src;
1867
 
1868
        if (new_saddr == old_saddr)
1869
                return 0;
1870
 
1871
        if (sysctl_ip_dynaddr > 1) {
1872
                printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
1873
                       "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1874
                       NIPQUAD(old_saddr),
1875
                       NIPQUAD(new_saddr));
1876
        }
1877
 
1878
        sk->saddr = new_saddr;
1879
        sk->rcv_saddr = new_saddr;
1880
 
1881
        /* XXX The only ugly spot where we need to
1882
         * XXX really change the socket's identity after
1883
         * XXX it has entered the hashes. -DaveM
1884
         *
1885
         * Besides that, it does not check for connection
1886
         * uniqueness. Expect trouble.
1887
         */
1888
        __tcp_v4_rehash(sk);
1889
        return 0;
1890
}
1891
 
1892
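/* Revalidate the cached route before transmission. If rerouting fails and
 * ip_dynaddr permits (the socket is still in SYN_SENT and the local address
 * is not locked), fall back to reselecting the source address above.
 */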
int tcp_v4_rebuild_header(struct sock *sk)
1893
{
1894
        struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1895
        u32 daddr;
1896
        int err;
1897
 
1898
        /* Route is OK, nothing to do. */
1899
        if (rt != NULL)
1900
                return 0;
1901
 
1902
        /* Reroute. */
1903
        daddr = sk->daddr;
1904
        if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1905
                daddr = sk->protinfo.af_inet.opt->faddr;
1906
 
1907
        err = ip_route_output(&rt, daddr, sk->saddr,
1908
                              RT_CONN_FLAGS(sk), sk->bound_dev_if);
1909
        if (!err) {
1910
                __sk_dst_set(sk, &rt->u.dst);
1911
                sk->route_caps = rt->u.dst.dev->features;
1912
                return 0;
1913
        }
1914
 
1915
        /* Routing failed... */
1916
        sk->route_caps = 0;
1917
 
1918
        if (!sysctl_ip_dynaddr ||
1919
            sk->state != TCP_SYN_SENT ||
1920
            (sk->userlocks & SOCK_BINDADDR_LOCK) ||
1921
            (err = tcp_v4_reselect_saddr(sk)) != 0)
1922
                sk->err_soft=-err;
1923
 
1924
        return err;
1925
}
1926
 
1927
static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1928
{
1929
        struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1930
 
1931
        sin->sin_family         = AF_INET;
1932
        sin->sin_addr.s_addr    = sk->daddr;
1933
        sin->sin_port           = sk->dport;
1934
}
1935
 
1936
/* VJ's idea. Save the last timestamp seen from this destination
1937
 * and hold it for at least the normal TIME-WAIT interval, to use for duplicate
1938
 * segment detection in subsequent connections before they enter the synchronized
1939
 * state.
1940
 */
1941
 
1942
int tcp_v4_remember_stamp(struct sock *sk)
1943
{
1944
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1945
        struct rtable *rt = (struct rtable*)__sk_dst_get(sk);
1946
        struct inet_peer *peer = NULL;
1947
        int release_it = 0;
1948
 
1949
        if (rt == NULL || rt->rt_dst != sk->daddr) {
1950
                peer = inet_getpeer(sk->daddr, 1);
1951
                release_it = 1;
1952
        } else {
1953
                if (rt->peer == NULL)
1954
                        rt_bind_peer(rt, 1);
1955
                peer = rt->peer;
1956
        }
1957
 
1958
        if (peer) {
1959
                if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
1960
                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1961
                     peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
1962
                        peer->tcp_ts_stamp = tp->ts_recent_stamp;
1963
                        peer->tcp_ts = tp->ts_recent;
1964
                }
1965
                if (release_it)
1966
                        inet_putpeer(peer);
1967
                return 1;
1968
        }
1969
 
1970
        return 0;
1971
}
1972
 
1973
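/* Same as tcp_v4_remember_stamp() above, but for a connection already
 * demoted to a tcp_tw_bucket; here the inet_peer entry is always looked up
 * and released explicitly.
 */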
int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1974
{
1975
        struct inet_peer *peer = NULL;
1976
 
1977
        peer = inet_getpeer(tw->daddr, 1);
1978
 
1979
        if (peer) {
1980
                if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 ||
1981
                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1982
                     peer->tcp_ts_stamp <= tw->ts_recent_stamp)) {
1983
                        peer->tcp_ts_stamp = tw->ts_recent_stamp;
1984
                        peer->tcp_ts = tw->ts_recent;
1985
                }
1986
                inet_putpeer(peer);
1987
                return 1;
1988
        }
1989
 
1990
        return 0;
1991
}
1992
 
1993
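/* The IPv4 instance of the af-specific operation table that the
 * protocol-independent TCP code calls through; the IPv6 code presumably
 * provides its own table with the same layout.
 */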
struct tcp_func ipv4_specific = {
1994
        ip_queue_xmit,
1995
        tcp_v4_send_check,
1996
        tcp_v4_rebuild_header,
1997
        tcp_v4_conn_request,
1998
        tcp_v4_syn_recv_sock,
1999
        tcp_v4_remember_stamp,
2000
        sizeof(struct iphdr),
2001
 
2002
        ip_setsockopt,
2003
        ip_getsockopt,
2004
        v4_addr2sockaddr,
2005
        sizeof(struct sockaddr_in)
2006
};
2007
 
2008
/* NOTE: A lot of fields are set to zero explicitly by the call to
2009
 *       sk_alloc(), so they need not be initialized here.
2010
 */
2011
static int tcp_v4_init_sock(struct sock *sk)
2012
{
2013
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2014
 
2015
        skb_queue_head_init(&tp->out_of_order_queue);
2016
        tcp_init_xmit_timers(sk);
2017
        tcp_prequeue_init(tp);
2018
 
2019
        tp->rto  = TCP_TIMEOUT_INIT;
2020
        tp->mdev = TCP_TIMEOUT_INIT;
2021
 
2022
        /* So many TCP implementations out there (incorrectly) count the
2023
         * initial SYN frame in their delayed-ACK and congestion control
2024
         * algorithms that we must have the following bandaid to talk
2025
         * efficiently to them.  -DaveM
2026
         */
2027
        tp->snd_cwnd = 2;
2028
 
2029
        /* See draft-stevens-tcpca-spec-01 for discussion of the
2030
         * initialization of these values.
2031
         */
2032
        tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
2033
        tp->snd_cwnd_clamp = ~0;
2034
        tp->mss_cache = 536;
2035
 
2036
        tp->reordering = sysctl_tcp_reordering;
2037
 
2038
        sk->state = TCP_CLOSE;
2039
 
2040
        sk->write_space = tcp_write_space;
2041
        sk->use_write_queue = 1;
2042
 
2043
        sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
2044
 
2045
        sk->sndbuf = sysctl_tcp_wmem[1];
2046
        sk->rcvbuf = sysctl_tcp_rmem[1];
2047
 
2048
        atomic_inc(&tcp_sockets_allocated);
2049
 
2050
        return 0;
2051
}
2052
 
2053
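/* Release per-connection resources on socket destruction: pending timers,
 * queued segments, the reference to the bound port and the cached sendmsg
 * page.
 */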
static int tcp_v4_destroy_sock(struct sock *sk)
2054
{
2055
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2056
 
2057
        tcp_clear_xmit_timers(sk);
2058
 
2059
        /* Clean up the write buffer. */
2060
        tcp_writequeue_purge(sk);
2061
 
2062
        /* Cleans up our, hopefully empty, out_of_order_queue. */
2063
        __skb_queue_purge(&tp->out_of_order_queue);
2064
 
2065
        /* Clean up the prequeue; it really must be empty. */
2066
        __skb_queue_purge(&tp->ucopy.prequeue);
2067
 
2068
        /* Clean up a referenced TCP bind bucket. */
2069
        if(sk->prev != NULL)
2070
                tcp_put_port(sk);
2071
 
2072
        /* If sendmsg cached page exists, toss it. */
2073
        if (tp->sndmsg_page != NULL)
2074
                __free_page(tp->sndmsg_page);
2075
 
2076
        atomic_dec(&tcp_sockets_allocated);
2077
 
2078
        return 0;
2079
}
2080
 
2081
/* Proc filesystem TCP sock list dumping. */
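/* The three helpers below each format one row of /proc/net/tcp:
 * get_openreq() for pending connection requests, get_tcp_sock() for full
 * sockets and get_timewait_sock() for TIME_WAIT buckets, using the column
 * layout printed by tcp_get_info().
 */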
2082
static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i, int uid)
2083
{
2084
        int ttd = req->expires - jiffies;
2085
 
2086
        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2087
                " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
2088
                i,
2089
                req->af.v4_req.loc_addr,
2090
                ntohs(sk->sport),
2091
                req->af.v4_req.rmt_addr,
2092
                ntohs(req->rmt_port),
2093
                TCP_SYN_RECV,
2094
                0,0, /* could print option size, but that is af dependent. */
2095
                1,   /* timers active (only the expire timer) */
2096
                ttd,
2097
                req->retrans,
2098
                uid,
2099
                0,  /* non standard timer */
2100
                0, /* open_requests have no inode */
2101
                atomic_read(&sk->refcnt),
2102
                req
2103
                );
2104
}
2105
 
2106
static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
2107
{
2108
        unsigned int dest, src;
2109
        __u16 destp, srcp;
2110
        int timer_active;
2111
        unsigned long timer_expires;
2112
        struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
2113
 
2114
        dest  = sp->daddr;
2115
        src   = sp->rcv_saddr;
2116
        destp = ntohs(sp->dport);
2117
        srcp  = ntohs(sp->sport);
2118
        if (tp->pending == TCP_TIME_RETRANS) {
2119
                timer_active    = 1;
2120
                timer_expires   = tp->timeout;
2121
        } else if (tp->pending == TCP_TIME_PROBE0) {
2122
                timer_active    = 4;
2123
                timer_expires   = tp->timeout;
2124
        } else if (timer_pending(&sp->timer)) {
2125
                timer_active    = 2;
2126
                timer_expires   = sp->timer.expires;
2127
        } else {
2128
                timer_active    = 0;
2129
                timer_expires = jiffies;
2130
        }
2131
 
2132
        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2133
                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d",
2134
                i, src, srcp, dest, destp, sp->state,
2135
                tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
2136
                timer_active, timer_expires-jiffies,
2137
                tp->retransmits,
2138
                sock_i_uid(sp),
2139
                tp->probes_out,
2140
                sock_i_ino(sp),
2141
                atomic_read(&sp->refcnt), sp,
2142
                tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong,
2143
                tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh
2144
                );
2145
}
2146
 
2147
static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2148
{
2149
        unsigned int dest, src;
2150
        __u16 destp, srcp;
2151
        int ttd = tw->ttd - jiffies;
2152
 
2153
        if (ttd < 0)
2154
                ttd = 0;
2155
 
2156
        dest  = tw->daddr;
2157
        src   = tw->rcv_saddr;
2158
        destp = ntohs(tw->dport);
2159
        srcp  = ntohs(tw->sport);
2160
 
2161
        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2162
                " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2163
                i, src, srcp, dest, destp, tw->substate, 0, 0,
2164
                3, ttd, 0, 0, 0, 0,
2165
                atomic_read(&tw->refcnt), tw);
2166
}
2167
 
2168
#define TMPSZ 150
2169
 
2170
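/* Generate /proc/net/tcp: walk the listening hash (including each
 * listener's SYN queue), then the established hash together with its
 * TIME_WAIT half, honouring the offset/length window of the procfs read.
 */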
int tcp_get_info(char *buffer, char **start, off_t offset, int length)
2171
{
2172
        int len = 0, num = 0, i;
2173
        off_t begin, pos = 0;
2174
        char tmpbuf[TMPSZ+1];
2175
 
2176
        if (offset < TMPSZ)
2177
                len += sprintf(buffer, "%-*s\n", TMPSZ-1,
2178
                               "  sl  local_address rem_address   st tx_queue "
2179
                               "rx_queue tr tm->when retrnsmt   uid  timeout inode");
2180
 
2181
        pos = TMPSZ;
2182
 
2183
        /* First, walk listening socket table. */
2184
        tcp_listen_lock();
2185
        for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
2186
                struct sock *sk;
2187
                struct tcp_listen_opt *lopt;
2188
                int k;
2189
 
2190
                for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
2191
                        struct open_request *req;
2192
                        int uid;
2193
                        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2194
 
2195
                        if (!TCP_INET_FAMILY(sk->family))
2196
                                goto skip_listen;
2197
 
2198
                        pos += TMPSZ;
2199
                        if (pos >= offset) {
2200
                                get_tcp_sock(sk, tmpbuf, num);
2201
                                len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2202
                                if (pos >= offset + length) {
2203
                                        tcp_listen_unlock();
2204
                                        goto out_no_bh;
2205
                                }
2206
                        }
2207
 
2208
skip_listen:
2209
                        uid = sock_i_uid(sk);
2210
                        read_lock_bh(&tp->syn_wait_lock);
2211
                        lopt = tp->listen_opt;
2212
                        if (lopt && lopt->qlen != 0) {
2213
                                for (k=0; k<TCP_SYNQ_HSIZE; k++) {
2214
                                        for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) {
2215
                                                if (!TCP_INET_FAMILY(req->class->family))
2216
                                                        continue;
2217
 
2218
                                                pos += TMPSZ;
2219
                                                if (pos <= offset)
2220
                                                        continue;
2221
                                                get_openreq(sk, req, tmpbuf, num, uid);
2222
                                                len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2223
                                                if (pos >= offset + length) {
2224
                                                        read_unlock_bh(&tp->syn_wait_lock);
2225
                                                        tcp_listen_unlock();
2226
                                                        goto out_no_bh;
2227
                                                }
2228
                                        }
2229
                                }
2230
                        }
2231
                        read_unlock_bh(&tp->syn_wait_lock);
2232
 
2233
                        /* Completed requests are in the normal socket hash table. */
2234
                }
2235
        }
2236
        tcp_listen_unlock();
2237
 
2238
        local_bh_disable();
2239
 
2240
        /* Next, walk established hash chain. */
2241
        for (i = 0; i < tcp_ehash_size; i++) {
2242
                struct tcp_ehash_bucket *head = &tcp_ehash[i];
2243
                struct sock *sk;
2244
                struct tcp_tw_bucket *tw;
2245
 
2246
                read_lock(&head->lock);
2247
                for(sk = head->chain; sk; sk = sk->next, num++) {
2248
                        if (!TCP_INET_FAMILY(sk->family))
2249
                                continue;
2250
                        pos += TMPSZ;
2251
                        if (pos <= offset)
2252
                                continue;
2253
                        get_tcp_sock(sk, tmpbuf, num);
2254
                        len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2255
                        if (pos >= offset + length) {
2256
                                read_unlock(&head->lock);
2257
                                goto out;
2258
                        }
2259
                }
2260
                for (tw = (struct tcp_tw_bucket *)tcp_ehash[i+tcp_ehash_size].chain;
2261
                     tw != NULL;
2262
                     tw = (struct tcp_tw_bucket *)tw->next, num++) {
2263
                        if (!TCP_INET_FAMILY(tw->family))
2264
                                continue;
2265
                        pos += TMPSZ;
2266
                        if (pos <= offset)
2267
                                continue;
2268
                        get_timewait_sock(tw, tmpbuf, num);
2269
                        len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2270
                        if (pos >= offset + length) {
2271
                                read_unlock(&head->lock);
2272
                                goto out;
2273
                        }
2274
                }
2275
                read_unlock(&head->lock);
2276
        }
2277
 
2278
out:
2279
        local_bh_enable();
2280
out_no_bh:
2281
 
2282
        begin = len - (pos - offset);
2283
        *start = buffer + begin;
2284
        len -= begin;
2285
        if (len > length)
2286
                len = length;
2287
        if (len < 0)
2288
                len = 0;
2289
        return len;
2290
}
2291
 
2292
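/* Protocol operations exported to the socket layer for IPv4 TCP sockets,
 * written with the old GNU C labelled-initializer syntax (name: value).
 */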
struct proto tcp_prot = {
2293
        name:           "TCP",
2294
        close:          tcp_close,
2295
        connect:        tcp_v4_connect,
2296
        disconnect:     tcp_disconnect,
2297
        accept:         tcp_accept,
2298
        ioctl:          tcp_ioctl,
2299
        init:           tcp_v4_init_sock,
2300
        destroy:        tcp_v4_destroy_sock,
2301
        shutdown:       tcp_shutdown,
2302
        setsockopt:     tcp_setsockopt,
2303
        getsockopt:     tcp_getsockopt,
2304
        sendmsg:        tcp_sendmsg,
2305
        recvmsg:        tcp_recvmsg,
2306
        backlog_rcv:    tcp_v4_do_rcv,
2307
        hash:           tcp_v4_hash,
2308
        unhash:         tcp_unhash,
2309
        get_port:       tcp_v4_get_port,
2310
};
2311
 
2312
 
2313
 
2314
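/* Create the kernel-internal TCP control socket, presumably used for
 * sending resets and ACKs on behalf of connections we do not own; it is
 * unhashed below so that it never receives packets itself.
 */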
void __init tcp_v4_init(struct net_proto_family *ops)
2315
{
2316
        int err;
2317
 
2318
        tcp_inode.i_mode = S_IFSOCK;
2319
        tcp_inode.i_sock = 1;
2320
        tcp_inode.i_uid = 0;
2321
        tcp_inode.i_gid = 0;
2322
        init_waitqueue_head(&tcp_inode.i_wait);
2323
        init_waitqueue_head(&tcp_inode.u.socket_i.wait);
2324
 
2325
        tcp_socket->inode = &tcp_inode;
2326
        tcp_socket->state = SS_UNCONNECTED;
2327
        tcp_socket->type=SOCK_RAW;
2328
 
2329
        if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
2330
                panic("Failed to create the TCP control socket.\n");
2331
        tcp_socket->sk->allocation=GFP_ATOMIC;
2332
        tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL;
2333
 
2334
        /* Unhash it so that IP input processing does not even
2335
         * see it; we do not want this socket to see incoming
2336
         * packets.
2337
         */
2338
        tcp_socket->sk->prot->unhash(tcp_socket->sk);
2339
}
