OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

[/] [or1k/] [trunk/] [linux/] [linux-2.4/] [net/] [ipv4/] [ipvs/] [ip_vs_conn.c] - Blame information for rev 1765

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 1275 phoenix
/*
2
 * IPVS         An implementation of the IP virtual server support for the
3
 *              LINUX operating system.  IPVS is now implemented as a module
4
 *              over the Netfilter framework. IPVS can be used to build a
5
 *              high-performance and highly available server based on a
6
 *              cluster of servers.
7
 *
8
 * Version:     $Id: ip_vs_conn.c,v 1.1.1.1 2004-04-15 01:14:00 phoenix Exp $
9
 *
10
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
11
 *              Peter Kese <peter.kese@ijs.si>
12
 *              Julian Anastasov <ja@ssi.bg>
13
 *
14
 *              This program is free software; you can redistribute it and/or
15
 *              modify it under the terms of the GNU General Public License
16
 *              as published by the Free Software Foundation; either version
17
 *              2 of the License, or (at your option) any later version.
18
 *
19
 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
20
 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
21
 * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
22
 *
23
 * Changes:
24
 *
25
 */
26
 
27
#include <linux/module.h>
28
#include <linux/kernel.h>
29
#include <linux/vmalloc.h>
30
#include <linux/ip.h>
31
#include <linux/tcp.h>                  /* for tcphdr */
32
#include <linux/in.h>
33
#include <linux/proc_fs.h>              /* for proc_net_* */
34
#include <asm/softirq.h>                /* for local_bh_* */
35
#include <net/ip.h>
36
#include <net/tcp.h>                    /* for csum_tcpudp_magic */
37
#include <net/udp.h>
38
#include <net/icmp.h>                   /* for icmp_send */
39
#include <net/route.h>                  /* for ip_route_output */
40
#include <linux/netfilter.h>
41
#include <linux/netfilter_ipv4.h>
42
#include <linux/jhash.h>
43
#include <linux/random.h>
44
 
45
#include <net/ip_vs.h>
46
 
47
 
48
/*
49
 *  Connection hash table: for input and output packets lookups of IPVS
50
 */
51
/* NOTE(review): array of list heads, presumably IP_VS_CONN_TAB_MASK+1
   buckets; the allocation is not visible in this chunk — confirm in init. */
static struct list_head *ip_vs_conn_tab;
52
 
53
/* SLAB cache for IPVS connections */
54
static kmem_cache_t *ip_vs_conn_cachep;
55
 
56
/* counter for current IPVS connections */
57
static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
58
 
59
/* counter for no-client-port connections; lets ip_vs_conn_in_get()
   skip the wildcard-port retry lookup when no such entry exists */
static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
60
 
61
/* random value for IPVS connection hash (seed for jhash_3words) */
62
static unsigned int ip_vs_conn_rnd;
63
64
 
65
/*
66
 *  Fine locking granularity for big connection hash table:
 *  buckets share a small array of striped rwlocks instead of one
 *  global lock, reducing contention on SMP
67
 */
68
#define CT_LOCKARRAY_BITS  4
69
#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
70
#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)
71
 
72
/* one rwlock padded to a cache line so the stripes don't false-share */
struct ip_vs_aligned_lock
73
{
74
        rwlock_t        l;
75
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
76
 
77
/* lock array for conn table
   NOTE(review): used only via the ct_* helpers below but not declared
   static — confirm nothing outside this file references it */
78
struct ip_vs_aligned_lock
79
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
80
 
81
static inline void ct_read_lock(unsigned key)
82
{
83
        read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
84
}
85
 
86
static inline void ct_read_unlock(unsigned key)
87
{
88
        read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
89
}
90
 
91
static inline void ct_write_lock(unsigned key)
92
{
93
        write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
94
}
95
 
96
static inline void ct_write_unlock(unsigned key)
97
{
98
        write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
99
}
100
 
101
static inline void ct_read_lock_bh(unsigned key)
102
{
103
        read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
104
}
105
 
106
static inline void ct_read_unlock_bh(unsigned key)
107
{
108
        read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
109
}
110
 
111
static inline void ct_write_lock_bh(unsigned key)
112
{
113
        write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
114
}
115
 
116
static inline void ct_write_unlock_bh(unsigned key)
117
{
118
        write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
119
}
120
 
121
 
122
/*
123
 *      Returns hash value for IPVS connection entry
124
 */
125
static unsigned
126
ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port)
127
{
128
        return jhash_3words(addr, port, proto, ip_vs_conn_rnd)
129
                & IP_VS_CONN_TAB_MASK;
130
}
131
 
132
 
133
/*
134
 *      Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
135
 *      returns bool success.
136
 */
137
static int ip_vs_conn_hash(struct ip_vs_conn *cp)
138
{
139
        unsigned hash;
140
        int ret;
141
 
142
        /* Hash by protocol, client address and port */
143
        hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
144
 
145
        ct_write_lock(hash);
146
 
147
        if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
148
                list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
149
                cp->flags |= IP_VS_CONN_F_HASHED;
150
                atomic_inc(&cp->refcnt);
151
                ret = 1;
152
        } else {
153
                IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
154
                          "called from %p\n", __builtin_return_address(0));
155
                ret = 0;
156
        }
157
 
158
        ct_write_unlock(hash);
159
 
160
        return ret;
161
}
162
 
163
 
164
/*
165
 *      UNhashes ip_vs_conn from ip_vs_conn_tab.
166
 *      returns bool success.
167
 */
168
static int ip_vs_conn_unhash(struct ip_vs_conn *cp)
169
{
170
        unsigned hash;
171
        int ret;
172
 
173
        /* unhash it and decrease its reference counter */
174
        hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
175
        ct_write_lock(hash);
176
 
177
        if (cp->flags & IP_VS_CONN_F_HASHED) {
178
                list_del(&cp->c_list);
179
                cp->flags &= ~IP_VS_CONN_F_HASHED;
180
                atomic_dec(&cp->refcnt);
181
                ret = 1;
182
        } else
183
                ret = 0;
184
 
185
        ct_write_unlock(hash);
186
 
187
        return ret;
188
}
189
 
190
 
191
/*
192
 *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
193
 *  Called for pkts coming from OUTside-to-INside.
194
 *      s_addr, s_port: pkt source address (foreign host)
195
 *      d_addr, d_port: pkt dest address (load balancer)
196
 */
197
static inline struct ip_vs_conn *__ip_vs_conn_in_get
198
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
199
{
200
        unsigned hash;
201
        struct ip_vs_conn *cp;
202
        struct list_head *l,*e;
203
 
204
        hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
205
        l = &ip_vs_conn_tab[hash];
206
 
207
        ct_read_lock(hash);
208
 
209
        for (e=l->next; e!=l; e=e->next) {
210
                cp = list_entry(e, struct ip_vs_conn, c_list);
211
                if (s_addr==cp->caddr && s_port==cp->cport &&
212
                    d_port==cp->vport && d_addr==cp->vaddr &&
213
                    protocol==cp->protocol) {
214
                        /* HIT */
215
                        atomic_inc(&cp->refcnt);
216
                        ct_read_unlock(hash);
217
                        return cp;
218
                }
219
        }
220
 
221
        ct_read_unlock(hash);
222
 
223
        return NULL;
224
}
225
 
226
struct ip_vs_conn *ip_vs_conn_in_get
227
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
228
{
229
        struct ip_vs_conn *cp;
230
 
231
        cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
232
        if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
233
                cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
234
 
235
        IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
236
                  ip_vs_proto_name(protocol),
237
                  NIPQUAD(s_addr), ntohs(s_port),
238
                  NIPQUAD(d_addr), ntohs(d_port),
239
                  cp?"hit":"not hit");
240
 
241
        return cp;
242
}
243
 
244
 
245
/*
246
 *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
247
 *  Called for pkts coming from inside-to-OUTside.
248
 *      s_addr, s_port: pkt source address (inside host)
249
 *      d_addr, d_port: pkt dest address (foreign host)
250
 */
251
struct ip_vs_conn *ip_vs_conn_out_get
252
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
253
{
254
        unsigned hash;
255
        struct ip_vs_conn *cp, *ret=NULL;
256
        struct list_head *l,*e;
257
 
258
        /*
259
         *      Check for "full" addressed entries
260
         */
261
        hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
262
        l = &ip_vs_conn_tab[hash];
263
 
264
        ct_read_lock(hash);
265
 
266
        for (e=l->next; e!=l; e=e->next) {
267
                cp = list_entry(e, struct ip_vs_conn, c_list);
268
                if (d_addr == cp->caddr && d_port == cp->cport &&
269
                    s_port == cp->dport && s_addr == cp->daddr &&
270
                    protocol == cp->protocol) {
271
                        /* HIT */
272
                        atomic_inc(&cp->refcnt);
273
                        ret = cp;
274
                        break;
275
                }
276
        }
277
 
278
        ct_read_unlock(hash);
279
 
280
        IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
281
                  ip_vs_proto_name(protocol),
282
                  NIPQUAD(s_addr), ntohs(s_port),
283
                  NIPQUAD(d_addr), ntohs(d_port),
284
                  ret?"hit":"not hit");
285
 
286
        return ret;
287
}
288
 
289
 
290
/*
291
 *      Put back the conn and restart its timer with its timeout
292
 */
293
void ip_vs_conn_put(struct ip_vs_conn *cp)
294
{
295
        /* reset it expire in its timeout */
296
        mod_timer(&cp->timer, jiffies+cp->timeout);
297
 
298
        __ip_vs_conn_put(cp);
299
}
300
 
301
 
302
/*
303
 *      Timeout table[state]
304
 */
305
/* Default per-state expiry times, in jiffies. */
struct ip_vs_timeout_table vs_timeout_table = {
306
        ATOMIC_INIT(0),  /* refcnt */
307
        0,               /* scale: power-of-two shift applied by vs_set_state_timeout() */
308
        {
309
                [IP_VS_S_NONE]          =       30*60*HZ,
310
                [IP_VS_S_ESTABLISHED]   =       15*60*HZ,
311
                [IP_VS_S_SYN_SENT]      =       2*60*HZ,
312
                [IP_VS_S_SYN_RECV]      =       1*60*HZ,
313
                [IP_VS_S_FIN_WAIT]      =       2*60*HZ,
314
                [IP_VS_S_TIME_WAIT]     =       2*60*HZ,
315
                [IP_VS_S_CLOSE]         =       10*HZ,
316
                [IP_VS_S_CLOSE_WAIT]    =       60*HZ,
317
                [IP_VS_S_LAST_ACK]      =       30*HZ,
318
                [IP_VS_S_LISTEN]        =       2*60*HZ,
319
                [IP_VS_S_SYNACK]        =       120*HZ,
320
                [IP_VS_S_UDP]           =       5*60*HZ,
321
                [IP_VS_S_ICMP]          =       1*60*HZ,
322
                [IP_VS_S_LAST]          =       2*HZ,
323
        },      /* timeout */
324
};
325
 
326
 
327
/* Tightened expiry times used while flood (DoS) defense is active;
 * installed by ip_vs_secure_tcp_set(1). */
struct ip_vs_timeout_table vs_timeout_table_dos = {
328
        ATOMIC_INIT(0),  /* refcnt */
329
        0,               /* scale: power-of-two shift applied by vs_set_state_timeout() */
330
        {
331
                [IP_VS_S_NONE]          =       15*60*HZ,
332
                [IP_VS_S_ESTABLISHED]   =       8*60*HZ,
333
                [IP_VS_S_SYN_SENT]      =       60*HZ,
334
                [IP_VS_S_SYN_RECV]      =       10*HZ,
335
                [IP_VS_S_FIN_WAIT]      =       60*HZ,
336
                [IP_VS_S_TIME_WAIT]     =       60*HZ,
337
                [IP_VS_S_CLOSE]         =       10*HZ,
338
                [IP_VS_S_CLOSE_WAIT]    =       60*HZ,
339
                [IP_VS_S_LAST_ACK]      =       30*HZ,
340
                [IP_VS_S_LISTEN]        =       2*60*HZ,
341
                [IP_VS_S_SYNACK]        =       100*HZ,
342
                [IP_VS_S_UDP]           =       3*60*HZ,
343
                [IP_VS_S_ICMP]          =       1*60*HZ,
344
                [IP_VS_S_LAST]          =       2*HZ,
345
        },      /* timeout */
346
};
347
 
348
 
349
/*
350
 *      Timeout table to use for the VS entries
351
 *      If NULL we use the default table (vs_timeout_table).
352
 *      Under flood attack we switch to vs_timeout_table_dos
353
 */
354
 
355
static struct ip_vs_timeout_table *ip_vs_timeout_table = &vs_timeout_table;
356
 
357
/* printable names for each connection state; consumed by
   ip_vs_state_name() */
static const char * state_name_table[IP_VS_S_LAST+1] = {
358
        [IP_VS_S_NONE]          =       "NONE",
359
        [IP_VS_S_ESTABLISHED]   =       "ESTABLISHED",
360
        [IP_VS_S_SYN_SENT]      =       "SYN_SENT",
361
        [IP_VS_S_SYN_RECV]      =       "SYN_RECV",
362
        [IP_VS_S_FIN_WAIT]      =       "FIN_WAIT",
363
        [IP_VS_S_TIME_WAIT]     =       "TIME_WAIT",
364
        [IP_VS_S_CLOSE]         =       "CLOSE",
365
        [IP_VS_S_CLOSE_WAIT]    =       "CLOSE_WAIT",
366
        [IP_VS_S_LAST_ACK]      =       "LAST_ACK",
367
        [IP_VS_S_LISTEN]        =       "LISTEN",
368
        [IP_VS_S_SYNACK]        =       "SYNACK",
369
        [IP_VS_S_UDP]           =       "UDP",
370
        [IP_VS_S_ICMP]          =       "ICMP",
371
        [IP_VS_S_LAST]          =       "BUG!",
372
};
373
 
374
/* short state aliases that keep the transition tables readable */
#define sNO IP_VS_S_NONE
375
#define sES IP_VS_S_ESTABLISHED
376
#define sSS IP_VS_S_SYN_SENT
377
#define sSR IP_VS_S_SYN_RECV
378
#define sFW IP_VS_S_FIN_WAIT
379
#define sTW IP_VS_S_TIME_WAIT
380
#define sCL IP_VS_S_CLOSE
381
#define sCW IP_VS_S_CLOSE_WAIT
382
#define sLA IP_VS_S_LAST_ACK
383
#define sLI IP_VS_S_LISTEN
384
#define sSA IP_VS_S_SYNACK
385
 
386
/* one row of a TCP transition table: maps current state -> next state */
struct vs_tcp_states_t {
387
        int next_state[IP_VS_S_LAST];   /* should be _LAST_TCP */
388
};
389
 
390
const char * ip_vs_state_name(int state)
391
{
392
        if (state >= IP_VS_S_LAST)
393
                return "ERR!";
394
        return state_name_table[state] ? state_name_table[state] : "?";
395
}
396
 
397
/*
 * Normal TCP transition tables, used as
 *   next = table[event_row].next_state[current_state]
 * Rows come in three groups (INPUT, OUTPUT, INPUT-ONLY) of four
 * event rows each: syn, fin, ack, rst — see vs_tcp_state_idx().
 * NOTE(review): group offsets (0/4/8) depend on the VS_STATE_* values,
 * which are defined outside this chunk — confirm.
 */
static struct vs_tcp_states_t vs_tcp_states [] = {
398
/*      INPUT */
399
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
400
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
401
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
402
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
403
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
404
 
405
/*      OUTPUT */
406
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
407
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
408
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
409
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
410
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
411
 
412
/*      INPUT-ONLY */
413
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
414
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
415
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
416
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
417
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
418
};
419
 
420
/*
 * DoS-defense TCP transition tables (same layout as vs_tcp_states):
 * connections are parked in SYNACK more aggressively so half-open
 * floods expire quickly.  Installed by ip_vs_secure_tcp_set(1).
 */
static struct vs_tcp_states_t vs_tcp_states_dos [] = {
421
/*      INPUT */
422
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
423
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
424
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
425
/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
426
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
427
 
428
/*      OUTPUT */
429
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
430
/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
431
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
432
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
433
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
434
 
435
/*      INPUT-ONLY */
436
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
437
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
438
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
439
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
440
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
441
};
442
 
443
/* Active TCP transition table; swapped with vs_tcp_states_dos by
   ip_vs_secure_tcp_set() while flood defense is enabled. */
static struct vs_tcp_states_t *ip_vs_state_table = vs_tcp_states;
444
 
445
void ip_vs_secure_tcp_set(int on)
446
{
447
        if (on) {
448
                ip_vs_state_table = vs_tcp_states_dos;
449
                ip_vs_timeout_table = &vs_timeout_table_dos;
450
        } else {
451
                ip_vs_state_table = vs_tcp_states;
452
                ip_vs_timeout_table = &vs_timeout_table;
453
        }
454
}
455
 
456
 
457
static inline int vs_tcp_state_idx(struct tcphdr *th, int state_off)
458
{
459
        /*
460
         *      [0-3]: input states, [4-7]: output, [8-11] input only states.
461
         */
462
        if (th->rst)
463
                return state_off+3;
464
        if (th->syn)
465
                return state_off+0;
466
        if (th->fin)
467
                return state_off+1;
468
        if (th->ack)
469
                return state_off+2;
470
        return -1;
471
}
472
 
473
 
474
static inline int vs_set_state_timeout(struct ip_vs_conn *cp, int state)
475
{
476
        struct ip_vs_timeout_table *vstim = cp->timeout_table;
477
 
478
        /*
479
         *      Use default timeout table if no specific for this entry
480
         */
481
        if (!vstim)
482
                vstim = &vs_timeout_table;
483
 
484
        cp->timeout = vstim->timeout[cp->state=state];
485
 
486
        if (vstim->scale) {
487
                int scale = vstim->scale;
488
 
489
                if (scale<0)
490
                        cp->timeout >>= -scale;
491
                else if (scale > 0)
492
                        cp->timeout <<= scale;
493
        }
494
 
495
        return state;
496
}
497
 
498
 
499
/*
 * Drive the TCP state machine for one packet: pick the transition
 * table row from the TCP flags and direction, move to the new state,
 * keep the destination's active/inactive connection counters in sync,
 * and set the state's timeout.  Returns the new state.
 * Caller holds cp->lock (taken in ip_vs_set_state()).
 */
static inline int
500
vs_tcp_state(struct ip_vs_conn *cp, int state_off, struct tcphdr *th)
501
{
502
        int state_idx;
503
        int new_state = IP_VS_S_CLOSE;
504
 
505
        /*
506
         *    Update state offset to INPUT_ONLY if necessary
507
         *    or delete NO_OUTPUT flag if output packet detected
508
         */
509
        if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
510
                if (state_off == VS_STATE_OUTPUT)
511
                        cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
512
                else
513
                        state_off = VS_STATE_INPUT_ONLY;
514
        }
515
 
516
        /* no syn/fin/ack/rst flag set: fall through with IP_VS_S_CLOSE */
        if ((state_idx = vs_tcp_state_idx(th, state_off)) < 0) {
517
                IP_VS_DBG(8, "vs_tcp_state_idx(%d)=%d!!!\n",
518
                          state_off, state_idx);
519
                goto tcp_state_out;
520
        }
521
 
522
        new_state = ip_vs_state_table[state_idx].next_state[cp->state];
523
 
524
  tcp_state_out:
525
        if (new_state != cp->state) {
526
                struct ip_vs_dest *dest = cp->dest;
527
 
528
                IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
529
                          "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
530
                          ip_vs_proto_name(cp->protocol),
531
                          (state_off==VS_STATE_OUTPUT)?"output ":"input ",
532
                          th->syn? 'S' : '.',
533
                          th->fin? 'F' : '.',
534
                          th->ack? 'A' : '.',
535
                          th->rst? 'R' : '.',
536
                          NIPQUAD(cp->daddr), ntohs(cp->dport),
537
                          NIPQUAD(cp->caddr), ntohs(cp->cport),
538
                          ip_vs_state_name(cp->state),
539
                          ip_vs_state_name(new_state),
540
                          atomic_read(&cp->refcnt));
541
                /* move the connection between the real server's
                   active and inactive counters on ESTABLISHED edges */
                if (dest) {
542
                        if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
543
                            (new_state != IP_VS_S_ESTABLISHED)) {
544
                                atomic_dec(&dest->activeconns);
545
                                atomic_inc(&dest->inactconns);
546
                                cp->flags |= IP_VS_CONN_F_INACTIVE;
547
                        } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
548
                                   (new_state == IP_VS_S_ESTABLISHED)) {
549
                                atomic_inc(&dest->activeconns);
550
                                atomic_dec(&dest->inactconns);
551
                                cp->flags &= ~IP_VS_CONN_F_INACTIVE;
552
                        }
553
                }
554
        }
555
 
556
        return vs_set_state_timeout(cp, new_state);
557
}
558
 
559
 
560
/*
561
 *      Handle state transitions
562
 */
563
int ip_vs_set_state(struct ip_vs_conn *cp,
564
                    int state_off, struct iphdr *iph, void *tp)
565
{
566
        int ret;
567
 
568
        spin_lock(&cp->lock);
569
        switch (iph->protocol) {
570
        case IPPROTO_TCP:
571
                ret = vs_tcp_state(cp, state_off, tp);
572
                break;
573
        case IPPROTO_UDP:
574
                ret = vs_set_state_timeout(cp, IP_VS_S_UDP);
575
                break;
576
        case IPPROTO_ICMP:
577
                ret = vs_set_state_timeout(cp, IP_VS_S_ICMP);
578
                break;
579
        default:
580
                ret = -1;
581
        }
582
        spin_unlock(&cp->lock);
583
 
584
        return ret;
585
}
586
 
587
 
588
/*
589
 *      Set LISTEN timeout. (ip_vs_conn_put will setup timer)
590
 */
591
int ip_vs_conn_listen(struct ip_vs_conn *cp)
592
{
593
        /* only sets cp->timeout; the timer itself is re-armed later
           by ip_vs_conn_put() */
        vs_set_state_timeout(cp, IP_VS_S_LISTEN);
594
        return cp->timeout;
595
}
596
 
597
 
598
/*
599
 *      Bypass transmitter
600
 *      Let packets bypass the destination when the destination is not
601
 *      available, it may be only used in transparent cache cluster.
602
 */
603
static int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
604
{
605
        struct rtable *rt;                      /* Route to the other host */
606
        struct iphdr  *iph = skb->nh.iph;
607
        u8     tos = iph->tos;
608
        int    mtu;
609
 
610
        EnterFunction(10);
611
 
612
        /* route to the original destination, honouring the TOS bits */
        if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(tos), 0)) {
613
                IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
614
                             "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
615
                goto tx_error_icmp;
616
        }
617
 
618
        /* MTU checking: with DF set, an over-sized packet must be
           bounced back with ICMP "fragmentation needed" */
        mtu = rt->u.dst.pmtu;
619
        if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
620
                ip_rt_put(rt);
621
                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
622
                IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
623
                goto tx_error;
624
        }
625
 
626
        /* update checksum because skb might be defragmented */
627
        ip_send_check(iph);
628
 
629
        /* make sure there is room for the output device's link header */
        if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
630
                if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
631
                        ip_rt_put(rt);
632
                        IP_VS_ERR_RL("ip_vs_bypass_xmit(): no memory\n");
633
                        goto tx_error;
634
                }
635
        }
636
 
637
        /* drop old route, attach the freshly looked-up one */
638
        dst_release(skb->dst);
639
        skb->dst = &rt->u.dst;
640
 
641
#ifdef CONFIG_NETFILTER_DEBUG
642
        skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
643
#endif /* CONFIG_NETFILTER_DEBUG */
644
        /* mark so IPVS hooks skip this packet on re-traversal */
        skb->nfcache |= NFC_IPVS_PROPERTY;
645
        ip_send(skb);
646
 
647
        LeaveFunction(10);
648
        return NF_STOLEN;
649
 
650
  tx_error_icmp:
651
        dst_link_failure(skb);
652
  tx_error:
653
        kfree_skb(skb);
654
        return NF_STOLEN;
655
}
657
 
658
 
659
/*
660
 *      NULL transmitter (do nothing except return NF_ACCEPT)
661
 */
662
static int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
664
{
665
        /* let the packet continue normal netfilter traversal untouched */
        return NF_ACCEPT;
666
}
666
 
667
 
668
/*
669
 *      NAT transmitter (only for outside-to-inside nat forwarding)
670
 */
671
static int ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
672
{
673
        struct rtable *rt;              /* Route to the other host */
674
        struct iphdr  *iph;
675
        union ip_vs_tphdr h;
676
        int ihl;
677
        unsigned short size;
678
        int mtu;
679
 
680
        EnterFunction(10);
681
 
682
        /*
683
         * If it has ip_vs_app helper, the helper may change the payload,
684
         * so it needs full checksum checking and checksum calculation.
685
         * If not, only the header (such as IP address and port number)
686
         * will be changed, so it is fast to do incremental checksum update,
687
         * and let the destination host  do final checksum checking.
688
         */
689
 
690
        if (cp->app && skb_is_nonlinear(skb)
691
            && skb_linearize(skb, GFP_ATOMIC) != 0)
692
                return NF_DROP;
693
 
694
        iph = skb->nh.iph;
695
        ihl = iph->ihl << 2;
696
        h.raw = (char*) iph + ihl;
697
        size = ntohs(iph->tot_len) - ihl;
698
 
699
        /* do TCP/UDP checksum checking if it has application helper */
700
        if (cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) {
701
                switch (skb->ip_summed) {
702
                case CHECKSUM_NONE:
703
                        skb->csum = csum_partial(h.raw, size, 0);
704
 
705
                case CHECKSUM_HW:
706
                        if (csum_tcpudp_magic(iph->saddr, iph->daddr, size,
707
                                              iph->protocol, skb->csum)) {
708
                                IP_VS_DBG_RL("Incoming failed %s checksum "
709
                                             "from %d.%d.%d.%d (size=%d)!\n",
710
                                             ip_vs_proto_name(iph->protocol),
711
                                             NIPQUAD(iph->saddr),
712
                                             size);
713
                                goto tx_error;
714
                        }
715
                        break;
716
                default:
717
                        /* CHECKSUM_UNNECESSARY */
718
                        break;
719
                }
720
        }
721
 
722
        /*
723
         *  Check if it is no_cport connection ...
724
         */
725
        if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
726
                if (ip_vs_conn_unhash(cp)) {
727
                        spin_lock(&cp->lock);
728
                        if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
729
                                atomic_dec(&ip_vs_conn_no_cport_cnt);
730
                                cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
731
                                cp->cport = h.portp[0];
732
                                IP_VS_DBG(10, "filled cport=%d\n", ntohs(cp->dport));
733
                        }
734
                        spin_unlock(&cp->lock);
735
 
736
                        /* hash on new dport */
737
                        ip_vs_conn_hash(cp);
738
                }
739
        }
740
 
741
        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
742
                goto tx_error_icmp;
743
 
744
        /* MTU checking */
745
        mtu = rt->u.dst.pmtu;
746
        if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
747
                ip_rt_put(rt);
748
                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
749
                IP_VS_DBG_RL("ip_vs_nat_xmit(): frag needed\n");
750
                goto tx_error;
751
        }
752
 
753
        /* drop old route */
754
        dst_release(skb->dst);
755
        skb->dst = &rt->u.dst;
756
 
757
        /* copy-on-write the packet before mangling it */
758
        if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len, &iph, &h.raw))
759
                return NF_DROP;
760
 
761
        /* mangle the packet */
762
        iph->daddr = cp->daddr;
763
        h.portp[1] = cp->dport;
764
 
765
        /*
766
         *      Attempt ip_vs_app call.
767
         *      will fix ip_vs_conn and iph ack_seq stuff
768
         */
769
        if (ip_vs_app_pkt_in(cp, skb) != 0) {
770
                /* skb data has probably changed, update pointers */
771
                iph = skb->nh.iph;
772
                h.raw = (char*) iph + ihl;
773
                size = skb->len - ihl;
774
        }
775
 
776
        /*
777
         *      Adjust TCP/UDP checksums
778
         */
779
        if (!cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) {
780
                /* Only port and addr are changed, do fast csum update */
781
                ip_vs_fast_check_update(&h, cp->vaddr, cp->daddr,
782
                                        cp->vport, cp->dport, iph->protocol);
783
                if (skb->ip_summed == CHECKSUM_HW)
784
                        skb->ip_summed = CHECKSUM_NONE;
785
        } else {
786
                /* full checksum calculation */
787
                switch (iph->protocol) {
788
                case IPPROTO_TCP:
789
                        h.th->check = 0;
790
                        h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
791
                                                        size, iph->protocol,
792
                                                        csum_partial(h.raw, size, 0));
793
                        break;
794
                case IPPROTO_UDP:
795
                        h.uh->check = 0;
796
                        h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
797
                                                        size, iph->protocol,
798
                                                        csum_partial(h.raw, size, 0));
799
                        if (h.uh->check == 0)
800
                                h.uh->check = 0xFFFF;
801
                        break;
802
                }
803
                skb->ip_summed = CHECKSUM_UNNECESSARY;
804
        }
805
        ip_send_check(iph);
806
 
807
        IP_VS_DBG(10, "NAT to %u.%u.%u.%u:%d\n",
808
                  NIPQUAD(iph->daddr), ntohs(h.portp[1]));
809
 
810
        /* FIXME: when application helper enlarges the packet and the length
811
           is larger than the MTU of outgoing device, there will be still
812
           MTU problem. */
813
 
814
#ifdef CONFIG_NETFILTER_DEBUG
815
        skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
816
#endif /* CONFIG_NETFILTER_DEBUG */
817
        skb->nfcache |= NFC_IPVS_PROPERTY;
818
        ip_send(skb);
819
 
820
        LeaveFunction(10);
821
        return NF_STOLEN;
822
 
823
  tx_error_icmp:
824
        dst_link_failure(skb);
825
  tx_error:
826
        kfree_skb(skb);
827
        return NF_STOLEN;
828
}
829
 
830
 
831
/*
832
 *   IP Tunneling transmitter
833
 *
834
 *   This function encapsulates the packet in a new IP packet, its
835
 *   destination will be set to cp->daddr. Most code of this function
836
 *   is taken from ipip.c.
837
 *
838
 *   It is used in VS/TUN cluster. The load balancer selects a real
839
 *   server from a cluster based on a scheduling algorithm,
840
 *   encapsulates the request packet and forwards it to the selected
841
 *   server. For example, all real servers are configured with
842
 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
843
 *   the encapsulated packet, it will decapsulate the packet, process
844
 *   the request and return the response packets directly to the client
845
 *   without passing the load balancer. This can greatly increase the
846
 *   scalability of virtual server.
847
 */
848
static int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
        /* Returns NF_STOLEN after either transmitting or freeing the skb,
           or NF_DROP when headroom reallocation fails (route released). */
        struct rtable *rt;                      /* Route to the other host */
        struct net_device *tdev;                /* Device to other host */
        struct iphdr  *old_iph = skb->nh.iph;
        u8     tos = old_iph->tos;
        u16    df = old_iph->frag_off;          /* NOTE(review): copies all of
                                                   frag_off (offset bits too),
                                                   not only IP_DF -- confirm
                                                   this is intended */
        struct iphdr  *iph;                     /* Our new IP header */
        int    max_headroom;                    /* The extra header space needed */
        int    mtu;

        EnterFunction(10);

        /* Only IPv4 packets can be IPIP-encapsulated here. */
        if (skb->protocol != __constant_htons(ETH_P_IP)) {
                IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
                             "ETH_P_IP: %d, skb protocol: %d\n",
                             __constant_htons(ETH_P_IP), skb->protocol);
                goto tx_error;
        }

        /* Route to the real server; on failure report link failure
           back to the sender via ICMP. */
        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
                goto tx_error_icmp;

        tdev = rt->u.dst.dev;

        /* Effective MTU for the inner packet: path MTU minus the outer
           IP header.  68 is the minimum IPv4 MTU (RFC 791). */
        mtu = rt->u.dst.pmtu - sizeof(struct iphdr);
        if (mtu < 68) {
                ip_rt_put(rt);
                IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
                goto tx_error;
        }
        /* Propagate the smaller tunnel MTU to the incoming route. */
        if (skb->dst && mtu < skb->dst->pmtu)
                skb->dst->pmtu = mtu;

        df |= (old_iph->frag_off&__constant_htons(IP_DF));

        /* DF-marked packet larger than the tunnel MTU: bounce an ICMP
           "fragmentation needed" back to the sender and give up. */
        if ((old_iph->frag_off&__constant_htons(IP_DF))
            && mtu < ntohs(old_iph->tot_len)) {
                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
                ip_rt_put(rt);
                IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
                goto tx_error;
        }

        /* update checksum because skb might be defragmented */
        ip_send_check(old_iph);

        /*
         * Okay, now see if we can stuff it in the buffer as-is.
         */
        max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr));

        /* Reallocate when headroom is short or the data is shared;
           the old skb is freed and replaced by the private copy. */
        if (skb_headroom(skb) < max_headroom
            || skb_cloned(skb) || skb_shared(skb)) {
                struct sk_buff *new_skb =
                        skb_realloc_headroom(skb, max_headroom);
                if (!new_skb) {
                        ip_rt_put(rt);
                        IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
                        return NF_DROP;
                }
                kfree_skb(skb);
                skb = new_skb;
                old_iph = skb->nh.iph;
        }

        /* The old IP header becomes the transport header of the
           encapsulated packet; make room for the new outer header. */
        skb->h.raw = skb->nh.raw;
        skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

        /* drop old route, attach the route to the real server */
        dst_release(skb->dst);
        skb->dst = &rt->u.dst;

        /*
         *      Push down and install the IPIP header.
         */
        iph                     =       skb->nh.iph;
        iph->version            =       4;
        iph->ihl                =       sizeof(struct iphdr)>>2;
        iph->frag_off           =       df;
        iph->protocol           =       IPPROTO_IPIP;
        iph->tos                =       tos;
        iph->daddr              =       rt->rt_dst;
        iph->saddr              =       rt->rt_src;
        iph->ttl                =       old_iph->ttl;
        iph->tot_len            =       htons(skb->len);
        ip_select_ident(iph, &rt->u.dst, NULL);
        ip_send_check(iph);

        skb->ip_summed = CHECKSUM_NONE;
#ifdef CONFIG_NETFILTER_DEBUG
        skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
        skb->nfcache |= NFC_IPVS_PROPERTY;
        ip_send(skb);

        LeaveFunction(10);

        return NF_STOLEN;

  tx_error_icmp:
        dst_link_failure(skb);
  tx_error:
        kfree_skb(skb);
        return NF_STOLEN;
}
955
 
956
 
957
/*
958
 *      Direct Routing transmitter
959
 */
960
static int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
961
{
962
        struct rtable *rt;                      /* Route to the other host */
963
        struct iphdr  *iph = skb->nh.iph;
964
        int    mtu;
965
 
966
        EnterFunction(10);
967
 
968
        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
969
                goto tx_error_icmp;
970
 
971
        /* MTU checking */
972
        mtu = rt->u.dst.pmtu;
973
        if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
974
                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
975
                ip_rt_put(rt);
976
                IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
977
                goto tx_error;
978
        }
979
 
980
        /* update checksum because skb might be defragmented */
981
        ip_send_check(iph);
982
 
983
        if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
984
                if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
985
                        ip_rt_put(rt);
986
                        IP_VS_ERR_RL("ip_vs_dr_xmit(): no memory\n");
987
                        goto tx_error;
988
                }
989
        }
990
 
991
        /* drop old route */
992
        dst_release(skb->dst);
993
        skb->dst = &rt->u.dst;
994
 
995
#ifdef CONFIG_NETFILTER_DEBUG
996
        skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
997
#endif /* CONFIG_NETFILTER_DEBUG */
998
        skb->nfcache |= NFC_IPVS_PROPERTY;
999
        ip_send(skb);
1000
 
1001
#if 0000
1002
        NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
1003
                do_ip_send);
1004
#endif
1005
        LeaveFunction(10);
1006
        return NF_STOLEN;
1007
 
1008
  tx_error_icmp:
1009
        dst_link_failure(skb);
1010
  tx_error:
1011
        kfree_skb(skb);
1012
        return NF_STOLEN;
1013
}
1014
 
1015
 
1016
/*
1017
 *  Bind a connection entry with the corresponding packet_xmit.
1018
 *  Called by ip_vs_conn_new.
1019
 */
1020
static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
1021
{
1022
        switch (IP_VS_FWD_METHOD(cp)) {
1023
        case IP_VS_CONN_F_MASQ:
1024
                cp->packet_xmit = ip_vs_nat_xmit;
1025
                break;
1026
 
1027
        case IP_VS_CONN_F_TUNNEL:
1028
                cp->packet_xmit = ip_vs_tunnel_xmit;
1029
                break;
1030
 
1031
        case IP_VS_CONN_F_DROUTE:
1032
                cp->packet_xmit = ip_vs_dr_xmit;
1033
                break;
1034
 
1035
        case IP_VS_CONN_F_LOCALNODE:
1036
                cp->packet_xmit = ip_vs_null_xmit;
1037
                break;
1038
 
1039
        case IP_VS_CONN_F_BYPASS:
1040
                cp->packet_xmit = ip_vs_bypass_xmit;
1041
                break;
1042
        }
1043
}
1044
 
1045
 
1046
/*
1047
 *  Bind a connection entry with a virtual service destination
1048
 *  Called just after a new connection entry is created.
1049
 */
1050
static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
        /* if dest is NULL, then return directly (nothing to bind) */
        if (!dest)
                return;

        /* Increase the refcnt counter of the dest: the connection now
           holds a reference that ip_vs_unbind_dest() will drop. */
        atomic_inc(&dest->refcnt);

        /* Bind with the destination: merge the destination's per-connection
           flags (forwarding method etc.) into the connection flags and
           record the destination pointer. */
        cp->flags |= atomic_read(&dest->conn_flags);
        cp->dest = dest;

        IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
                  "d:%u.%u.%u.%u:%d fwd:%c s:%s flg:%X cnt:%d destcnt:%d\n",
                  ip_vs_proto_name(cp->protocol),
                  NIPQUAD(cp->caddr), ntohs(cp->cport),
                  NIPQUAD(cp->vaddr), ntohs(cp->vport),
                  NIPQUAD(cp->daddr), ntohs(cp->dport),
                  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
                  cp->flags, atomic_read(&cp->refcnt),
                  atomic_read(&dest->refcnt));
}
1074
 
1075
 
1076
/*
1077
 *  Unbind a connection entry with its VS destination
1078
 *  Called by the ip_vs_conn_expire function.
1079
 */
1080
static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
1081
{
1082
        struct ip_vs_dest *dest = cp->dest;
1083
 
1084
        /* if dest is NULL, then return directly */
1085
        if (!dest)
1086
                return;
1087
 
1088
        IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d "
1089
                  "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d fwd:%c "
1090
                  "s:%s flg:%X cnt:%d destcnt:%d",
1091
                  ip_vs_proto_name(cp->protocol),
1092
                  NIPQUAD(cp->caddr), ntohs(cp->cport),
1093
                  NIPQUAD(cp->vaddr), ntohs(cp->vport),
1094
                  NIPQUAD(cp->daddr), ntohs(cp->dport),
1095
                  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
1096
                  cp->flags, atomic_read(&cp->refcnt),
1097
                  atomic_read(&dest->refcnt));
1098
 
1099
        /*
1100
         * Decrease the inactconns or activeconns counter
1101
         * if it is not a connection template ((cp->cport!=0)
1102
         *   || (cp->flags & IP_VS_CONN_F_NO_CPORT)).
1103
         */
1104
        if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
1105
                if (cp->flags & IP_VS_CONN_F_INACTIVE) {
1106
                        atomic_dec(&dest->inactconns);
1107
                } else {
1108
                        atomic_dec(&dest->activeconns);
1109
                }
1110
        }
1111
 
1112
        /*
1113
         * Simply decrease the refcnt of the dest, because the
1114
         * dest will be either in service's destination list
1115
         * or in the trash.
1116
         */
1117
        atomic_dec(&dest->refcnt);
1118
}
1119
 
1120
 
1121
/*
1122
 *  Checking if the destination of a connection template is available.
1123
 *  If available, return 1, otherwise invalidate this connection
1124
 *  template and return 0.
1125
 */
1126
int ip_vs_check_template(struct ip_vs_conn *ct)
{
        struct ip_vs_dest *dest = ct->dest;

        /*
         * Checking the dest server status: unbound templates or
         * templates pointing at an unavailable server are invalid.
         */
        if ((dest == NULL) ||
            !(dest->flags & IP_VS_DEST_F_AVAILABLE)) {
                IP_VS_DBG(9, "check_template: dest not available for "
                          "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
                          "-> d:%u.%u.%u.%u:%d\n",
                          ip_vs_proto_name(ct->protocol),
                          NIPQUAD(ct->caddr), ntohs(ct->cport),
                          NIPQUAD(ct->vaddr), ntohs(ct->vport),
                          NIPQUAD(ct->daddr), ntohs(ct->dport));

                /*
                 * Invalidate the connection template: re-hash it under
                 * an impossible port tuple (dport/vport 65535, cport 0)
                 * so that no further lookup can match it.
                 */
                if (ct->cport) {
                        if (ip_vs_conn_unhash(ct)) {
                                ct->dport = 65535;
                                ct->vport = 65535;
                                ct->cport = 0;
                                ip_vs_conn_hash(ct);
                        }
                }

                /*
                 * Simply decrease the refcnt of the template,
                 * don't restart its timer.
                 */
                atomic_dec(&ct->refcnt);
                return 0;
        }
        return 1;
}
1164
 
1165
 
1166
static inline void
1167
ip_vs_timeout_attach(struct ip_vs_conn *cp, struct ip_vs_timeout_table *vstim)
1168
{
1169
        atomic_inc(&vstim->refcnt);
1170
        cp->timeout_table = vstim;
1171
}
1172
 
1173
static inline void ip_vs_timeout_detach(struct ip_vs_conn *cp)
1174
{
1175
        struct ip_vs_timeout_table *vstim = cp->timeout_table;
1176
 
1177
        if (!vstim)
1178
                return;
1179
        cp->timeout_table = NULL;
1180
        atomic_dec(&vstim->refcnt);
1181
}
1182
 
1183
 
1184
/*
 * Timer callback (cp->timer) that tries to release an expired
 * connection entry.  If the entry is still referenced or still
 * controls other connections, expiration is postponed for a
 * TIME_WAIT period instead.
 */
static void ip_vs_conn_expire(unsigned long data)
{
        struct ip_vs_conn *cp = (struct ip_vs_conn *)data;

        /* Timeout to use if we end up delaying expiration again. */
        if (cp->timeout_table)
                cp->timeout = cp->timeout_table->timeout[IP_VS_S_TIME_WAIT];
        else
                cp->timeout = vs_timeout_table.timeout[IP_VS_S_TIME_WAIT];

        /*
         *      hey, I'm using it
         */
        atomic_inc(&cp->refcnt);

        /*
         *      do I control anybody?  (controlled connections still
         *      need this template/master entry)
         */
        if (atomic_read(&cp->n_control))
                goto expire_later;

        /*
         *      unhash it if it is hashed in the conn table
         */
        if (!ip_vs_conn_unhash(cp))
                goto expire_later;

        /*
         *      refcnt==1 implies I'm the only one referrer, so it is
         *      now safe to tear the entry down completely.
         */
        if (likely(atomic_read(&cp->refcnt) == 1)) {
                /* make sure that there is no timer on it now */
                if (timer_pending(&cp->timer))
                        del_timer(&cp->timer);

                /* does anybody control me? drop that link first */
                if (cp->control)
                        ip_vs_control_del(cp);

                /* release destination, application helper and timeout
                   table bindings before freeing the entry */
                ip_vs_unbind_dest(cp);
                ip_vs_unbind_app(cp);
                ip_vs_timeout_detach(cp);
                if (cp->flags & IP_VS_CONN_F_NO_CPORT)
                        atomic_dec(&ip_vs_conn_no_cport_cnt);
                atomic_dec(&ip_vs_conn_count);

                kmem_cache_free(ip_vs_conn_cachep, cp);
                return;
        }

        /* someone else still holds a reference:
           hash it back to the table and try again later */
        ip_vs_conn_hash(cp);

  expire_later:
        IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
                  atomic_read(&cp->refcnt)-1,
                  atomic_read(&cp->n_control));

        /* drop our reference; presumably ip_vs_conn_put also re-arms
           the timer with cp->timeout -- its body is not visible here,
           TODO confirm */
        ip_vs_conn_put(cp);
}
1243
 
1244
 
1245
/*
 * Force a connection entry to expire immediately: zero its timeout,
 * fire its expire timer at once and drop the caller's reference.
 * The caller must hold a reference on cp when calling this.
 */
void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
        cp->timeout = 0;
        mod_timer(&cp->timer, jiffies);
        /* release the reference held by the caller */
        __ip_vs_conn_put(cp);
}
1251
 
1252
/*
1253
 *  Create a new connection entry and hash it into the ip_vs_conn_tab.
1254
 */
1255
struct ip_vs_conn *
ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
               __u32 daddr, __u16 dport, unsigned flags,
               struct ip_vs_dest *dest)
{
        /* Allocate and fully initialize a connection entry for the
           client/virtual/destination tuple, bind it to dest (may be
           NULL) and hash it into ip_vs_conn_tab.  Returns the new
           entry, with refcnt 1 held by the caller, or NULL on OOM. */
        struct ip_vs_conn *cp;

        /* GFP_ATOMIC: presumably called from packet-processing
           context where sleeping is not allowed -- see callers */
        cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
        if (cp == NULL) {
                IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
                return NULL;
        }

        memset(cp, 0, sizeof(*cp));
        INIT_LIST_HEAD(&cp->c_list);
        /* expire timer; not armed here */
        init_timer(&cp->timer);
        cp->timer.data     = (unsigned long)cp;
        cp->timer.function = ip_vs_conn_expire;
        ip_vs_timeout_attach(cp, ip_vs_timeout_table);
        cp->protocol       = proto;
        cp->caddr          = caddr;
        cp->cport          = cport;
        cp->vaddr          = vaddr;
        cp->vport          = vport;
        cp->daddr          = daddr;
        cp->dport          = dport;
        cp->flags          = flags;
        cp->app_data       = NULL;
        cp->control        = NULL;
        cp->lock           = SPIN_LOCK_UNLOCKED;

        atomic_set(&cp->n_control, 0);
        atomic_set(&cp->in_pkts, 0);

        atomic_inc(&ip_vs_conn_count);
        if (flags & IP_VS_CONN_F_NO_CPORT)
                atomic_inc(&ip_vs_conn_no_cport_cnt);

        /* Bind its application helper (only for VS/NAT) if any */
        ip_vs_bind_app(cp);

        /* Bind the connection with a destination server */
        ip_vs_bind_dest(cp, dest);

        /* Set its state and timeout */
        vs_set_state_timeout(cp, IP_VS_S_NONE);

        /* Bind its packet transmitter */
        ip_vs_bind_xmit(cp);

        /*
         * Set the entry is referenced by the current thread before hashing
         * it in the table, so that other thread run ip_vs_random_dropentry
         * but cannot drop this entry.
         */
        atomic_set(&cp->refcnt, 1);

        /* Hash it in the ip_vs_conn_tab finally */
        ip_vs_conn_hash(cp);

        return cp;
}
1317
 
1318
 
1319
/*
1320
 *      /proc/net/ip_vs_conn entries
1321
 */
1322
/*
 * /proc/net/ip_vs_conn read handler (2.4 procfs get_info protocol):
 * fill `buffer` with the lines that fall inside [offset, offset+length),
 * set *start to where the wanted data begins, and return its length.
 * Every line is padded to exactly 128 bytes so the offset arithmetic
 * below can count records without re-formatting skipped ones.
 */
static int
ip_vs_conn_getinfo(char *buffer, char **start, off_t offset, int length)
{
        off_t pos=0;            /* byte position in the virtual file */
        int idx, len=0;         /* len = bytes actually written */
        char temp[70];
        struct ip_vs_conn *cp;
        struct list_head *l, *e;

        /* header line occupies the first 128-byte record */
        pos = 128;
        if (pos > offset) {
                len += sprintf(buffer+len, "%-127s\n",
                               "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires");
        }

        for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
                /*
                 *      Lock is actually only need in next loop
                 *      we are called from uspace: must stop bh.
                 */
                ct_read_lock_bh(idx);

                l = &ip_vs_conn_tab[idx];
                for (e=l->next; e!=l; e=e->next) {
                        cp = list_entry(e, struct ip_vs_conn, c_list);
                        pos += 128;
                        /* record entirely before the requested window */
                        if (pos <= offset)
                                continue;
                        sprintf(temp,
                                "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu",
                                ip_vs_proto_name(cp->protocol),
                                ntohl(cp->caddr), ntohs(cp->cport),
                                ntohl(cp->vaddr), ntohs(cp->vport),
                                ntohl(cp->daddr), ntohs(cp->dport),
                                ip_vs_state_name(cp->state),
                                (cp->timer.expires-jiffies)/HZ);
                        len += sprintf(buffer+len, "%-127s\n", temp);
                        /* window filled: stop early */
                        if (pos >= offset+length) {
                                ct_read_unlock_bh(idx);
                                goto done;
                        }
                }
                ct_read_unlock_bh(idx);
        }

  done:
        *start = buffer+len-(pos-offset);       /* Start of wanted data */
        len = pos-offset;
        if (len > length)
                len = length;
        if (len < 0)
                len = 0;
        return len;
}
1376
 
1377
 
1378
/*
1379
 *      Randomly drop connection entries before running out of memory
1380
 */
1381
/*
 * Decide whether a given ESTABLISHED/UDP connection should be dropped
 * under memory pressure.  Returns 1 to drop, 0 to keep.  The more
 * packets a connection has seen (in_pkts), the more often it survives:
 * entries with i incoming packets are dropped once per todrop_rate[i]
 * eligible visits.
 */
static inline int todrop_entry(struct ip_vs_conn *cp)
{
        /*
         * The drop rate array needs tuning for real environments.
         * Called from timer bh only => no locking needed for the
         * static (mutable) counters below.
         */
        static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
        static char todrop_counter[9] = {0};
        int i;

        /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
           This will leave enough time for normal connection to get
           through. */
        if (cp->timeout+jiffies-cp->timer.expires < 60*HZ)
                return 0;

        /* Don't drop the entry if its number of incoming packets is not
           located in [0, 8] */
        i = atomic_read(&cp->in_pkts);
        if (i > 8 || i < 0) return 0;

        /* rate 0 means "never drop" for this packet count */
        if (!todrop_rate[i]) return 0;
        /* note: the decrement is a deliberate side effect -- each call
           counts down toward the next drop for this bucket */
        if (--todrop_counter[i] > 0) return 0;

        todrop_counter[i] = todrop_rate[i];
        return 1;
}
1408
 
1409
 
1410
/*
 * Randomly drop connection entries before running out of memory.
 * Scans a random 1/32 of the hash table per invocation and expires
 * young (SYN_RECV/SYNACK) entries plus established/UDP entries chosen
 * by todrop_entry(); a dropped entry's controlling template is expired
 * with it.  Runs in timer bh context (see todrop_entry's comment).
 */
void ip_vs_random_dropentry(void)
{
        int idx;
        struct ip_vs_conn *cp;
        struct list_head *l,*e;
        struct ip_vs_conn *ct;

        /*
         * Randomly scan 1/32 of the whole table every second
         */
        for (idx=0; idx<(IP_VS_CONN_TAB_SIZE>>5); idx++) {
                unsigned hash = net_random()&IP_VS_CONN_TAB_MASK;

                /*
                 *  Lock is actually needed in this loop.
                 */
                ct_write_lock(hash);

                l = &ip_vs_conn_tab[hash];
                for (e=l->next; e!=l; e=e->next) {
                        cp = list_entry(e, struct ip_vs_conn, c_list);
                        if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
                                /* connection template */
                                continue;
                        switch(cp->state) {
                        case IP_VS_S_SYN_RECV:
                        case IP_VS_S_SYNACK:
                                /* half-open: always droppable */
                                break;

                        case IP_VS_S_ESTABLISHED:
                        case IP_VS_S_UDP:
                                /* rate-limited drop decision */
                                if (todrop_entry(cp))
                                        break;
                                continue;

                        default:
                                continue;
                        }

                        /*
                         * Drop the entry, and drop its ct if not referenced.
                         * Take a reference first so the entry cannot go away
                         * while the bucket lock is released around the
                         * expire calls (expire_now drops that reference).
                         * NOTE(review): `e` is advanced after the lock is
                         * re-acquired, although the list may have changed
                         * while unlocked -- verify this is safe in this
                         * bh/timer context.
                         */
                        atomic_inc(&cp->refcnt);
                        ct_write_unlock(hash);

                        if ((ct = cp->control))
                                atomic_inc(&ct->refcnt);
                        IP_VS_DBG(4, "del connection\n");
                        ip_vs_conn_expire_now(cp);
                        if (ct) {
                                IP_VS_DBG(4, "del conn template\n");
                                ip_vs_conn_expire_now(ct);
                        }
                        ct_write_lock(hash);
                }
                ct_write_unlock(hash);
        }
}
1468
 
1469
 
1470
/*
1471
 *      Flush all the connection entries in the ip_vs_conn_tab
1472
 */
1473
/*
 * Flush all the connection entries in the ip_vs_conn_tab: force every
 * entry (and its controlling template) to expire, then loop until the
 * global connection count reaches zero.  Called from process context
 * (it may schedule()), e.g. by ip_vs_conn_cleanup().
 */
static void ip_vs_conn_flush(void)
{
        int idx;
        struct ip_vs_conn *cp;
        struct list_head *l,*e;
        struct ip_vs_conn *ct;

  flush_again:
        for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
                /*
                 *  Lock is actually needed in this loop.
                 */
                ct_write_lock_bh(idx);

                l = &ip_vs_conn_tab[idx];
                for (e=l->next; e!=l; e=e->next) {
                        cp = list_entry(e, struct ip_vs_conn, c_list);
                        /* hold a reference so the entry survives while the
                           bucket lock is released around the expire calls
                           (expire_now drops this reference) */
                        atomic_inc(&cp->refcnt);
                        ct_write_unlock(idx);

                        if ((ct = cp->control))
                                atomic_inc(&ct->refcnt);
                        IP_VS_DBG(4, "del connection\n");
                        ip_vs_conn_expire_now(cp);
                        if (ct) {
                                IP_VS_DBG(4, "del conn template\n");
                                ip_vs_conn_expire_now(ct);
                        }
                        ct_write_lock(idx);
                }
                ct_write_unlock_bh(idx);
        }

        /* the counter may be not NULL, because maybe some conn entries
           are run by slow timer handler or unhashed but still referred */
        if (atomic_read(&ip_vs_conn_count) != 0) {
                schedule();
                goto flush_again;
        }
}
1513
 
1514
 
1515
/*
 * Initialize the IPVS connection subsystem: allocate the connection
 * hash table and its lock array, create the slab cache for entries,
 * register /proc/net/ip_vs_conn and seed the hash function.
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
int ip_vs_conn_init(void)
{
        int idx;

        /*
         * Allocate the connection hash table and initialize its list heads
         */
        ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
        if (!ip_vs_conn_tab)
                return -ENOMEM;

        IP_VS_INFO("Connection hash table configured "
                   "(size=%d, memory=%ldKbytes)\n",
                   IP_VS_CONN_TAB_SIZE,
                   (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
        IP_VS_DBG(0, "Each connection entry needs %d bytes at least\n",
                  sizeof(struct ip_vs_conn));

        for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
                INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
        }

        /* per-bucket-group reader/writer locks */
        for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
                __ip_vs_conntbl_lock_array[idx].l = RW_LOCK_UNLOCKED;
        }

        /* Allocate ip_vs_conn slab cache */
        ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
                                              sizeof(struct ip_vs_conn), 0,
                                              SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!ip_vs_conn_cachep) {
                /* undo the table allocation on failure */
                vfree(ip_vs_conn_tab);
                return -ENOMEM;
        }

        proc_net_create("ip_vs_conn", 0, ip_vs_conn_getinfo);

        /* calculate the random value for connection hash */
        get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));

        return 0;
}
1557
 
1558
/*
 * Tear down the connection subsystem (counterpart of ip_vs_conn_init):
 * expire and free every connection entry, then release the slab cache,
 * the /proc entry and the hash table.
 */
void ip_vs_conn_cleanup(void)
{
        /* flush all the connection entries first; ip_vs_conn_flush()
           only returns once ip_vs_conn_count reaches zero, so the
           cache is empty when destroyed below */
        ip_vs_conn_flush();

        /* Release the empty cache */
        kmem_cache_destroy(ip_vs_conn_cachep);
        proc_net_remove("ip_vs_conn");
        vfree(ip_vs_conn_tab);
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.