OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

[/] [or1k/] [trunk/] [linux/] [linux-2.4/] [net/] [ipv4/] [ipvs/] [ip_vs_lblc.c] - Blame information for rev 1765

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 1275 phoenix
/*
2
 * IPVS:        Locality-Based Least-Connection scheduling module
3
 *
4
 * Version:     $Id: ip_vs_lblc.c,v 1.1.1.1 2004-04-15 01:14:02 phoenix Exp $
5
 *
6
 * Authors:     Wensong Zhang <wensong@gnuchina.org>
7
 *
8
 *              This program is free software; you can redistribute it and/or
9
 *              modify it under the terms of the GNU General Public License
10
 *              as published by the Free Software Foundation; either version
11
 *              2 of the License, or (at your option) any later version.
12
 *
13
 * Changes:
14
 *     Martin Hamilton         :    fixed the terrible locking bugs
15
 *                                   *lock(tbl->lock) ==> *lock(&tbl->lock)
16
 *     Wensong Zhang           :    fixed the uninitialized tbl->lock bug
17
 *     Wensong Zhang           :    added doing full expiration check to
18
 *                                   collect stale entries of 24+ hours when
19
 *                                   no partial expire check in a half hour
20
 *     Julian Anastasov        :    replaced del_timer call with del_timer_sync
21
 *                                   to avoid the possible race between timer
22
 *                                   handler and del_timer thread in SMP
23
 *
24
 */
25
 
26
/*
27
 * The lblc algorithm is as follows (pseudo code):
28
 *
29
 *       if cachenode[dest_ip] is null then
30
 *               n, cachenode[dest_ip] <- {weighted least-conn node};
31
 *       else
32
 *               n <- cachenode[dest_ip];
33
 *               if (n is dead) OR
34
 *                  (n.conns>n.weight AND
35
 *                   there is a node m with m.conns<m.weight/2) then
36
 *                 n, cachenode[dest_ip] <- {weighted least-conn node};
37
 *
38
 *       return n;
39
 *
40
 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
41
 * me to write this module.
42
 */
43
 
44
#include <linux/module.h>
45
#include <linux/kernel.h>
46
 
47
/* for sysctl */
48
#include <linux/fs.h>
49
#include <linux/sysctl.h>
50
 
51
#include <net/ip_vs.h>
52
 
53
 
54
/*
55
 *    It is for garbage collection of stale IPVS lblc entries,
56
 *    when the table is full.
57
 */
58
/* Period of the periodic garbage-collection timer: 60 seconds of jiffies. */
#define CHECK_EXPIRE_INTERVAL   (60 * HZ)
/* Idle time after which the partial check reclaims an entry: 6 minutes. */
#define ENTRY_TIMEOUT           (6 * 60 * HZ)
60
 
61
/*
62
 *    It is for full expiration check.
63
 *    When there is no partial expiration check (garbage collection)
64
 *    in a half hour, do a full expiration check to collect stale
65
 *    entries that haven't been touched for a day.
66
 */
67
#define COUNT_FOR_FULL_EXPIRATION   30
68
static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
69
 
70
 
71
/*
72
 *     for IPVS lblc entry hash table
73
 */
74
/* Fall back to a 2^10-bucket table when the build does not configure one. */
#if !defined(CONFIG_IP_VS_LBLC_TAB_BITS)
#define CONFIG_IP_VS_LBLC_TAB_BITS      10
#endif
/* log2 of the bucket count */
#define IP_VS_LBLC_TAB_BITS     CONFIG_IP_VS_LBLC_TAB_BITS
/* number of hash buckets (always a power of two) */
#define IP_VS_LBLC_TAB_SIZE     (1 << IP_VS_LBLC_TAB_BITS)
/* mask that reduces a hash value to a bucket index */
#define IP_VS_LBLC_TAB_MASK     (IP_VS_LBLC_TAB_SIZE - 1)
80
 
81
 
82
/*
83
 *      IPVS lblc entry represents an association between destination
84
 *      IP address and its destination server
85
 */
86
struct ip_vs_lblc_entry {
87
        struct list_head        list;
88
        __u32                   addr;           /* destination IP address */
89
        struct ip_vs_dest       *dest;          /* real server (cache) */
90
        unsigned long           lastuse;        /* last used time */
91
};
92
 
93
 
94
/*
95
 *      IPVS lblc hash table
96
 */
97
struct ip_vs_lblc_table {
98
        rwlock_t                lock;           /* lock for this table */
99
        struct list_head        bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */
100
        atomic_t                entries;        /* number of entries */
101
        int                     max_size;       /* maximum size of entries */
102
        struct timer_list       periodic_timer; /* collect stale entries */
103
        int                     rover;          /* rover for expire check */
104
        int                     counter;        /* counter for no expire */
105
};
106
 
107
 
108
/*
109
 *      IPVS LBLC sysctl table
110
 */
111
struct ip_vs_lblc_sysctl_table {
112
        struct ctl_table_header *sysctl_header;
113
        ctl_table vs_vars[2];
114
        ctl_table vs_dir[2];
115
        ctl_table ipv4_dir[2];
116
        ctl_table root_dir[2];
117
};
118
 
119
 
120
static struct ip_vs_lblc_sysctl_table lblc_sysctl_table = {
121
        NULL,
122
        {{NET_IPV4_VS_LBLC_EXPIRE, "lblc_expiration",
123
          &sysctl_ip_vs_lblc_expiration,
124
          sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
125
         {0}},
126
        {{NET_IPV4_VS, "vs", NULL, 0, 0555, lblc_sysctl_table.vs_vars},
127
         {0}},
128
        {{NET_IPV4, "ipv4", NULL, 0, 0555, lblc_sysctl_table.vs_dir},
129
         {0}},
130
        {{CTL_NET, "net", NULL, 0, 0555, lblc_sysctl_table.ipv4_dir},
131
         {0}}
132
};
133
 
134
 
135
/*
136
 *      new/free an ip_vs_lblc_entry, which is a mapping of a destination
137
 *      IP address to a server.
138
 */
139
static inline struct ip_vs_lblc_entry *
140
ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest)
141
{
142
        struct ip_vs_lblc_entry *en;
143
 
144
        en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
145
        if (en == NULL) {
146
                IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
147
                return NULL;
148
        }
149
 
150
        INIT_LIST_HEAD(&en->list);
151
        en->addr = daddr;
152
 
153
        atomic_inc(&dest->refcnt);
154
        en->dest = dest;
155
 
156
        return en;
157
}
158
 
159
 
160
static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
161
{
162
        list_del(&en->list);
163
        /*
164
         * We don't kfree dest because it is refered either by its service
165
         * or the trash dest list.
166
         */
167
        atomic_dec(&en->dest->refcnt);
168
        kfree(en);
169
}
170
 
171
 
172
/*
173
 *      Returns hash value for IPVS LBLC entry
174
 */
175
/*
 *      Map a network-order address to a bucket index: convert to host
 *      order, multiply by 2654435761 and keep the low table-index bits.
 */
static inline unsigned ip_vs_lblc_hashkey(__u32 addr)
{
        unsigned long product = ntohl(addr) * 2654435761UL;

        return product & IP_VS_LBLC_TAB_MASK;
}
179
 
180
 
181
/*
182
 *      Hash an entry in the ip_vs_lblc_table.
183
 *      returns bool success.
184
 */
185
static int
186
ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
187
{
188
        unsigned hash;
189
 
190
        if (!list_empty(&en->list)) {
191
                IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
192
                          "called from %p\n", __builtin_return_address(0));
193
                return 0;
194
        }
195
 
196
        /*
197
         *      Hash by destination IP address
198
         */
199
        hash = ip_vs_lblc_hashkey(en->addr);
200
 
201
        write_lock(&tbl->lock);
202
        list_add(&en->list, &tbl->bucket[hash]);
203
        atomic_inc(&tbl->entries);
204
        write_unlock(&tbl->lock);
205
 
206
        return 1;
207
}
208
 
209
 
210
#if 0000
211
/*
212
 *      Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
213
 *      returns bool success.
214
 */
215
static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
216
                             struct ip_vs_lblc_entry *en)
217
{
218
        if (list_empty(&en->list)) {
219
                IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
220
                          "called from %p\n", __builtin_return_address(0));
221
                return 0;
222
        }
223
 
224
        /*
225
         * Remove it from the table
226
         */
227
        write_lock(&tbl->lock);
228
        list_del(&en->list);
229
        INIT_LIST_HEAD(&en->list);
230
        write_unlock(&tbl->lock);
231
 
232
        return 1;
233
}
234
#endif
235
 
236
 
237
/*
238
 *  Get ip_vs_lblc_entry associated with supplied parameters.
239
 */
240
static inline struct ip_vs_lblc_entry *
241
ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr)
242
{
243
        unsigned hash;
244
        struct ip_vs_lblc_entry *en;
245
        struct list_head *l,*e;
246
 
247
        hash = ip_vs_lblc_hashkey(addr);
248
        l = &tbl->bucket[hash];
249
 
250
        read_lock(&tbl->lock);
251
 
252
        for (e=l->next; e!=l; e=e->next) {
253
                en = list_entry(e, struct ip_vs_lblc_entry, list);
254
                if (en->addr == addr) {
255
                        /* HIT */
256
                        read_unlock(&tbl->lock);
257
                        return en;
258
                }
259
        }
260
 
261
        read_unlock(&tbl->lock);
262
 
263
        return NULL;
264
}
265
 
266
 
267
/*
268
 *      Flush all the entries of the specified table.
269
 */
270
static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
271
{
272
        int i;
273
        struct list_head *l;
274
        struct ip_vs_lblc_entry *en;
275
 
276
        for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
277
                write_lock(&tbl->lock);
278
                for (l=&tbl->bucket[i]; l->next!=l; ) {
279
                        en = list_entry(l->next,
280
                                        struct ip_vs_lblc_entry, list);
281
                        ip_vs_lblc_free(en);
282
                        atomic_dec(&tbl->entries);
283
                }
284
                write_unlock(&tbl->lock);
285
        }
286
}
287
 
288
 
289
static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
290
{
291
        unsigned long now = jiffies;
292
        int i, j;
293
        struct list_head *l, *e;
294
        struct ip_vs_lblc_entry *en;
295
 
296
        for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
297
                j = (j + 1) & IP_VS_LBLC_TAB_MASK;
298
                e = l = &tbl->bucket[j];
299
                write_lock(&tbl->lock);
300
                while (e->next != l) {
301
                        en = list_entry(e->next,
302
                                        struct ip_vs_lblc_entry, list);
303
                        if ((now - en->lastuse) <
304
                            sysctl_ip_vs_lblc_expiration) {
305
                                e = e->next;
306
                                continue;
307
                        }
308
                        ip_vs_lblc_free(en);
309
                        atomic_dec(&tbl->entries);
310
                }
311
                write_unlock(&tbl->lock);
312
        }
313
        tbl->rover = j;
314
}
315
 
316
 
317
/*
318
 *      Periodical timer handler for IPVS lblc table
319
 *      It is used to collect stale entries when the number of entries
320
 *      exceeds the maximum size of the table.
321
 *
322
 *      Fixme: we probably need more complicated algorithm to collect
323
 *             entries that have not been used for a long time even
324
 *             if the number of entries doesn't exceed the maximum size
325
 *             of the table.
326
 *      The full expiration check is for this purpose now.
327
 */
328
static void ip_vs_lblc_check_expire(unsigned long data)
329
{
330
        struct ip_vs_lblc_table *tbl;
331
        unsigned long now = jiffies;
332
        int goal;
333
        int i, j;
334
        struct list_head *l, *e;
335
        struct ip_vs_lblc_entry *en;
336
 
337
        tbl = (struct ip_vs_lblc_table *)data;
338
 
339
        if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
340
                /* do full expiration check */
341
                ip_vs_lblc_full_check(tbl);
342
                tbl->counter = 1;
343
                goto out;
344
        }
345
 
346
        if (atomic_read(&tbl->entries) <= tbl->max_size) {
347
                tbl->counter++;
348
                goto out;
349
        }
350
 
351
        goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
352
        if (goal > tbl->max_size/2)
353
                goal = tbl->max_size/2;
354
 
355
        for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
356
                j = (j + 1) & IP_VS_LBLC_TAB_MASK;
357
                e = l = &tbl->bucket[j];
358
                write_lock(&tbl->lock);
359
                while (e->next != l) {
360
                        en = list_entry(e->next,
361
                                        struct ip_vs_lblc_entry, list);
362
                        if ((now - en->lastuse) < ENTRY_TIMEOUT) {
363
                                e = e->next;
364
                                continue;
365
                        }
366
                        ip_vs_lblc_free(en);
367
                        atomic_dec(&tbl->entries);
368
                        goal--;
369
                }
370
                write_unlock(&tbl->lock);
371
                if (goal <= 0)
372
                        break;
373
        }
374
        tbl->rover = j;
375
 
376
  out:
377
        mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
378
}
379
 
380
 
381
static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
382
{
383
        int i;
384
        struct ip_vs_lblc_table *tbl;
385
 
386
        /*
387
         *    Allocate the ip_vs_lblc_table for this service
388
         */
389
        tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
390
        if (tbl == NULL) {
391
                IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
392
                return -ENOMEM;
393
        }
394
        svc->sched_data = tbl;
395
        IP_VS_DBG(6, "LBLC hash table (memory=%dbytes) allocated for "
396
                  "current service\n",
397
                  sizeof(struct ip_vs_lblc_table));
398
 
399
        /*
400
         *    Initialize the hash buckets
401
         */
402
        for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
403
                INIT_LIST_HEAD(&tbl->bucket[i]);
404
        }
405
        tbl->lock = RW_LOCK_UNLOCKED;
406
        tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
407
        tbl->rover = 0;
408
        tbl->counter = 1;
409
 
410
        /*
411
         *    Hook periodic timer for garbage collection
412
         */
413
        init_timer(&tbl->periodic_timer);
414
        tbl->periodic_timer.data = (unsigned long)tbl;
415
        tbl->periodic_timer.function = ip_vs_lblc_check_expire;
416
        tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
417
        add_timer(&tbl->periodic_timer);
418
 
419
        return 0;
420
}
421
 
422
 
423
static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
424
{
425
        struct ip_vs_lblc_table *tbl = svc->sched_data;
426
 
427
        /* remove periodic timer */
428
        del_timer_sync(&tbl->periodic_timer);
429
 
430
        /* got to clean up table entries here */
431
        ip_vs_lblc_flush(tbl);
432
 
433
        /* release the table itself */
434
        kfree(svc->sched_data);
435
        IP_VS_DBG(6, "LBLC hash table (memory=%dbytes) released\n",
436
                  sizeof(struct ip_vs_lblc_table));
437
 
438
        return 0;
439
}
440
 
441
 
442
/*
 *      Scheduler "update service" hook: nothing is refreshed here; it only
 *      reports success.
 */
static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
{
        /* intentionally empty */
        return 0;
}
446
 
447
 
448
static inline struct ip_vs_dest *
449
__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
450
{
451
        register struct list_head *l, *e;
452
        struct ip_vs_dest *dest, *least;
453
        int loh, doh;
454
 
455
        /*
456
         * We think the overhead of processing active connections is fifty
457
         * times higher than that of inactive connections in average. (This
458
         * fifty times might not be accurate, we will change it later.) We
459
         * use the following formula to estimate the overhead:
460
         *                dest->activeconns*50 + dest->inactconns
461
         * and the load:
462
         *                (dest overhead) / dest->weight
463
         *
464
         * Remember -- no floats in kernel mode!!!
465
         * The comparison of h1*w2 > h2*w1 is equivalent to that of
466
         *                h1/w1 > h2/w2
467
         * if every weight is larger than zero.
468
         *
469
         * The server with weight=0 is quiesced and will not receive any
470
         * new connection.
471
         */
472
 
473
        l = &svc->destinations;
474
        for (e=l->next; e!=l; e=e->next) {
475
                least = list_entry(e, struct ip_vs_dest, n_list);
476
                if (atomic_read(&least->weight) > 0) {
477
                        loh = atomic_read(&least->activeconns) * 50
478
                                + atomic_read(&least->inactconns);
479
                        goto nextstage;
480
                }
481
        }
482
        return NULL;
483
 
484
        /*
485
         *    Find the destination with the least load.
486
         */
487
  nextstage:
488
        for (e=e->next; e!=l; e=e->next) {
489
                dest = list_entry(e, struct ip_vs_dest, n_list);
490
                doh = atomic_read(&dest->activeconns) * 50
491
                        + atomic_read(&dest->inactconns);
492
                if (loh * atomic_read(&dest->weight) >
493
                    doh * atomic_read(&least->weight)) {
494
                        least = dest;
495
                        loh = doh;
496
                }
497
        }
498
 
499
        IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
500
                  "activeconns %d refcnt %d weight %d overhead %d\n",
501
                  NIPQUAD(least->addr), ntohs(least->port),
502
                  atomic_read(&least->activeconns),
503
                  atomic_read(&least->refcnt),
504
                  atomic_read(&least->weight), loh);
505
 
506
        return least;
507
}
508
 
509
 
510
/*
511
 *   If this destination server is overloaded and there is a less loaded
512
 *   server, then return true.
513
 */
514
static inline int
515
is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
516
{
517
        if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
518
                register struct list_head *l, *e;
519
                struct ip_vs_dest *d;
520
 
521
                l = &svc->destinations;
522
                for (e=l->next; e!=l; e=e->next) {
523
                        d = list_entry(e, struct ip_vs_dest, n_list);
524
                        if (atomic_read(&d->activeconns)*2
525
                            < atomic_read(&d->weight)) {
526
                                return 1;
527
                        }
528
                }
529
        }
530
        return 0;
531
}
532
 
533
 
534
/*
535
 *    Locality-Based (weighted) Least-Connection scheduling
536
 */
537
static struct ip_vs_dest *
538
ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
539
{
540
        struct ip_vs_dest *dest;
541
        struct ip_vs_lblc_table *tbl;
542
        struct ip_vs_lblc_entry *en;
543
 
544
        IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
545
 
546
        tbl = (struct ip_vs_lblc_table *)svc->sched_data;
547
        en = ip_vs_lblc_get(tbl, iph->daddr);
548
        if (en == NULL) {
549
                dest = __ip_vs_wlc_schedule(svc, iph);
550
                if (dest == NULL) {
551
                        IP_VS_DBG(1, "no destination available\n");
552
                        return NULL;
553
                }
554
                en = ip_vs_lblc_new(iph->daddr, dest);
555
                if (en == NULL) {
556
                        return NULL;
557
                }
558
                ip_vs_lblc_hash(tbl, en);
559
        } else {
560
                dest = en->dest;
561
                if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
562
                    || atomic_read(&dest->weight) <= 0
563
                    || is_overloaded(dest, svc)) {
564
                        dest = __ip_vs_wlc_schedule(svc, iph);
565
                        if (dest == NULL) {
566
                                IP_VS_DBG(1, "no destination available\n");
567
                                return NULL;
568
                        }
569
                        atomic_dec(&en->dest->refcnt);
570
                        atomic_inc(&dest->refcnt);
571
                        en->dest = dest;
572
                }
573
        }
574
        en->lastuse = jiffies;
575
 
576
        IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
577
                  "--> server %u.%u.%u.%u:%d\n",
578
                  NIPQUAD(en->addr),
579
                  NIPQUAD(dest->addr),
580
                  ntohs(dest->port));
581
 
582
        return dest;
583
}
584
 
585
 
586
/*
587
 *      IPVS LBLC Scheduler structure
588
 */
589
static struct ip_vs_scheduler ip_vs_lblc_scheduler =
590
{
591
        {0},                    /* n_list */
592
        "lblc",                 /* name */
593
        ATOMIC_INIT(0),         /* refcnt */
594
        THIS_MODULE,            /* this module */
595
        ip_vs_lblc_init_svc,    /* service initializer */
596
        ip_vs_lblc_done_svc,    /* service done */
597
        ip_vs_lblc_update_svc,  /* service updater */
598
        ip_vs_lblc_schedule,    /* select a server from the destination list */
599
};
600
 
601
 
602
/*
 *      Module entry point: registers the sysctl tree and then the scheduler.
 *
 *      Returns the result of register_ip_vs_scheduler().  If that
 *      registration fails, the sysctl tree registered just before is removed
 *      again so nothing is left behind.
 */
static int __init ip_vs_lblc_init(void)
{
        int ret;

        INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
        lblc_sysctl_table.sysctl_header =
                register_sysctl_table(lblc_sysctl_table.root_dir, 0);

        ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
        if (ret != 0 && lblc_sysctl_table.sysctl_header != NULL) {
                /* undo the sysctl registration on failure */
                unregister_sysctl_table(lblc_sysctl_table.sysctl_header);
                lblc_sysctl_table.sysctl_header = NULL;
        }

        return ret;
}
609
 
610
 
611
/*
 *      Module exit point: removes the sysctl tree, then unregisters the
 *      scheduler.
 */
static void __exit ip_vs_lblc_cleanup(void)
{
        /*
         * NOTE(review): register_sysctl_table() presumably returns NULL on
         * failure; skip the unregister in that case rather than pass NULL.
         */
        if (lblc_sysctl_table.sysctl_header != NULL) {
                unregister_sysctl_table(lblc_sysctl_table.sysctl_header);
                lblc_sysctl_table.sysctl_header = NULL;
        }
        unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
}
616
 
617
 
618
module_init(ip_vs_lblc_init);
619
module_exit(ip_vs_lblc_cleanup);
620
MODULE_LICENSE("GPL");

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.