/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Version:     $Id: route.c,v 1.1.1.1 2004-04-15 01:13:41 phoenix Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD,
 *                                      though our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split into fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after a year in a coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

int ip_rt_min_delay             = 2 * HZ;
int ip_rt_max_delay             = 10 * HZ;
int ip_rt_max_size;
int ip_rt_gc_timeout            = RT_GC_TIMEOUT;
int ip_rt_gc_interval           = 60 * HZ;
int ip_rt_gc_min_interval       = HZ / 2;
int ip_rt_redirect_number       = 9;
int ip_rt_redirect_load         = HZ / 50;
int ip_rt_redirect_silence      = ((HZ / 50) << (9 + 1));
int ip_rt_error_cost            = HZ;
int ip_rt_error_burst           = 5 * HZ;
int ip_rt_gc_elasticity         = 8;
int ip_rt_mtu_expires           = 10 * 60 * HZ;
int ip_rt_min_pmtu              = 512 + 20 + 20;
int ip_rt_min_advmss            = 256;
int ip_rt_secret_interval       = 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)   printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
                                           struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static int rt_garbage_collect(void);


struct dst_ops ipv4_dst_ops = {
        family:                 AF_INET,
        protocol:               __constant_htons(ETH_P_IP),
        gc:                     rt_garbage_collect,
        check:                  ipv4_dst_check,
        reroute:                ipv4_dst_reroute,
        destroy:                ipv4_dst_destroy,
        negative_advice:        ipv4_negative_advice,
        link_failure:           ipv4_link_failure,
        entry_size:             sizeof(struct rtable),
};

#define ECN_OR_COST(class)      TC_PRIO_##class

__u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
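
/*
 * Illustrative note (not in the original source): ECN_OR_COST(X)
 * currently expands to plain TC_PRIO_X, so each pair of table entries
 * maps to the same priority.  The table is believed to be indexed with
 * the four IP TOS bits, ip_tos2prio[(tos & 0x1E) >> 1] (see
 * rt_tos2priority() in include/net/route.h); e.g. IPTOS_LOWDELAY (0x10)
 * gives index 8, TC_PRIO_INTERACTIVE.
 */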


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) BH-protected rwlocks protect the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries;
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
        struct rtable   *chain;
        rwlock_t        lock;
} __attribute__((__aligned__(8)));

static struct rt_hash_bucket    *rt_hash_table;
static unsigned                 rt_hash_mask;
static int                      rt_hash_log;
static unsigned int             rt_hash_rnd;

struct rt_cache_stat rt_cache_stat[NR_CPUS];

static int rt_intern_hash(unsigned hash, struct rtable *rth,
                                struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
        return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
                & rt_hash_mask);
}
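
/*
 * Illustrative note: jhash_3words() mixes the destination, source and
 * TOS with the random key rt_hash_rnd, and the result is masked down
 * to the table size.  Since rt_hash_rnd is reseeded from
 * get_random_bytes() on every cache flush (see rt_run_flush() and the
 * rt_secret_timer), remote hosts cannot predict which bucket a flow
 * hashes to and deliberately overload a single chain.
 */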

static int rt_cache_get_info(char *buffer, char **start, off_t offset,
                                int length)
{
        int len = 0;
        off_t pos = 128;
        char temp[256];
        struct rtable *r;
        int i;

        if (offset < 128) {
                sprintf(buffer, "%-127s\n",
                        "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                        "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                        "HHUptod\tSpecDst");
                len = 128;
        }

        for (i = rt_hash_mask; i >= 0; i--) {
                read_lock_bh(&rt_hash_table[i].lock);
                for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) {
                        /*
                         *      Spin through entries until we are ready
                         */
                        pos += 128;

                        if (pos <= offset) {
                                len = 0;
                                continue;
                        }
                        sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
                                "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
                                r->u.dst.dev ? r->u.dst.dev->name : "*",
                                (unsigned long)r->rt_dst,
                                (unsigned long)r->rt_gateway,
                                r->rt_flags,
                                atomic_read(&r->u.dst.__refcnt),
                                r->u.dst.__use,
                                0,
                                (unsigned long)r->rt_src,
                                (r->u.dst.advmss ?
                                 (int) r->u.dst.advmss + 40 : 0),
                                r->u.dst.window,
                                (int)((r->u.dst.rtt >> 3) + r->u.dst.rttvar),
                                r->key.tos,
                                r->u.dst.hh ?
                                        atomic_read(&r->u.dst.hh->hh_refcnt) :
                                        -1,
                                r->u.dst.hh ?
                                        (r->u.dst.hh->hh_output ==
                                         dev_queue_xmit) : 0,
                                r->rt_spec_dst);
                        sprintf(buffer + len, "%-127s\n", temp);
                        len += 128;
                        if (pos >= offset+length) {
                                read_unlock_bh(&rt_hash_table[i].lock);
                                goto done;
                        }
                }
                read_unlock_bh(&rt_hash_table[i].lock);
        }

done:
        *start = buffer + len - (pos - offset);
        len = pos - offset;
        if (len > length)
                len = length;
        return len;
}

static int rt_cache_stat_get_info(char *buffer, char **start, off_t offset, int length)
{
        unsigned int dst_entries = atomic_read(&ipv4_dst_ops.entries);
        int i, lcpu;
        int len = 0;

        for (lcpu = 0; lcpu < smp_num_cpus; lcpu++) {
                i = cpu_logical_map(lcpu);

                len += sprintf(buffer+len, "%08x  %08x %08x %08x %08x %08x %08x %08x  %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                               dst_entries,
                               rt_cache_stat[i].in_hit,
                               rt_cache_stat[i].in_slow_tot,
                               rt_cache_stat[i].in_slow_mc,
                               rt_cache_stat[i].in_no_route,
                               rt_cache_stat[i].in_brd,
                               rt_cache_stat[i].in_martian_dst,
                               rt_cache_stat[i].in_martian_src,

                               rt_cache_stat[i].out_hit,
                               rt_cache_stat[i].out_slow_tot,
                               rt_cache_stat[i].out_slow_mc,

                               rt_cache_stat[i].gc_total,
                               rt_cache_stat[i].gc_ignored,
                               rt_cache_stat[i].gc_goal_miss,
                               rt_cache_stat[i].gc_dst_overflow,
                               rt_cache_stat[i].in_hlist_search,
                               rt_cache_stat[i].out_hlist_search

                        );
        }
        len -= offset;

        if (len > length)
                len = length;
        if (len < 0)
                len = 0;

        *start = buffer + offset;
        return len;
}

static __inline__ void rt_free(struct rtable *rt)
{
        dst_free(&rt->u.dst);
}

static __inline__ void rt_drop(struct rtable *rt)
{
        ip_rt_put(rt);
        dst_free(&rt->u.dst);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
        /* Kill broadcast/multicast entries very aggressively, if they
           collide in the hash table with more useful entries */
        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
                rth->key.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
                rth->u.dst.expires;
}

static __inline__ int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
        unsigned long age;
        int ret = 0;

        if (atomic_read(&rth->u.dst.__refcnt))
                goto out;

        ret = 1;
        if (rth->u.dst.expires &&
            time_after_eq(jiffies, rth->u.dst.expires))
                goto out;

        age = jiffies - rth->u.dst.lastuse;
        ret = 0;
        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
            (age <= tmo2 && rt_valuable(rth)))
                goto out;
        ret = 1;
out:    return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
        u32 score = jiffies - rt->u.dst.lastuse;

        score = ~score & ~(3<<30);

        if (rt_valuable(rt))
                score |= (1<<31);

        if (!rt->key.iif ||
            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
                score |= (1<<30);

        return score;
}
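
/*
 * Illustrative note: the low 30 bits hold the bitwise-inverted age, so a
 * just-used entry (age 0) scores 0x3FFFFFFF there and older entries score
 * progressively less; bits 30 and 31 dominate, so "valuable" or
 * output/unicast entries always outrank merely recent ones.
 * rt_intern_hash() uses this to pick the minimum-score unreferenced entry
 * as the eviction candidate when a chain grows too long.
 */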

/* This runs via a timer and thus is always in BH context. */
static void SMP_TIMER_NAME(rt_check_expire)(unsigned long dummy)
{
        static int rover;
        int i = rover, t;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;

        for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
             t -= ip_rt_gc_timeout) {
                unsigned long tmo = ip_rt_gc_timeout;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                write_lock(&rt_hash_table[i].lock);
                while ((rth = *rthp) != NULL) {
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(now, rth->u.dst.expires)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                tmo >>= 1;
                                rthp = &rth->u.rt_next;
                                continue;
                        }

                        /* Clean up aged-off entries. */
                        *rthp = rth->u.rt_next;
                        rt_free(rth);
                }
                write_unlock(&rt_hash_table[i].lock);

                /* Fallback loop breaker. */
                if (time_after(jiffies, now))
                        break;
        }
        rover = i;
        mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
}
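
/*
 * Illustrative note: tmo starts at ip_rt_gc_timeout and is halved for
 * every entry on the chain that is kept, so roughly the first entry must
 * have been idle for the full timeout to be freed, the second for half
 * of it, and so on.  Long chains are therefore pruned much more eagerly
 * than short ones.
 */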

SMP_TIMER_DEFINE(rt_check_expire, rt_gc_task);

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void SMP_TIMER_NAME(rt_run_flush)(unsigned long dummy)
{
        int i;
        struct rtable *rth, *next;

        rt_deadline = 0;

        get_random_bytes(&rt_hash_rnd, 4);

        for (i = rt_hash_mask; i >= 0; i--) {
                write_lock_bh(&rt_hash_table[i].lock);
                rth = rt_hash_table[i].chain;
                if (rth)
                        rt_hash_table[i].chain = NULL;
                write_unlock_bh(&rt_hash_table[i].lock);

                for (; rth; rth = next) {
                        next = rth->u.rt_next;
                        rt_free(rth);
                }
        }
}

SMP_TIMER_DEFINE(rt_run_flush, rt_cache_flush_task);

static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;

void rt_cache_flush(int delay)
{
        unsigned long now = jiffies;
        int user_mode = !in_softirq();

        if (delay < 0)
                delay = ip_rt_min_delay;

        spin_lock_bh(&rt_flush_lock);

        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
                long tmo = (long)(rt_deadline - now);

                /* If the flush timer is already running
                   and the flush request is not immediate (delay > 0):

                   if the deadline has not been reached, extend the timer to "delay";
                   otherwise fire it at the deadline time.
                 */

                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
                        tmo = 0;

                if (delay > tmo)
                        delay = tmo;
        }

        if (delay <= 0) {
                spin_unlock_bh(&rt_flush_lock);
                SMP_TIMER_NAME(rt_run_flush)(0);
                return;
        }

        if (rt_deadline == 0)
                rt_deadline = now + ip_rt_max_delay;

        mod_timer(&rt_flush_timer, now+delay);
        spin_unlock_bh(&rt_flush_lock);
}
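
/*
 * Illustrative note on the delay argument: delay < 0 requests the default
 * ip_rt_min_delay; delay == 0 flushes synchronously via
 * SMP_TIMER_NAME(rt_run_flush)(0); delay > 0 arms rt_flush_timer, with
 * rt_deadline capping how long repeated requests can keep pushing the
 * flush into the future (at most ip_rt_max_delay).
 */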

static void rt_secret_rebuild(unsigned long dummy)
{
        unsigned long now = jiffies;

        rt_cache_flush(0);
        mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   "expire" is large enough to keep enough warm entries,
   and when load increases it shrinks to limit the cache size.
 */
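
/*
 * Illustrative arithmetic (hypothetical numbers): with rt_hash_log = 10
 * (1024 buckets) and the default ip_rt_gc_elasticity of 8, the first goal
 * below is entries - (8 << 10), i.e. GC only starts trimming once the
 * cache holds more than 8192 entries; below that it merely steers toward
 * the gc_thresh-based equilibrium.
 */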

static int rt_garbage_collect(void)
{
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        int goal;

        /*
         * Garbage collection is pretty expensive,
         * so do not run it too frequently.
         */

        rt_cache_stat[smp_processor_id()].gc_total++;

        if (now - last_gc < ip_rt_gc_min_interval &&
            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
                rt_cache_stat[smp_processor_id()].gc_ignored++;
                goto out;
        }

        /* Calculate the number of entries which we want to expire now. */
        goal = atomic_read(&ipv4_dst_ops.entries) -
                (ip_rt_gc_elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                if (goal > 0) {
                        equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                }
        } else {
                /* We are in a dangerous area. Try to reduce the cache really
                 * aggressively.
                 */
                goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
        }

        if (now - last_gc >= ip_rt_gc_min_interval)
                last_gc = now;

        if (goal <= 0) {
                equilibrium += goal;
                goto work_done;
        }

        do {
                int i, k;

                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
                        unsigned long tmo = expire;

                        k = (k + 1) & rt_hash_mask;
                        rthp = &rt_hash_table[k].chain;
                        write_lock_bh(&rt_hash_table[k].lock);
                        while ((rth = *rthp) != NULL) {
                                if (!rt_may_expire(rth, tmo, expire)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                                goal--;
                        }
                        write_unlock_bh(&rt_hash_table[k].lock);
                        if (goal <= 0)
                                break;
                }
                rover = k;

                if (goal <= 0)
                        goto work_done;

                /* The goal was not achieved. We stop the process if:

                   - expire has been reduced to zero; otherwise, expire is halved.
                   - the table is not full.
                   - we are called from interrupt context.
                   - the jiffies check is just a fallback/debug loop breaker;
                     we will not spin here for a long time in any case.
                 */

                rt_cache_stat[smp_processor_id()].gc_goal_miss++;

                if (expire == 0)
                        break;

                expire >>= 1;
#if RT_CACHE_DEBUG >= 2
                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
                                atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                        goto out;
        } while (!in_softirq() && time_before_eq(jiffies, now));

        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                goto out;
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
        rt_cache_stat[smp_processor_id()].gc_dst_overflow++;
        return 1;

work_done:
        expire += ip_rt_gc_min_interval;
        if (expire > ip_rt_gc_timeout ||
            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:    return 0;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
        struct rtable   *rth, **rthp;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
        int             chain_length;
        int attempts = !in_softirq();

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        rthp = &rt_hash_table[hash].chain;

        write_lock_bh(&rt_hash_table[hash].lock);
        while ((rth = *rthp) != NULL) {
                if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
                        /* Put it first */
                        *rthp = rth->u.rt_next;
                        rth->u.rt_next = rt_hash_table[hash].chain;
                        rt_hash_table[hash].chain = rth;

                        rth->u.dst.__use++;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.lastuse = now;
                        write_unlock_bh(&rt_hash_table[hash].lock);

                        rt_drop(rt);
                        *rp = rth;
                        return 0;
                }

                if (!atomic_read(&rth->u.dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->u.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be the average chain length;
                 * when it is exceeded, gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.rt_next;
                        rt_free(cand);
                }
        }

        /* Try to bind the route to arp only if it is an output
           route or a unicast forwarding path.
         */
        if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
                int err = arp_bind_neighbour(&rt->u.dst);
                if (err) {
                        write_unlock_bh(&rt_hash_table[hash].lock);

                        if (err != -ENOBUFS) {
                                rt_drop(rt);
                                return err;
                        }

                        /* The neighbour tables are full and nothing
                           can be released. Try to shrink the route cache;
                           it most likely holds some neighbour records.
                         */
                        if (attempts-- > 0) {
                                int saved_elasticity = ip_rt_gc_elasticity;
                                int saved_int = ip_rt_gc_min_interval;
                                ip_rt_gc_elasticity     = 1;
                                ip_rt_gc_min_interval   = 0;
                                rt_garbage_collect();
                                ip_rt_gc_min_interval   = saved_int;
                                ip_rt_gc_elasticity     = saved_elasticity;
                                goto restart;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING "Neighbour table overflow.\n");
                        rt_drop(rt);
                        return -ENOBUFS;
                }
        }

        rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
        if (rt->u.rt_next) {
                struct rtable *trt;
                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
                       NIPQUAD(rt->rt_dst));
                for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
                printk("\n");
        }
#endif
        rt_hash_table[hash].chain = rt;
        write_unlock_bh(&rt_hash_table[hash].lock);
        *rp = rt;
        return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
        static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
        struct inet_peer *peer;

        peer = inet_getpeer(rt->rt_dst, create);

        spin_lock_bh(&rt_peer_lock);
        if (rt->peer == NULL) {
                rt->peer = peer;
                peer = NULL;
        }
        spin_unlock_bh(&rt_peer_lock);
        if (peer)
                inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (rt) {
                if (rt->peer == NULL)
                        rt_bind_peer(rt, 1);

                /* If a peer is attached to a destination, it is never detached,
                   so we need not grab a lock to dereference it.
                 */
                if (rt->peer) {
                        iph->id = htons(inet_getid(rt->peer));
                        return;
                }
        } else
                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));

        ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
        struct rtable **rthp;

        write_lock_bh(&rt_hash_table[hash].lock);
        ip_rt_put(rt);
        for (rthp = &rt_hash_table[hash].chain; *rthp;
             rthp = &(*rthp)->u.rt_next)
                if (*rthp == rt) {
                        *rthp = rt->u.rt_next;
                        rt_free(rt);
                        break;
                }
        write_unlock_bh(&rt_hash_table[hash].lock);
}

void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
                    u32 saddr, u8 tos, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        u32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };

        tos &= IPTOS_RT_MASK;

        if (!in_dev)
                return;

        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash_code(daddr,
                                                     skeys[i] ^ (ikeys[k] << 5),
                                                     tos);

                        rthp=&rt_hash_table[hash].chain;

                        read_lock(&rt_hash_table[hash].lock);
                        while ((rth = *rthp) != NULL) {
                                struct rtable *rt;

                                if (rth->key.dst != daddr ||
                                    rth->key.src != skeys[i] ||
                                    rth->key.tos != tos ||
                                    rth->key.oif != ikeys[k] ||
                                    rth->key.iif != 0) {
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }

                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                dst_hold(&rth->u.dst);
                                read_unlock(&rt_hash_table[hash].lock);

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.obsolete      = 0;

                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        read_unlock(&rt_hash_table[hash].lock);
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
                        "%u.%u.%u.%u ignored.\n"
                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
                        "tos %02x\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
        in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable*)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->u.dst.expires) {
                        unsigned hash = rt_hash_code(rt->key.dst,
                                                     rt->key.src ^
                                                        (rt->key.oif << 5),
                                                     rt->key.tos);
#if RT_CACHE_DEBUG >= 1
                        printk(KERN_DEBUG "ip_rt_advice: redirect to "
                                          "%u.%u.%u.%u/%02x dropped\n",
                                NIPQUAD(rt->rt_dst), rt->key.tos);
#endif
                        rt_del(hash, rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
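
/*
 * Illustrative arithmetic, assuming HZ = 100: ip_rt_redirect_load is
 * HZ/50 = 2 jiffies, so successive redirects to one destination are
 * spaced at least 2 << rate_tokens jiffies apart (2, 4, 8, ...).  After
 * ip_rt_redirect_number (9) redirects we go silent; ip_rt_redirect_silence
 * = (HZ/50) << 10 = 2048 jiffies (~20 s) without packets that would need
 * a redirect then resets rate_tokens to zero.
 */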

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

        if (!in_dev)
                return;

        if (!IN_DEV_TX_REDIRECTS(in_dev))
                goto out;

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
                rt->u.dst.rate_tokens = 0;

        /* Too many ignored redirects; do not send anything,
         * just set u.dst.rate_last to the last seen redirected packet.
         */
        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
                rt->u.dst.rate_last = jiffies;
                goto out;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (time_after(jiffies,
                       (rt->u.dst.rate_last +
                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
                rt->u.dst.rate_last = jiffies;
                ++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (IN_DEV_LOG_MARTIANS(in_dev) &&
                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
                    net_ratelimit())
                        printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
                                "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
                                NIPQUAD(rt->rt_src), rt->rt_iif,
                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
        }
out:
        in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        unsigned long now;
        int code;

        switch (rt->u.dst.error) {
                case EINVAL:
                default:
                        goto out;
                case EHOSTUNREACH:
                        code = ICMP_HOST_UNREACH;
                        break;
                case ENETUNREACH:
                        code = ICMP_NET_UNREACH;
                        break;
                case EACCES:
                        code = ICMP_PKT_FILTERED;
                        break;
        }

        now = jiffies;
        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
                rt->u.dst.rate_tokens = ip_rt_error_burst;
        rt->u.dst.rate_last = now;
        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
                rt->u.dst.rate_tokens -= ip_rt_error_cost;
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
        }

out:    kfree_skb(skb);
        return 0;
}

/*
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        int i;

        for (i = 0; i < sizeof(mtu_plateau) / sizeof(mtu_plateau[0]); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}
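
/*
 * Illustrative example: the table is searched from the largest plateau
 * down and the first value strictly below old_mtu is returned, following
 * the RFC 1191 "plateau" strategy.  guess_mtu(1500) == 1492 and
 * guess_mtu(576) == 296; anything at or below 128 falls through to the
 * IPv4 minimum of 68.
 */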

unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
        int i;
        unsigned short old_mtu = ntohs(iph->tot_len);
        struct rtable *rth;
        u32  skeys[2] = { iph->saddr, 0, };
        u32  daddr = iph->daddr;
        u8   tos = iph->tos & IPTOS_RT_MASK;
        unsigned short est_mtu = 0;

        if (ipv4_config.no_pmtu_disc)
                return 0;

        for (i = 0; i < 2; i++) {
                unsigned hash = rt_hash_code(daddr, skeys[i], tos);

                read_lock(&rt_hash_table[hash].lock);
                for (rth = rt_hash_table[hash].chain; rth;
                     rth = rth->u.rt_next) {
                        if (rth->key.dst == daddr &&
                            rth->key.src == skeys[i] &&
                            rth->rt_dst  == daddr &&
                            rth->rt_src  == iph->saddr &&
                            rth->key.tos == tos &&
                            rth->key.iif == 0 &&
                            !(rth->u.dst.mxlock & (1 << RTAX_MTU))) {
                                unsigned short mtu = new_mtu;

                                if (new_mtu < 68 || new_mtu >= old_mtu) {

                                        /* BSD 4.2 compatibility hack :-( */
                                        if (mtu == 0 &&
                                            old_mtu >= rth->u.dst.pmtu &&
                                            old_mtu >= 68 + (iph->ihl << 2))
                                                old_mtu -= iph->ihl << 2;

                                        mtu = guess_mtu(old_mtu);
                                }
                                if (mtu <= rth->u.dst.pmtu) {
                                        if (mtu < rth->u.dst.pmtu) {
                                                dst_confirm(&rth->u.dst);
                                                if (mtu < ip_rt_min_pmtu) {
                                                        mtu = ip_rt_min_pmtu;
                                                        rth->u.dst.mxlock |=
                                                                (1 << RTAX_MTU);
                                                }
                                                rth->u.dst.pmtu = mtu;
                                                dst_set_expires(&rth->u.dst,
                                                        ip_rt_mtu_expires);
                                        }
                                        est_mtu = mtu;
                                }
                        }
                }
                read_unlock(&rt_hash_table[hash].lock);
        }
        return est_mtu ? : new_mtu;
}

void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
{
        if (dst->pmtu > mtu && mtu >= 68 &&
            !(dst->mxlock & (1 << RTAX_MTU))) {
                if (mtu < ip_rt_min_pmtu) {
                        mtu = ip_rt_min_pmtu;
                        dst->mxlock |= (1 << RTAX_MTU);
                }
                dst->pmtu = mtu;
                dst_set_expires(dst, ip_rt_mtu_expires);
        }
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        dst_release(dst);
        return NULL;
}

static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
                                          struct sk_buff *skb)
{
        return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;
        struct inet_peer *peer = rt->peer;

        if (peer) {
                rt->peer = NULL;
                inet_putpeer(peer);
        }
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = (struct rtable *) skb->dst;
        if (rt)
                dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
                NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
                skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
        u32 src;
        struct fib_result res;

        if (rt->key.iif == 0)
                src = rt->rt_src;
        else if (fib_lookup(&rt->key, &res) == 0) {
#ifdef CONFIG_IP_ROUTE_NAT
                if (res.type == RTN_NAT)
                        src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
                                                RT_SCOPE_UNIVERSE);
                else
#endif
                        src = FIB_RES_PREFSRC(res);
                fib_res_put(&res);
        } else
                src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
                                        RT_SCOPE_UNIVERSE);
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->u.dst.tclassid & 0xFFFF))
                rt->u.dst.tclassid |= tag & 0xFFFF;
        if (!(rt->u.dst.tclassid & 0xFFFF0000))
                rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
        struct fib_info *fi = res->fi;

        if (fi) {
                if (FIB_RES_GW(*res) &&
                    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
                        rt->rt_gateway = FIB_RES_GW(*res);
                memcpy(&rt->u.dst.mxlock, fi->fib_metrics,
                        sizeof(fi->fib_metrics));
                if (fi->fib_mtu == 0) {
                        rt->u.dst.pmtu = rt->u.dst.dev->mtu;
                        if (rt->u.dst.mxlock & (1 << RTAX_MTU) &&
                            rt->rt_gateway != rt->rt_dst &&
                            rt->u.dst.pmtu > 576)
                                rt->u.dst.pmtu = 576;
                }
#ifdef CONFIG_NET_CLS_ROUTE
                rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
        } else
                rt->u.dst.pmtu  = rt->u.dst.dev->mtu;

        if (rt->u.dst.pmtu > IP_MAX_MTU)
                rt->u.dst.pmtu = IP_MAX_MTU;
        if (rt->u.dst.advmss == 0)
                rt->u.dst.advmss = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
                                       ip_rt_min_advmss);
        if (rt->u.dst.advmss > 65535 - 40)
                rt->u.dst.advmss = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, fib_rules_tclass(res));
#endif
        set_class_tag(rt, itag);
#endif
        rt->rt_type = res->type;
}

static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        unsigned hash;
        struct rtable *rth;
        u32 spec_dst;
        struct in_device *in_dev = in_dev_get(dev);
        u32 itag = 0;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (ZERONET(saddr)) {
                if (!LOCAL_MCAST(daddr))
                        goto e_inval;
                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
        } else if (fib_validate_source(saddr, 0, tos, 0,
                                        dev, &spec_dst, &itag) < 0)
                goto e_inval;

        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth)
                goto e_nobufs;

        rth->u.dst.output= ip_rt_bug;

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags= DST_HOST;
        rth->key.dst    = daddr;
        rth->rt_dst     = daddr;
        rth->key.tos    = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
        rth->key.fwmark = skb->nfmark;
#endif
        rth->key.src    = saddr;
        rth->rt_src     = saddr;
#ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_dst_map = daddr;
        rth->rt_src_map = saddr;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
        rth->u.dst.tclassid = itag;
#endif
        rth->rt_iif     =
        rth->key.iif    = dev->ifindex;
        rth->u.dst.dev  = &loopback_dev;
        dev_hold(rth->u.dst.dev);
        rth->key.oif    = 0;
        rth->rt_gateway = daddr;
        rth->rt_spec_dst= spec_dst;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_flags   = RTCF_MULTICAST;
        if (our) {
                rth->u.dst.input= ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->u.dst.input = ip_mr_input;
#endif
        rt_cache_stat[smp_processor_id()].in_slow_mc++;

        in_dev_put(in_dev);
        hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
        return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);

e_nobufs:
        in_dev_put(in_dev);
        return -ENOBUFS;

e_inval:
        in_dev_put(in_dev);
        return -EINVAL;
}
1385
 
1386
/*
1387
 *      NOTE. We drop all the packets that has local source
1388
 *      addresses, because every properly looped back packet
1389
 *      must have correct destination already attached by output routine.
1390
 *
1391
 *      Such approach solves two big problems:
1392
 *      1. Not simplex devices are handled properly.
1393
 *      2. IP spoofing attempts are filtered with 100% of guarantee.
1394
 */

int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
                        u8 tos, struct net_device *dev)
{
        struct rt_key   key;
        struct fib_result res;
        struct in_device *in_dev = in_dev_get(dev);
        struct in_device *out_dev = NULL;
        unsigned        flags = 0;
        u32             itag = 0;
        struct rtable * rth;
        unsigned        hash;
        u32             spec_dst;
        int             err = -EINVAL;
        int             free_res = 0;

        /* IP on this device is disabled. */

        if (!in_dev)
                goto out;

        key.dst         = daddr;
        key.src         = saddr;
        key.tos         = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
        key.fwmark      = skb->nfmark;
#endif
        key.iif         = dev->ifindex;
        key.oif         = 0;
        key.scope       = RT_SCOPE_UNIVERSE;

        hash = rt_hash_code(daddr, saddr ^ (key.iif << 5), tos);
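        /* The hash computed above is reused at the intern step for
         * whichever entry (forwarding, local, broadcast or unreachable)
         * this function ends up creating.
         */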

        /* Check for the most weird martians, which cannot be detected
           by fib_lookup.
         */

        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
                goto martian_source;

        if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
                goto brd_input;

        /* Accept zero addresses only to limited broadcast;
         * I do not even know whether to fix it or not. Waiting for complaints :-)
         */
        if (ZERONET(saddr))
                goto martian_source;

        if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
                goto martian_destination;

        /*
         *      Now we are ready to route the packet.
         */
        if ((err = fib_lookup(&key, &res)) != 0) {
                if (!IN_DEV_FORWARD(in_dev))
                        goto e_inval;
                goto no_route;
        }
        free_res = 1;

        rt_cache_stat[smp_processor_id()].in_slow_tot++;

#ifdef CONFIG_IP_ROUTE_NAT
        /* Policy is applied before mapping the destination,
           but rerouting after the map must be done with the old source.
         */

        if (1) {
                u32 src_map = saddr;
                if (res.r)
                        src_map = fib_rules_policy(saddr, &res, &flags);

                if (res.type == RTN_NAT) {
                        key.dst = fib_rules_map_destination(daddr, &res);
                        fib_res_put(&res);
                        free_res = 0;
                        if (fib_lookup(&key, &res))
                                goto e_inval;
                        free_res = 1;
                        if (res.type != RTN_UNICAST)
                                goto e_inval;
                        flags |= RTCF_DNAT;
                }
                key.src = src_map;
        }
#endif
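        /* At this point, if the first lookup hit an RTN_NAT rule, the
         * destination has been rewritten and a second lookup has confirmed
         * that the mapped destination resolves to a plain unicast route;
         * RTCF_DNAT records that the translation happened.
         */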

        if (res.type == RTN_BROADCAST)
                goto brd_input;

        if (res.type == RTN_LOCAL) {
                int result;
                result = fib_validate_source(saddr, daddr, tos,
                                             loopback_dev.ifindex,
                                             dev, &spec_dst, &itag);
                if (result < 0)
                        goto martian_source;
                if (result)
                        flags |= RTCF_DIRECTSRC;
                spec_dst = daddr;
                goto local_input;
        }

        if (!IN_DEV_FORWARD(in_dev))
                goto e_inval;
        if (res.type != RTN_UNICAST)
                goto martian_destination;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res.fi->fib_nhs > 1 && key.oif == 0)
                fib_select_multipath(&key, &res);
#endif
        out_dev = in_dev_get(FIB_RES_DEV(res));
        if (out_dev == NULL) {
                if (net_ratelimit())
                        printk(KERN_CRIT "Bug in ip_route_input_slow(). "
                                         "Please report\n");
                goto e_inval;
        }

        err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
                                  &spec_dst, &itag);
        if (err < 0)
                goto martian_source;

        if (err)
                flags |= RTCF_DIRECTSRC;
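
        /* Offer an ICMP redirect only when the packet would leave through
         * the interface it arrived on, the source is directly reachable
         * (fib_validate_source() returned nonzero above), no NAT or
         * masquerading is involved, and the media is shared or the gateway
         * is on-link for the sender.
         */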
        if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
                flags |= RTCF_DOREDIRECT;

        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create a route if it is
                 * invalid for proxy arp. DNAT routes are always valid.
                 */
                if (out_dev == in_dev && !(flags & RTCF_DNAT))
                        goto e_inval;
        }

        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth)
                goto e_nobufs;

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags= DST_HOST;
        rth->key.dst    = daddr;
        rth->rt_dst     = daddr;
        rth->key.tos    = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
        rth->key.fwmark = skb->nfmark;
#endif
        rth->key.src    = saddr;
        rth->rt_src     = saddr;
        rth->rt_gateway = daddr;
#ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_src_map = key.src;
        rth->rt_dst_map = key.dst;
        if (flags&RTCF_DNAT)
                rth->rt_gateway = key.dst;
#endif
        rth->rt_iif     =
        rth->key.iif    = dev->ifindex;
        rth->u.dst.dev  = out_dev->dev;
        dev_hold(rth->u.dst.dev);
        rth->key.oif    = 0;
        rth->rt_spec_dst= spec_dst;

        rth->u.dst.input = ip_forward;
        rth->u.dst.output = ip_output;

        rt_set_nexthop(rth, &res, itag);

        rth->rt_flags = flags;

#ifdef CONFIG_NET_FASTROUTE
        if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
                struct net_device *odev = rth->u.dst.dev;
                if (odev != dev &&
                    dev->accept_fastpath &&
                    odev->mtu >= dev->mtu &&
                    dev->accept_fastpath(dev, &rth->u.dst) == 0)
                        rth->rt_flags |= RTCF_FAST;
        }
#endif

intern:
        err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
done:
        in_dev_put(in_dev);
        if (out_dev)
                in_dev_put(out_dev);
        if (free_res)
                fib_res_put(&res);
out:    return err;

brd_input:
        if (skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (ZERONET(saddr))
                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
        else {
                err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
                                          &itag);
                if (err < 0)
                        goto martian_source;
                if (err)
                        flags |= RTCF_DIRECTSRC;
        }
        flags |= RTCF_BROADCAST;
        res.type = RTN_BROADCAST;
        rt_cache_stat[smp_processor_id()].in_brd++;

local_input:
        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth)
                goto e_nobufs;

        rth->u.dst.output= ip_rt_bug;

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags= DST_HOST;
        rth->key.dst    = daddr;
        rth->rt_dst     = daddr;
        rth->key.tos    = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
        rth->key.fwmark = skb->nfmark;
#endif
        rth->key.src    = saddr;
        rth->rt_src     = saddr;
#ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_dst_map = key.dst;
        rth->rt_src_map = key.src;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
        rth->u.dst.tclassid = itag;
#endif
        rth->rt_iif     =
        rth->key.iif    = dev->ifindex;
        rth->u.dst.dev  = &loopback_dev;
        dev_hold(rth->u.dst.dev);
        rth->key.oif    = 0;
        rth->rt_gateway = daddr;
        rth->rt_spec_dst= spec_dst;
        rth->u.dst.input= ip_local_deliver;
        rth->rt_flags   = flags|RTCF_LOCAL;
        if (res.type == RTN_UNREACHABLE) {
                rth->u.dst.input= ip_error;
                rth->u.dst.error= -err;
                rth->rt_flags   &= ~RTCF_LOCAL;
        }
        rth->rt_type    = res.type;
        goto intern;

no_route:
        rt_cache_stat[smp_processor_id()].in_no_route++;
        spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
        res.type = RTN_UNREACHABLE;
        goto local_input;

        /*
         *      Do not cache martian addresses: they should be logged (RFC1812)
         */
martian_destination:
        rt_cache_stat[smp_processor_id()].in_martian_dst++;
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
                        "%u.%u.%u.%u, dev %s\n",
                        NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif
e_inval:
        err = -EINVAL;
        goto done;

e_nobufs:
        err = -ENOBUFS;
        goto done;

martian_source:

        rt_cache_stat[smp_processor_id()].in_martian_src++;
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      Per the RFC1812 recommendation, if the source is
                 *      martian, the only hint is the MAC header.
                 */
                printk(KERN_WARNING "martian source %u.%u.%u.%u from "
                        "%u.%u.%u.%u, on dev %s\n",
                        NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
                if (dev->hard_header_len) {
                        int i;
                        unsigned char *p = skb->mac.raw;
                        printk(KERN_WARNING "ll header: ");
                        for (i = 0; i < dev->hard_header_len; i++, p++) {
                                printk("%02x", *p);
                                if (i < (dev->hard_header_len - 1))
                                        printk(":");
                        }
                        printk("\n");
                }
        }
#endif
        goto e_inval;
}

int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
                   u8 tos, struct net_device *dev)
{
        struct rtable * rth;
        unsigned        hash;
        int iif = dev->ifindex;

        tos &= IPTOS_RT_MASK;
        hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);

        read_lock(&rt_hash_table[hash].lock);
        for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
                if (rth->key.dst == daddr &&
                    rth->key.src == saddr &&
                    rth->key.iif == iif &&
                    rth->key.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
                    rth->key.fwmark == skb->nfmark &&
#endif
                    rth->key.tos == tos) {
                        rth->u.dst.lastuse = jiffies;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.__use++;
                        rt_cache_stat[smp_processor_id()].in_hit++;
                        read_unlock(&rt_hash_table[hash].lock);
                        skb->dst = (struct dst_entry*)rth;
                        return 0;
                }
                rt_cache_stat[smp_processor_id()].in_hlist_search++;
        }
        read_unlock(&rt_hash_table[hash].lock);

        /* Multicast recognition logic is moved from the route cache to here.
           The problem was that too many Ethernet cards have broken/missing
           hardware multicast filters :-( As a result, a host on a multicast
           network acquires a lot of useless route cache entries, e.g. for
           SDR messages from all over the world. Now we try to get rid of
           them. Really, provided the software IP multicast filter is
           organized reasonably (at least, hashed), this does not result in
           a slowdown compared with route cache reject entries.
           Note that multicast routers are not affected, because a route
           cache entry is created eventually.
         */
        if (MULTICAST(daddr)) {
                struct in_device *in_dev;

                read_lock(&inetdev_lock);
                if ((in_dev = __in_dev_get(dev)) != NULL) {
                        int our = ip_check_mc(in_dev, daddr, saddr);
                        if (our
#ifdef CONFIG_IP_MROUTE
                            || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
                            ) {
                                read_unlock(&inetdev_lock);
                                return ip_route_input_mc(skb, daddr, saddr,
                                                         tos, dev, our);
                        }
                }
                read_unlock(&inetdev_lock);
                return -EINVAL;
        }
        return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
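
/* Typical caller: the IP receive path routes each packet before local
 * delivery or forwarding.  An illustrative sketch (cf. the
 * ip_rcv()/ip_rcv_finish() path in net/ipv4/ip_input.c):
 *
 *      if (skb->dst == NULL &&
 *          ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
 *                         skb->dev))
 *              goto drop;
 *
 * On success skb->dst is set and dst->input() dispatches the packet
 * (ip_local_deliver, ip_forward, or ip_mr_input).
 */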

/*
 * Major route resolver routine.
 */

int ip_route_output_slow(struct rtable **rp, const struct rt_key *oldkey)
{
        struct rt_key key;
        struct fib_result res;
        unsigned flags = 0;
        struct rtable *rth;
        struct net_device *dev_out = NULL;
        unsigned hash;
        int free_res = 0;
        int err;
        u32 tos;

        tos             = oldkey->tos & (IPTOS_RT_MASK | RTO_ONLINK);
        key.dst         = oldkey->dst;
        key.src         = oldkey->src;
        key.tos         = tos & IPTOS_RT_MASK;
        key.iif         = loopback_dev.ifindex;
        key.oif         = oldkey->oif;
#ifdef CONFIG_IP_ROUTE_FWMARK
        key.fwmark      = oldkey->fwmark;
#endif
        key.scope       = (tos & RTO_ONLINK) ? RT_SCOPE_LINK :
                                                RT_SCOPE_UNIVERSE;
        res.fi          = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
        res.r           = NULL;
#endif

        if (oldkey->src) {
                err = -EINVAL;
                if (MULTICAST(oldkey->src) ||
                    BADCLASS(oldkey->src) ||
                    ZERONET(oldkey->src))
                        goto out;

                /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
                dev_out = ip_dev_find(oldkey->src);
                if (dev_out == NULL)
                        goto out;

                /* I removed the check for oif == dev_out->oif here.
                   It was wrong for two reasons:
                   1. ip_dev_find(saddr) can return the wrong iface, if
                      saddr is assigned to multiple interfaces.
                   2. Moreover, we are allowed to send packets with the
                      saddr of another iface. --ANK
                 */

                if (oldkey->oif == 0
                    && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF)) {
                        /* Special hack: user can direct multicasts
                           and limited broadcast via the necessary interface
                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
                           This hack is not just for fun, it allows
                           vic, vat and friends to work.
                           They bind the socket to loopback, set ttl to zero
                           and expect that it will work.
                           From the viewpoint of the routing cache they are
                           broken, because we are not allowed to build a
                           multicast path with a loopback source addr (look,
                           the routing cache cannot know that ttl is zero, so
                           that the packet will not leave this host and the
                           route is valid). Luckily, this hack is a good
                           workaround.
                         */

                        key.oif = dev_out->ifindex;
                        goto make_route;
                }
                if (dev_out)
                        dev_put(dev_out);
                dev_out = NULL;
        }
        if (oldkey->oif) {
                dev_out = dev_get_by_index(oldkey->oif);
                err = -ENODEV;
                if (dev_out == NULL)
                        goto out;
                if (__in_dev_get(dev_out) == NULL) {
                        dev_put(dev_out);
                        goto out;       /* Wrong error code */
                }

                if (LOCAL_MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) {
                        if (!key.src)
                                key.src = inet_select_addr(dev_out, 0,
                                                                RT_SCOPE_LINK);
                        goto make_route;
                }
                if (!key.src) {
                        if (MULTICAST(oldkey->dst))
                                key.src = inet_select_addr(dev_out, 0,
                                                                key.scope);
                        else if (!oldkey->dst)
                                key.src = inet_select_addr(dev_out, 0,
                                                                RT_SCOPE_HOST);
                }
        }

        if (!key.dst) {
                key.dst = key.src;
                if (!key.dst)
                        key.dst = key.src = htonl(INADDR_LOOPBACK);
                if (dev_out)
                        dev_put(dev_out);
                dev_out = &loopback_dev;
                dev_hold(dev_out);
                key.oif = loopback_dev.ifindex;
                res.type = RTN_LOCAL;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

        if (fib_lookup(&key, &res)) {
                res.fi = NULL;
                if (oldkey->oif) {
                        /* Apparently, the routing tables are wrong. Assume
                           that the destination is on-link.

                           WHY? DW.
                           Because we are allowed to send to an iface
                           even if it has NO routes and NO assigned
                           addresses. When oif is specified, routing
                           tables are looked up with only one purpose:
                           to catch if the destination is gatewayed, rather
                           than direct. Moreover, if MSG_DONTROUTE is set,
                           we send the packet, ignoring both routing tables
                           and ifaddr state. --ANK

                           We could make it even if oif is unknown,
                           likely IPv6, but we do not.
                         */

                        if (key.src == 0)
                                key.src = inet_select_addr(dev_out, 0,
                                                           RT_SCOPE_LINK);
                        res.type = RTN_UNICAST;
                        goto make_route;
                }
                if (dev_out)
                        dev_put(dev_out);
                err = -ENETUNREACH;
                goto out;
        }
        free_res = 1;

        if (res.type == RTN_NAT)
                goto e_inval;

        if (res.type == RTN_LOCAL) {
                if (!key.src)
                        key.src = key.dst;
                if (dev_out)
                        dev_put(dev_out);
                dev_out = &loopback_dev;
                dev_hold(dev_out);
                key.oif = dev_out->ifindex;
                if (res.fi)
                        fib_info_put(res.fi);
                res.fi = NULL;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res.fi->fib_nhs > 1 && key.oif == 0)
                fib_select_multipath(&key, &res);
        else
#endif
        if (!res.prefixlen && res.type == RTN_UNICAST && !key.oif)
                fib_select_default(&key, &res);

        if (!key.src)
                key.src = FIB_RES_PREFSRC(res);

        if (dev_out)
                dev_put(dev_out);
        dev_out = FIB_RES_DEV(res);
        dev_hold(dev_out);
        key.oif = dev_out->ifindex;

make_route:
        if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
                goto e_inval;

        if (key.dst == 0xFFFFFFFF)
                res.type = RTN_BROADCAST;
        else if (MULTICAST(key.dst))
                res.type = RTN_MULTICAST;
        else if (BADCLASS(key.dst) || ZERONET(key.dst))
                goto e_inval;

        if (dev_out->flags & IFF_LOOPBACK)
                flags |= RTCF_LOCAL;

        if (res.type == RTN_BROADCAST) {
                flags |= RTCF_BROADCAST | RTCF_LOCAL;
                if (res.fi) {
                        fib_info_put(res.fi);
                        res.fi = NULL;
                }
        } else if (res.type == RTN_MULTICAST) {
                flags |= RTCF_MULTICAST|RTCF_LOCAL;
                read_lock(&inetdev_lock);
                if (!__in_dev_get(dev_out) ||
                    !ip_check_mc(__in_dev_get(dev_out),oldkey->dst,oldkey->src))
                        flags &= ~RTCF_LOCAL;
                read_unlock(&inetdev_lock);
                /* If a multicast route does not exist, use the default one,
                   but do not gateway in this case.
                   Yes, it is a hack.
                 */
                if (res.fi && res.prefixlen < 4) {
                        fib_info_put(res.fi);
                        res.fi = NULL;
                }
        }

        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth)
                goto e_nobufs;

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags= DST_HOST;
        rth->key.dst    = oldkey->dst;
        rth->key.tos    = tos;
        rth->key.src    = oldkey->src;
        rth->key.iif    = 0;
        rth->key.oif    = oldkey->oif;
#ifdef CONFIG_IP_ROUTE_FWMARK
        rth->key.fwmark = oldkey->fwmark;
#endif
        rth->rt_dst     = key.dst;
        rth->rt_src     = key.src;
#ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_dst_map = key.dst;
        rth->rt_src_map = key.src;
#endif
        rth->rt_iif     = oldkey->oif ? : dev_out->ifindex;
        rth->u.dst.dev  = dev_out;
        dev_hold(dev_out);
        rth->rt_gateway = key.dst;
        rth->rt_spec_dst= key.src;

        rth->u.dst.output=ip_output;

        rt_cache_stat[smp_processor_id()].out_slow_tot++;

        if (flags & RTCF_LOCAL) {
                rth->u.dst.input = ip_local_deliver;
                rth->rt_spec_dst = key.dst;
        }
        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                rth->rt_spec_dst = key.src;
                if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
                        rth->u.dst.output = ip_mc_output;
                        rt_cache_stat[smp_processor_id()].out_slow_mc++;
                }
#ifdef CONFIG_IP_MROUTE
                if (res.type == RTN_MULTICAST) {
                        struct in_device *in_dev = in_dev_get(dev_out);
                        if (in_dev) {
                                if (IN_DEV_MFORWARD(in_dev) &&
                                    !LOCAL_MCAST(oldkey->dst)) {
                                        rth->u.dst.input = ip_mr_input;
                                        rth->u.dst.output = ip_mc_output;
                                }
                                in_dev_put(in_dev);
                        }
                }
#endif
        }

        rt_set_nexthop(rth, &res, 0);

        rth->rt_flags = flags;

        hash = rt_hash_code(oldkey->dst, oldkey->src ^ (oldkey->oif << 5), tos);
        err = rt_intern_hash(hash, rth, rp);
done:
        if (free_res)
                fib_res_put(&res);
        if (dev_out)
                dev_put(dev_out);
out:    return err;

e_inval:
        err = -EINVAL;
        goto done;
e_nobufs:
        err = -ENOBUFS;
        goto done;
}

int ip_route_output_key(struct rtable **rp, const struct rt_key *key)
{
        unsigned hash;
        struct rtable *rth;

        hash = rt_hash_code(key->dst, key->src ^ (key->oif << 5), key->tos);
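
        /* Walk the bucket chain for an exact key match.  Note the TOS test
         * below: XORing the cached and requested tos and masking with
         * IPTOS_RT_MASK | RTO_ONLINK means both the routable TOS bits and
         * the RTO_ONLINK scope flag stored in key.tos must agree.
         */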
        read_lock_bh(&rt_hash_table[hash].lock);
        for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
                if (rth->key.dst == key->dst &&
                    rth->key.src == key->src &&
                    rth->key.iif == 0 &&
                    rth->key.oif == key->oif &&
#ifdef CONFIG_IP_ROUTE_FWMARK
                    rth->key.fwmark == key->fwmark &&
#endif
                    !((rth->key.tos ^ key->tos) &
                            (IPTOS_RT_MASK | RTO_ONLINK))) {
                        rth->u.dst.lastuse = jiffies;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.__use++;
                        rt_cache_stat[smp_processor_id()].out_hit++;
                        read_unlock_bh(&rt_hash_table[hash].lock);
                        *rp = rth;
                        return 0;
                }
                rt_cache_stat[smp_processor_id()].out_hlist_search++;
        }
        read_unlock_bh(&rt_hash_table[hash].lock);

        return ip_route_output_slow(rp, key);
}
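
/* Callers normally go through the ip_route_output() convenience wrapper
 * (a static inline in include/net/route.h), which packs its arguments
 * into a struct rt_key and calls ip_route_output_key().  Roughly:
 *
 *      struct rt_key key = { dst: daddr, src: saddr,
 *                            oif: oif, tos: tos };
 *      err = ip_route_output_key(rp, &key);
 *
 * inet_rtm_getroute() below uses exactly this wrapper.
 */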

static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
                        int nowait)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        struct rtmsg *r;
        struct nlmsghdr  *nlh;
        unsigned char    *b = skb->tail;
        struct rta_cacheinfo ci;
#ifdef CONFIG_IP_MROUTE
        struct rtattr *eptr;
#endif
        nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
        r = NLMSG_DATA(nlh);
        nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
        r->rtm_family    = AF_INET;
        r->rtm_dst_len  = 32;
        r->rtm_src_len  = 0;
        r->rtm_tos      = rt->key.tos;
        r->rtm_table    = RT_TABLE_MAIN;
        r->rtm_type     = rt->rt_type;
        r->rtm_scope    = RT_SCOPE_UNIVERSE;
        r->rtm_protocol = RTPROT_UNSPEC;
        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
        if (rt->rt_flags & RTCF_NOTIFY)
                r->rtm_flags |= RTM_F_NOTIFY;
        RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
        if (rt->key.src) {
                r->rtm_src_len = 32;
                RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
        }
        if (rt->u.dst.dev)
                RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
        if (rt->u.dst.tclassid)
                RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
#endif
        if (rt->key.iif)
                RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
        else if (rt->rt_src != rt->key.src)
                RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
        if (rt->rt_dst != rt->rt_gateway)
                RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
        if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0)
                goto rtattr_failure;
        ci.rta_lastuse  = jiffies - rt->u.dst.lastuse;
        ci.rta_used     = rt->u.dst.__use;
        ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
        if (rt->u.dst.expires)
                ci.rta_expires = rt->u.dst.expires - jiffies;
        else
                ci.rta_expires = 0;
        ci.rta_error    = rt->u.dst.error;
        ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
        if (rt->peer) {
                ci.rta_id = rt->peer->ip_id_count;
                if (rt->peer->tcp_ts_stamp) {
                        ci.rta_ts = rt->peer->tcp_ts;
                        ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
                }
        }
#ifdef CONFIG_IP_MROUTE
        eptr = (struct rtattr*)skb->tail;
#endif
        RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
        if (rt->key.iif) {
#ifdef CONFIG_IP_MROUTE
                u32 dst = rt->rt_dst;

                if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
                    ipv4_devconf.mc_forwarding) {
                        int err = ipmr_get_route(skb, r, nowait);
                        if (err <= 0) {
                                if (!nowait) {
                                        if (err == 0)
                                                return 0;
                                        goto nlmsg_failure;
                                } else {
                                        if (err == -EMSGSIZE)
                                                goto nlmsg_failure;
                                        ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
                                }
                        }
                } else
#endif
                        RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
        }

        nlh->nlmsg_len = skb->tail - b;
        return skb->len;

nlmsg_failure:
rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -1;
}

int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
        struct rtattr **rta = arg;
        struct rtmsg *rtm = NLMSG_DATA(nlh);
        struct rtable *rt = NULL;
        u32 dst = 0;
        u32 src = 0;
        int iif = 0;
        int err = -ENOBUFS;
        struct sk_buff *skb;

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
                goto out;

        /* Reserve room for dummy headers; this skb can pass
           through a good chunk of the routing engine.
         */
        skb->mac.raw = skb->data;
        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

        if (rta[RTA_SRC - 1])
                memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
        if (rta[RTA_DST - 1])
                memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
        if (rta[RTA_IIF - 1])
                memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));

        if (iif) {
                struct net_device *dev = __dev_get_by_index(iif);
                err = -ENODEV;
                if (!dev)
                        goto out_free;
                skb->protocol   = htons(ETH_P_IP);
                skb->dev        = dev;
                local_bh_disable();
                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
                local_bh_enable();
                rt = (struct rtable*)skb->dst;
                if (!err && rt->u.dst.error)
                        err = -rt->u.dst.error;
        } else {
                int oif = 0;
                if (rta[RTA_OIF - 1])
                        memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
                err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
        }
        if (err)
                goto out_free;

        skb->dst = &rt->u.dst;
        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;

        NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;

        err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
                                RTM_NEWROUTE, 0);
        if (!err)
                goto out_free;
        if (err < 0) {
                err = -EMSGSIZE;
                goto out_free;
        }

        err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
        if (err > 0)
                err = 0;
out:    return err;

out_free:
        kfree_skb(skb);
        goto out;
}

int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
{
        struct rtable *rt;
        int h, s_h;
        int idx, s_idx;

        s_h = cb->args[0];
        s_idx = idx = cb->args[1];
        for (h = 0; h <= rt_hash_mask; h++) {
                if (h < s_h) continue;
                if (h > s_h)
                        s_idx = 0;
                read_lock_bh(&rt_hash_table[h].lock);
                for (rt = rt_hash_table[h].chain, idx = 0; rt;
                     rt = rt->u.rt_next, idx++) {
                        if (idx < s_idx)
                                continue;
                        skb->dst = dst_clone(&rt->u.dst);
                        if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
                                         cb->nlh->nlmsg_seq,
                                         RTM_NEWROUTE, 1) <= 0) {
                                dst_release(xchg(&skb->dst, NULL));
                                read_unlock_bh(&rt_hash_table[h].lock);
                                goto done;
                        }
                        dst_release(xchg(&skb->dst, NULL));
                }
                read_unlock_bh(&rt_hash_table[h].lock);
        }

done:
        cb->args[0] = h;
        cb->args[1] = idx;
        return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
        rt_cache_flush(0);
}

#ifdef CONFIG_SYSCTL
static int flush_delay;

static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
                                        struct file *filp, void *buffer,
                                        size_t *lenp)
{
        if (write) {
                proc_dointvec(ctl, write, filp, buffer, lenp);
                rt_cache_flush(flush_delay);
                return 0;
        }

        return -EINVAL;
}

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name,
                                                int nlen, void *oldval,
                                                size_t *oldlenp, void *newval,
                                                size_t newlen, void **context)
{
        int delay;
        if (newlen != sizeof(int))
                return -EINVAL;
        if (get_user(delay, (int *)newval))
                return -EFAULT;
        rt_cache_flush(delay);
        return 0;
}
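
/* Both handlers funnel into rt_cache_flush(): the proc handler parses an
 * ASCII delay written to /proc/sys/net/ipv4/route/flush, while the
 * strategy routine serves the binary sysctl(2) interface.  For example,
 * from a shell:
 *
 *      echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * flushes the routing cache immediately.
 */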

ctl_table ipv4_route_table[] = {
        {
                ctl_name:       NET_IPV4_ROUTE_FLUSH,
                procname:       "flush",
                data:           &flush_delay,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &ipv4_sysctl_rtcache_flush,
                strategy:       &ipv4_sysctl_rtcache_flush_strategy,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_MIN_DELAY,
                procname:       "min_delay",
                data:           &ip_rt_min_delay,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec_jiffies,
                strategy:       &sysctl_jiffies,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_MAX_DELAY,
                procname:       "max_delay",
                data:           &ip_rt_max_delay,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec_jiffies,
                strategy:       &sysctl_jiffies,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_GC_THRESH,
                procname:       "gc_thresh",
                data:           &ipv4_dst_ops.gc_thresh,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_MAX_SIZE,
                procname:       "max_size",
                data:           &ip_rt_max_size,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_GC_MIN_INTERVAL,
                procname:       "gc_min_interval",
                data:           &ip_rt_gc_min_interval,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec_jiffies,
                strategy:       &sysctl_jiffies,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_GC_TIMEOUT,
                procname:       "gc_timeout",
                data:           &ip_rt_gc_timeout,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec_jiffies,
                strategy:       &sysctl_jiffies,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_GC_INTERVAL,
                procname:       "gc_interval",
                data:           &ip_rt_gc_interval,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec_jiffies,
                strategy:       &sysctl_jiffies,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_REDIRECT_LOAD,
                procname:       "redirect_load",
                data:           &ip_rt_redirect_load,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_REDIRECT_NUMBER,
                procname:       "redirect_number",
                data:           &ip_rt_redirect_number,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_REDIRECT_SILENCE,
                procname:       "redirect_silence",
                data:           &ip_rt_redirect_silence,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_ERROR_COST,
                procname:       "error_cost",
                data:           &ip_rt_error_cost,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_ERROR_BURST,
                procname:       "error_burst",
                data:           &ip_rt_error_burst,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_GC_ELASTICITY,
                procname:       "gc_elasticity",
                data:           &ip_rt_gc_elasticity,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_MTU_EXPIRES,
                procname:       "mtu_expires",
                data:           &ip_rt_mtu_expires,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec_jiffies,
                strategy:       &sysctl_jiffies,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_MIN_PMTU,
                procname:       "min_pmtu",
                data:           &ip_rt_min_pmtu,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_MIN_ADVMSS,
                procname:       "min_adv_mss",
                data:           &ip_rt_min_advmss,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_SECRET_INTERVAL,
                procname:       "secret_interval",
                data:           &ip_rt_secret_interval,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec_jiffies,
                strategy:       &sysctl_jiffies,
        },
        { 0 }
};
#endif

#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/* IP route accounting ptr for this logical cpu number. */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + cpu_logical_map(i) * 256)
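
/* Layout: one array of 256 struct ip_rt_acct counter slots (indexed by
 * route class) per logical CPU, laid out back to back.  Writers touch
 * only their own CPU's slice, so no locking is needed; the read routine
 * below sums the slices one 32-bit word at a time.
 */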

static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
                           int length, int *eof, void *data)
{
        unsigned int i;

        if ((offset & 3) || (length & 3))
                return -EIO;

        if (offset >= sizeof(struct ip_rt_acct) * 256) {
                *eof = 1;
                return 0;
        }

        if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
                length = sizeof(struct ip_rt_acct) * 256 - offset;
                *eof = 1;
        }

        offset /= sizeof(u32);

        if (length > 0) {
                u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
                u32 *dst = (u32 *) buffer;

                /* Copy first cpu. */
                *start = buffer;
                memcpy(dst, src, length);

                /* Add the other cpus in, one int at a time */
                for (i = 1; i < smp_num_cpus; i++) {
                        unsigned int j;

                        src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;

                        for (j = 0; j < length/4; j++)
                                dst[j] += src[j];
                }
        }
        return length;
}
#endif

void __init ip_rt_init(void)
{
        int i, order, goal;

        rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
                             (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
        for (order = 0;
             (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
                /* NOTHING */;
        ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
        memset(ip_rt_acct, 0, PAGE_SIZE << order);
#endif

        ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
                                                     sizeof(struct rtable),
                                                     0, SLAB_HWCACHE_ALIGN,
                                                     NULL, NULL);

        if (!ipv4_dst_ops.kmem_cachep)
                panic("IP: failed to allocate ip_dst_cache\n");

        goal = num_physpages >> (26 - PAGE_SHIFT);

        for (order = 0; (1UL << order) < goal; order++)
                /* NOTHING */;

        do {
                rt_hash_mask = (1UL << order) * PAGE_SIZE /
                        sizeof(struct rt_hash_bucket);
                while (rt_hash_mask & (rt_hash_mask - 1))
                        rt_hash_mask--;
                rt_hash_table = (struct rt_hash_bucket *)
                        __get_free_pages(GFP_ATOMIC, order);
        } while (rt_hash_table == NULL && --order > 0);
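
        /* The loop above sizes the hash table proportionally to physical
         * memory (goal = num_physpages >> (26 - PAGE_SHIFT) pages), rounds
         * the bucket count down to a power of two so rt_hash_mask can serve
         * as a mask, and retries with smaller allocation orders if the page
         * allocator cannot satisfy the request.
         */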

        if (!rt_hash_table)
                panic("Failed to allocate IP route cache hash table\n");

        printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
               rt_hash_mask,
               (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);

        for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
                /* NOTHING */;

        rt_hash_mask--;
        for (i = 0; i <= rt_hash_mask; i++) {
                rt_hash_table[i].lock = RW_LOCK_UNLOCKED;
                rt_hash_table[i].chain = NULL;
        }

        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
        ip_rt_max_size = (rt_hash_mask + 1) * 16;

        devinet_init();
        ip_fib_init();

        rt_flush_timer.function = rt_run_flush;
        rt_periodic_timer.function = rt_check_expire;
        rt_secret_timer.function = rt_secret_rebuild;

        /* All the timers started at system startup tend
           to synchronize. Perturb them a bit.
         */
        rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
                                        ip_rt_gc_interval;
        add_timer(&rt_periodic_timer);

        rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
                ip_rt_secret_interval;
        add_timer(&rt_secret_timer);

        proc_net_create ("rt_cache", 0, rt_cache_get_info);
        proc_net_create ("rt_cache_stat", 0, rt_cache_stat_get_info);
#ifdef CONFIG_NET_CLS_ROUTE
        create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL);
#endif
}
