https://opencores.org/ocsvn/or1k/or1k/trunk
Subversion Repositories or1k
Compare Revisions
- This comparison shows the changes necessary to convert path
/or1k/trunk/linux/linux-2.4/net/sched
- from Rev 1275 to Rev 1765
Rev 1275 → Rev 1765
/cls_tcindex.c
0,0 → 1,509
/* |
* net/sched/cls_tcindex.c Packet classifier for skb->tc_index |
* |
* Written 1998,1999 by Werner Almesberger, EPFL ICA |
*/ |
|
#include <linux/config.h> |
#include <linux/module.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/skbuff.h> |
#include <linux/errno.h> |
#include <linux/netdevice.h> |
#include <net/ip.h> |
#include <net/pkt_sched.h> |
#include <net/route.h> |
|
|
/* |
* Not quite sure if we need all the xchgs Alexey uses when accessing things. |
* Can always add them later ... :) |
*/ |
|
/* |
* Passing parameters to the root seems to be done more awkwardly than really |
* necessary. At least, u32 doesn't seem to use such dirty hacks. To be |
* verified. FIXME. |
*/ |
|
#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */ |
#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ |
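 |
/* |
 * Editorial illustration, not part of the original source: a minimal |
 * sketch of how these two constants drive the table choice made later |
 * in tcindex_change().  The helper name is hypothetical.  With a mask |
 * of 0x00f0 and a shift of 4 the largest possible key is 15, below the |
 * threshold, so a 16-slot perfect hash is allocated; with mask 0xffff |
 * and shift 0 the key space is 65536, so the 64-bucket imperfect hash |
 * with chained entries is used instead. |
 */ |
#if 0 /* illustration only */ |
static int example_tcindex_hash_size(__u16 mask, int shift) |
{ |
	int max_key = mask >> shift; |
 |
	if (max_key < PERFECT_HASH_THRESHOLD) |
		return max_key+1;	/* perfect hash: one slot per key */ |
	return DEFAULT_HASH_SIZE;	/* imperfect hash with chaining */ |
} |
#endif |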
|
|
#if 1 /* control */ |
#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) |
#else |
#define DPRINTK(format,args...) |
#endif |
|
#if 0 /* data */ |
#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) |
#else |
#define D2PRINTK(format,args...) |
#endif |
|
|
#define PRIV(tp) ((struct tcindex_data *) (tp)->root) |
|
|
struct tcindex_filter_result { |
struct tcf_police *police; |
struct tcf_result res; |
}; |
|
struct tcindex_filter { |
__u16 key; |
struct tcindex_filter_result result; |
struct tcindex_filter *next; |
}; |
|
|
struct tcindex_data { |
struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */ |
struct tcindex_filter **h; /* imperfect hash; only used if !perfect; |
NULL if unused */ |
__u16 mask; /* AND key with mask */ |
int shift; /* shift ANDed key to the right */ |
int hash; /* hash table size; 0 if undefined */ |
int alloc_hash; /* allocated size */ |
int fall_through; /* 0: only classify if explicit match */ |
}; |
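 |
/* |
 * Editorial illustration, not part of the original source: a sketch of |
 * how the lookup key is derived from skb->tc_index (see |
 * tcindex_classify below).  In a typical diffserv setup the DS field |
 * ends up in tc_index; a mask of 0xfc with a shift of 2 then extracts |
 * the DSCP as the key.  With fall_through set, keys without an |
 * explicit entry are still mapped to the class whose minor id equals |
 * the key. |
 */ |
#if 0 /* illustration only */ |
static __u16 example_tcindex_key(struct tcindex_data *p, __u16 tc_index) |
{ |
	return (tc_index & p->mask) >> p->shift; |
} |
#endif |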
|
|
static struct tcindex_filter_result *lookup(struct tcindex_data *p,__u16 key) |
{ |
struct tcindex_filter *f; |
|
if (p->perfect) |
return p->perfect[key].res.class ? p->perfect+key : NULL; |
if (!p->h) |
return NULL; |
for (f = p->h[key % p->hash]; f; f = f->next) { |
if (f->key == key) |
return &f->result; |
} |
return NULL; |
} |
|
|
static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp, |
struct tcf_result *res) |
{ |
struct tcindex_data *p = PRIV(tp); |
struct tcindex_filter_result *f; |
|
D2PRINTK("tcindex_classify(skb %p,tp %p,res %p),p %p\n",skb,tp,res,p); |
|
f = lookup(p,(skb->tc_index & p->mask) >> p->shift); |
if (!f) { |
if (!p->fall_through) |
return -1; |
res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), |
(skb->tc_index& p->mask) >> p->shift); |
res->class = 0; |
D2PRINTK("alg 0x%x\n",res->classid); |
return 0; |
} |
*res = f->res; |
D2PRINTK("map 0x%x\n",res->classid); |
#ifdef CONFIG_NET_CLS_POLICE |
if (f->police) { |
int result; |
|
result = tcf_police(skb,f->police); |
D2PRINTK("police %d\n",res); |
return result; |
} |
#endif |
return 0; |
} |
|
|
static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) |
{ |
struct tcindex_data *p = PRIV(tp); |
struct tcindex_filter_result *r; |
|
DPRINTK("tcindex_get(tp %p,handle 0x%08x)\n",tp,handle); |
if (p->perfect && handle >= p->alloc_hash) |
return 0; |
r = lookup(PRIV(tp),handle); |
return r && r->res.class ? (unsigned long) r : 0; |
} |
|
|
static void tcindex_put(struct tcf_proto *tp, unsigned long f) |
{ |
DPRINTK("tcindex_put(tp %p,f 0x%lx)\n",tp,f); |
} |
|
|
static int tcindex_init(struct tcf_proto *tp) |
{ |
struct tcindex_data *p; |
|
DPRINTK("tcindex_init(tp %p)\n",tp); |
MOD_INC_USE_COUNT; |
p = kmalloc(sizeof(struct tcindex_data),GFP_KERNEL); |
if (!p) { |
MOD_DEC_USE_COUNT; |
return -ENOMEM; |
} |
tp->root = p; |
p->perfect = NULL; |
p->h = NULL; |
p->hash = 0; |
p->mask = 0xffff; |
p->shift = 0; |
p->fall_through = 1; |
return 0; |
} |
|
|
static int tcindex_delete(struct tcf_proto *tp, unsigned long arg) |
{ |
struct tcindex_data *p = PRIV(tp); |
struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; |
struct tcindex_filter *f = NULL; |
unsigned long cl; |
|
DPRINTK("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n",tp,arg,p,f); |
if (p->perfect) { |
if (!r->res.class) |
return -ENOENT; |
} else { |
int i; |
struct tcindex_filter **walk = NULL; |
|
for (i = 0; i < p->hash; i++) |
for (walk = p->h+i; *walk; walk = &(*walk)->next) |
if (&(*walk)->result == r) |
goto found; |
return -ENOENT; |
|
found: |
f = *walk; |
tcf_tree_lock(tp); |
*walk = f->next; |
tcf_tree_unlock(tp); |
} |
cl = __cls_set_class(&r->res.class,0); |
if (cl) |
tp->q->ops->cl_ops->unbind_tcf(tp->q,cl); |
#ifdef CONFIG_NET_CLS_POLICE |
tcf_police_release(r->police); |
#endif |
if (f) |
kfree(f); |
return 0; |
} |
|
|
/* |
* There are no parameters for tcindex_init, so we overload tcindex_change |
*/ |
|
|
static int tcindex_change(struct tcf_proto *tp,unsigned long base,u32 handle, |
struct rtattr **tca,unsigned long *arg) |
{ |
struct tcindex_filter_result new_filter_result = { |
NULL, /* no policing */ |
{ 0,0 }, /* no classification */ |
}; |
struct rtattr *opt = tca[TCA_OPTIONS-1]; |
struct rtattr *tb[TCA_TCINDEX_MAX]; |
struct tcindex_data *p = PRIV(tp); |
struct tcindex_filter *f; |
struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg; |
struct tcindex_filter **walk; |
int hash,shift; |
__u16 mask; |
|
DPRINTK("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," |
"p %p,r %p\n",tp,handle,tca,arg,opt,p,r); |
if (arg) |
DPRINTK("*arg = 0x%lx\n",*arg); |
if (!opt) |
return 0; |
if (rtattr_parse(tb,TCA_TCINDEX_MAX,RTA_DATA(opt),RTA_PAYLOAD(opt)) < 0) |
return -EINVAL; |
if (!tb[TCA_TCINDEX_HASH-1]) { |
hash = p->hash; |
} else { |
if (RTA_PAYLOAD(tb[TCA_TCINDEX_HASH-1]) < sizeof(int)) |
return -EINVAL; |
hash = *(int *) RTA_DATA(tb[TCA_TCINDEX_HASH-1]); |
} |
if (!tb[TCA_TCINDEX_MASK-1]) { |
mask = p->mask; |
} else { |
if (RTA_PAYLOAD(tb[TCA_TCINDEX_MASK-1]) < sizeof(__u16)) |
return -EINVAL; |
mask = *(__u16 *) RTA_DATA(tb[TCA_TCINDEX_MASK-1]); |
} |
if (!tb[TCA_TCINDEX_SHIFT-1]) |
shift = p->shift; |
else { |
if (RTA_PAYLOAD(tb[TCA_TCINDEX_SHIFT-1]) < sizeof(__u16)) |
return -EINVAL; |
shift = *(int *) RTA_DATA(tb[TCA_TCINDEX_SHIFT-1]); |
} |
if (p->perfect && hash <= (mask >> shift)) |
return -EBUSY; |
if (p->perfect && hash > p->alloc_hash) |
return -EBUSY; |
if (p->h && hash != p->alloc_hash) |
return -EBUSY; |
p->hash = hash; |
p->mask = mask; |
p->shift = shift; |
if (tb[TCA_TCINDEX_FALL_THROUGH-1]) { |
if (RTA_PAYLOAD(tb[TCA_TCINDEX_FALL_THROUGH-1]) < sizeof(int)) |
return -EINVAL; |
p->fall_through = |
*(int *) RTA_DATA(tb[TCA_TCINDEX_FALL_THROUGH-1]); |
} |
DPRINTK("classid/police %p/%p\n",tb[TCA_TCINDEX_CLASSID-1], |
tb[TCA_TCINDEX_POLICE-1]); |
if (!tb[TCA_TCINDEX_CLASSID-1] && !tb[TCA_TCINDEX_POLICE-1]) |
return 0; |
if (!hash) { |
if ((mask >> shift) < PERFECT_HASH_THRESHOLD) { |
p->hash = (mask >> shift)+1; |
} else { |
p->hash = DEFAULT_HASH_SIZE; |
} |
} |
if (!p->perfect && !p->h) { |
p->alloc_hash = p->hash; |
DPRINTK("hash %d mask %d\n",p->hash,p->mask); |
if (p->hash > (mask >> shift)) { |
p->perfect = kmalloc(p->hash* |
sizeof(struct tcindex_filter_result),GFP_KERNEL); |
if (!p->perfect) |
return -ENOMEM; |
memset(p->perfect, 0, |
p->hash * sizeof(struct tcindex_filter_result)); |
} else { |
p->h = kmalloc(p->hash*sizeof(struct tcindex_filter *), |
GFP_KERNEL); |
if (!p->h) |
return -ENOMEM; |
memset(p->h, 0, p->hash*sizeof(struct tcindex_filter *)); |
} |
} |
/* |
* Note: this could be as restrictive as |
* if (handle & ~(mask >> shift)) |
* but then, we'd fail handles that may become valid after some |
* future mask change. While this is extremely unlikely to ever |
* matter, the check below is safer (and also more |
* backwards-compatible). |
*/ |
if (p->perfect && handle >= p->alloc_hash) |
return -EINVAL; |
if (p->perfect) { |
r = p->perfect+handle; |
} else { |
r = lookup(p,handle); |
DPRINTK("r=%p\n",r); |
if (!r) |
r = &new_filter_result; |
} |
DPRINTK("r=%p\n",r); |
if (tb[TCA_TCINDEX_CLASSID-1]) { |
unsigned long cl = cls_set_class(tp,&r->res.class,0); |
|
if (cl) |
tp->q->ops->cl_ops->unbind_tcf(tp->q,cl); |
r->res.classid = *(__u32 *) RTA_DATA(tb[TCA_TCINDEX_CLASSID-1]); |
r->res.class = tp->q->ops->cl_ops->bind_tcf(tp->q,base, |
r->res.classid); |
if (!r->res.class) { |
r->res.classid = 0; |
return -ENOENT; |
} |
} |
#ifdef CONFIG_NET_CLS_POLICE |
{ |
struct tcf_police *police; |
|
police = tb[TCA_TCINDEX_POLICE-1] ? |
tcf_police_locate(tb[TCA_TCINDEX_POLICE-1],NULL) : NULL; |
tcf_tree_lock(tp); |
police = xchg(&r->police,police); |
tcf_tree_unlock(tp); |
tcf_police_release(police); |
} |
#endif |
if (r != &new_filter_result) |
return 0; |
f = kmalloc(sizeof(struct tcindex_filter),GFP_KERNEL); |
if (!f) |
return -ENOMEM; |
f->key = handle; |
f->result = new_filter_result; |
f->next = NULL; |
for (walk = p->h+(handle % p->hash); *walk; walk = &(*walk)->next) |
/* nothing */; |
wmb(); |
*walk = f; |
return 0; |
} |
|
|
static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) |
{ |
struct tcindex_data *p = PRIV(tp); |
struct tcindex_filter *f,*next; |
int i; |
|
DPRINTK("tcindex_walk(tp %p,walker %p),p %p\n",tp,walker,p); |
if (p->perfect) { |
for (i = 0; i < p->hash; i++) { |
if (!p->perfect[i].res.class) |
continue; |
if (walker->count >= walker->skip) { |
if (walker->fn(tp, |
(unsigned long) (p->perfect+i), walker) |
< 0) { |
walker->stop = 1; |
return; |
} |
} |
walker->count++; |
} |
} |
if (!p->h) |
return; |
for (i = 0; i < p->hash; i++) { |
for (f = p->h[i]; f; f = next) { |
next = f->next; |
if (walker->count >= walker->skip) { |
if (walker->fn(tp,(unsigned long) &f->result, |
walker) < 0) { |
walker->stop = 1; |
return; |
} |
} |
walker->count++; |
} |
} |
} |
|
|
static int tcindex_destroy_element(struct tcf_proto *tp, |
unsigned long arg, struct tcf_walker *walker) |
{ |
return tcindex_delete(tp,arg); |
} |
|
|
static void tcindex_destroy(struct tcf_proto *tp) |
{ |
struct tcindex_data *p = PRIV(tp); |
struct tcf_walker walker; |
|
DPRINTK("tcindex_destroy(tp %p),p %p\n",tp,p); |
walker.count = 0; |
walker.skip = 0; |
walker.fn = &tcindex_destroy_element; |
tcindex_walk(tp,&walker); |
if (p->perfect) |
kfree(p->perfect); |
if (p->h) |
kfree(p->h); |
kfree(p); |
tp->root = NULL; |
MOD_DEC_USE_COUNT; |
} |
|
|
static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, |
struct sk_buff *skb, struct tcmsg *t) |
{ |
struct tcindex_data *p = PRIV(tp); |
struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; |
unsigned char *b = skb->tail; |
struct rtattr *rta; |
|
DPRINTK("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n", |
tp,fh,skb,t,p,r,b); |
DPRINTK("p->perfect %p p->h %p\n",p->perfect,p->h); |
rta = (struct rtattr *) b; |
RTA_PUT(skb,TCA_OPTIONS,0,NULL); |
if (!fh) { |
t->tcm_handle = ~0; /* whatever ... */ |
RTA_PUT(skb,TCA_TCINDEX_HASH,sizeof(p->hash),&p->hash); |
RTA_PUT(skb,TCA_TCINDEX_MASK,sizeof(p->mask),&p->mask); |
RTA_PUT(skb,TCA_TCINDEX_SHIFT,sizeof(p->shift),&p->shift); |
RTA_PUT(skb,TCA_TCINDEX_FALL_THROUGH,sizeof(p->fall_through), |
&p->fall_through); |
} else { |
if (p->perfect) { |
t->tcm_handle = r-p->perfect; |
} else { |
struct tcindex_filter *f; |
int i; |
|
t->tcm_handle = 0; |
for (i = 0; !t->tcm_handle && i < p->hash; i++) { |
for (f = p->h[i]; !t->tcm_handle && f; |
f = f->next) { |
if (&f->result == r) |
t->tcm_handle = f->key; |
} |
} |
} |
DPRINTK("handle = %d\n",t->tcm_handle); |
if (r->res.class) |
RTA_PUT(skb, TCA_TCINDEX_CLASSID, 4, &r->res.classid); |
#ifdef CONFIG_NET_CLS_POLICE |
if (r->police) { |
struct rtattr *p_rta = (struct rtattr *) skb->tail; |
|
RTA_PUT(skb,TCA_TCINDEX_POLICE,0,NULL); |
if (tcf_police_dump(skb,r->police) < 0) |
goto rtattr_failure; |
p_rta->rta_len = skb->tail-(u8 *) p_rta; |
} |
#endif |
} |
rta->rta_len = skb->tail-b; |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
struct tcf_proto_ops cls_tcindex_ops = { |
NULL, |
"tcindex", |
tcindex_classify, |
tcindex_init, |
tcindex_destroy, |
|
tcindex_get, |
tcindex_put, |
tcindex_change, |
tcindex_delete, |
tcindex_walk, |
tcindex_dump |
}; |
|
|
#ifdef MODULE |
int init_module(void) |
{ |
return register_tcf_proto_ops(&cls_tcindex_ops); |
} |
|
void cleanup_module(void) |
{ |
unregister_tcf_proto_ops(&cls_tcindex_ops); |
} |
#endif |
MODULE_LICENSE("GPL"); |
/cls_route.c
0,0 → 1,635
/* |
* net/sched/cls_route.c ROUTE4 classifier. |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
*/ |
|
#include <linux/module.h> |
#include <linux/config.h> |
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/if_ether.h> |
#include <linux/inet.h> |
#include <linux/netdevice.h> |
#include <linux/etherdevice.h> |
#include <linux/notifier.h> |
#include <net/ip.h> |
#include <net/route.h> |
#include <linux/skbuff.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
/* |
1. For now we assume that route tags < 256. |
It allows to use direct table lookups, instead of hash tables. |
2. For now we assume that "from TAG" and "fromdev DEV" statements |
are mutually exclusive. |
3. "to TAG from ANY" has higher priority, than "to ANY from XXX" |
*/ |
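 |
/* |
 * Editorial illustration, not part of the original source: a rough |
 * sketch, assuming the handle encoding used by route4_change() below, |
 * of how a filter handle packs the realms.  The low 16 bits hold the |
 * destination realm ("to", 0..255), or 0x8000 when no "to" realm is |
 * given; the high 16 bits hold the source realm ("from"), or |
 * iif|0x8000 for an input-interface match, or 0xFFFF for a wildcard |
 * source.  The helper name is hypothetical. |
 */ |
#if 0 /* illustration only */ |
static u32 example_route4_handle(u32 to_id, u32 from_id) |
{ |
	u32 handle = to_id & 0xFF;	/* "to TAG": direct table index */ |
 |
	handle |= (from_id & 0xFF) << 16; /* "from TAG": bucket selector */ |
	return handle; |
} |
#endif |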
|
struct route4_fastmap |
{ |
struct route4_filter *filter; |
u32 id; |
int iif; |
}; |
|
struct route4_head |
{ |
struct route4_fastmap fastmap[16]; |
struct route4_bucket *table[256+1]; |
}; |
|
struct route4_bucket |
{ |
struct route4_filter *ht[16+16+1]; |
}; |
|
struct route4_filter |
{ |
struct route4_filter *next; |
u32 id; |
int iif; |
|
struct tcf_result res; |
#ifdef CONFIG_NET_CLS_POLICE |
struct tcf_police *police; |
#endif |
|
u32 handle; |
struct route4_bucket *bkt; |
}; |
|
#define ROUTE4_FAILURE ((struct route4_filter*)(-1L)) |
|
static __inline__ int route4_fastmap_hash(u32 id, int iif) |
{ |
return id&0xF; |
} |
|
static void route4_reset_fastmap(struct net_device *dev, struct route4_head *head, u32 id) |
{ |
spin_lock_bh(&dev->queue_lock); |
memset(head->fastmap, 0, sizeof(head->fastmap)); |
spin_unlock_bh(&dev->queue_lock); |
} |
|
static void __inline__ |
route4_set_fastmap(struct route4_head *head, u32 id, int iif, |
struct route4_filter *f) |
{ |
int h = route4_fastmap_hash(id, iif); |
head->fastmap[h].id = id; |
head->fastmap[h].iif = iif; |
head->fastmap[h].filter = f; |
} |
|
static __inline__ int route4_hash_to(u32 id) |
{ |
return id&0xFF; |
} |
|
static __inline__ int route4_hash_from(u32 id) |
{ |
return (id>>16)&0xF; |
} |
|
static __inline__ int route4_hash_iif(int iif) |
{ |
return 16 + ((iif>>16)&0xF); |
} |
|
static __inline__ int route4_hash_wild(void) |
{ |
return 32; |
} |
|
#ifdef CONFIG_NET_CLS_POLICE |
#define IF_ROUTE_POLICE \ |
if (f->police) { \ |
int pol_res = tcf_police(skb, f->police); \ |
if (pol_res >= 0) return pol_res; \ |
dont_cache = 1; \ |
continue; \ |
} \ |
if (!dont_cache) |
#else |
#define IF_ROUTE_POLICE |
#endif |
|
|
static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp, |
struct tcf_result *res) |
{ |
struct route4_head *head = (struct route4_head*)tp->root; |
struct dst_entry *dst; |
struct route4_bucket *b; |
struct route4_filter *f; |
#ifdef CONFIG_NET_CLS_POLICE |
int dont_cache = 0; |
#endif |
u32 id, h; |
int iif; |
|
if ((dst = skb->dst) == NULL) |
goto failure; |
|
id = dst->tclassid; |
if (head == NULL) |
goto old_method; |
|
iif = ((struct rtable*)dst)->key.iif; |
|
h = route4_fastmap_hash(id, iif); |
if (id == head->fastmap[h].id && |
iif == head->fastmap[h].iif && |
(f = head->fastmap[h].filter) != NULL) { |
if (f == ROUTE4_FAILURE) |
goto failure; |
|
*res = f->res; |
return 0; |
} |
|
h = route4_hash_to(id); |
|
restart: |
if ((b = head->table[h]) != NULL) { |
f = b->ht[route4_hash_from(id)]; |
|
for ( ; f; f = f->next) { |
if (f->id == id) { |
*res = f->res; |
IF_ROUTE_POLICE route4_set_fastmap(head, id, iif, f); |
return 0; |
} |
} |
|
for (f = b->ht[route4_hash_iif(iif)]; f; f = f->next) { |
if (f->iif == iif) { |
*res = f->res; |
IF_ROUTE_POLICE route4_set_fastmap(head, id, iif, f); |
return 0; |
} |
} |
|
for (f = b->ht[route4_hash_wild()]; f; f = f->next) { |
*res = f->res; |
IF_ROUTE_POLICE route4_set_fastmap(head, id, iif, f); |
return 0; |
} |
|
} |
if (h < 256) { |
h = 256; |
id &= ~0xFFFF; |
goto restart; |
} |
|
#ifdef CONFIG_NET_CLS_POLICE |
if (!dont_cache) |
#endif |
route4_set_fastmap(head, id, iif, ROUTE4_FAILURE); |
failure: |
return -1; |
|
old_method: |
if (id && (TC_H_MAJ(id) == 0 || |
!(TC_H_MAJ(id^tp->q->handle)))) { |
res->classid = id; |
res->class = 0; |
return 0; |
} |
return -1; |
} |
|
static u32 to_hash(u32 id) |
{ |
u32 h = id&0xFF; |
if (id&0x8000) |
h += 256; |
return h; |
} |
|
static u32 from_hash(u32 id) |
{ |
id &= 0xFFFF; |
if (id == 0xFFFF) |
return 32; |
if (!(id & 0x8000)) { |
if (id > 255) |
return 256; |
return id&0xF; |
} |
return 16 + (id&0xF); |
} |
|
static unsigned long route4_get(struct tcf_proto *tp, u32 handle) |
{ |
struct route4_head *head = (struct route4_head*)tp->root; |
struct route4_bucket *b; |
struct route4_filter *f; |
unsigned h1, h2; |
|
if (!head) |
return 0; |
|
h1 = to_hash(handle); |
if (h1 > 256) |
return 0; |
|
h2 = from_hash(handle>>16); |
if (h2 > 32) |
return 0; |
|
if ((b = head->table[h1]) != NULL) { |
for (f = b->ht[h2]; f; f = f->next) |
if (f->handle == handle) |
return (unsigned long)f; |
} |
return 0; |
} |
|
static void route4_put(struct tcf_proto *tp, unsigned long f) |
{ |
} |
|
static int route4_init(struct tcf_proto *tp) |
{ |
MOD_INC_USE_COUNT; |
return 0; |
} |
|
static void route4_destroy(struct tcf_proto *tp) |
{ |
struct route4_head *head = xchg(&tp->root, NULL); |
int h1, h2; |
|
if (head == NULL) { |
MOD_DEC_USE_COUNT; |
return; |
} |
|
for (h1=0; h1<=256; h1++) { |
struct route4_bucket *b; |
|
if ((b = head->table[h1]) != NULL) { |
for (h2=0; h2<=32; h2++) { |
struct route4_filter *f; |
|
while ((f = b->ht[h2]) != NULL) { |
unsigned long cl; |
|
b->ht[h2] = f->next; |
if ((cl = __cls_set_class(&f->res.class, 0)) != 0) |
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); |
#ifdef CONFIG_NET_CLS_POLICE |
tcf_police_release(f->police); |
#endif |
kfree(f); |
} |
} |
kfree(b); |
} |
} |
kfree(head); |
MOD_DEC_USE_COUNT; |
} |
|
static int route4_delete(struct tcf_proto *tp, unsigned long arg) |
{ |
struct route4_head *head = (struct route4_head*)tp->root; |
struct route4_filter **fp, *f = (struct route4_filter*)arg; |
unsigned h = 0; |
struct route4_bucket *b; |
int i; |
|
if (!head || !f) |
return -EINVAL; |
|
h = f->handle; |
b = f->bkt; |
|
for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) { |
if (*fp == f) { |
unsigned long cl; |
|
tcf_tree_lock(tp); |
*fp = f->next; |
tcf_tree_unlock(tp); |
|
route4_reset_fastmap(tp->q->dev, head, f->id); |
|
if ((cl = cls_set_class(tp, &f->res.class, 0)) != 0) |
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); |
|
#ifdef CONFIG_NET_CLS_POLICE |
tcf_police_release(f->police); |
#endif |
kfree(f); |
|
/* Strip tree */ |
|
for (i=0; i<=32; i++) |
if (b->ht[i]) |
return 0; |
|
/* OK, session has no flows */ |
tcf_tree_lock(tp); |
head->table[to_hash(h)] = NULL; |
tcf_tree_unlock(tp); |
|
kfree(b); |
return 0; |
} |
} |
return 0; |
} |
|
static int route4_change(struct tcf_proto *tp, unsigned long base, |
u32 handle, |
struct rtattr **tca, |
unsigned long *arg) |
{ |
struct route4_head *head = tp->root; |
struct route4_filter *f, *f1, **ins_f; |
struct route4_bucket *b; |
struct rtattr *opt = tca[TCA_OPTIONS-1]; |
struct rtattr *tb[TCA_ROUTE4_MAX]; |
unsigned h1, h2; |
int err; |
|
if (opt == NULL) |
return handle ? -EINVAL : 0; |
|
if (rtattr_parse(tb, TCA_ROUTE4_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0) |
return -EINVAL; |
|
if ((f = (struct route4_filter*)*arg) != NULL) { |
/* Node exists: adjust only classid */ |
|
if (f->handle != handle && handle) |
return -EINVAL; |
if (tb[TCA_ROUTE4_CLASSID-1]) { |
unsigned long cl; |
|
f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]); |
cl = cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); |
if (cl) |
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); |
} |
#ifdef CONFIG_NET_CLS_POLICE |
if (tb[TCA_ROUTE4_POLICE-1]) { |
struct tcf_police *police = tcf_police_locate(tb[TCA_ROUTE4_POLICE-1], tca[TCA_RATE-1]); |
|
tcf_tree_lock(tp); |
police = xchg(&f->police, police); |
tcf_tree_unlock(tp); |
|
tcf_police_release(police); |
} |
#endif |
return 0; |
} |
|
/* Now more serious part... */ |
|
if (head == NULL) { |
head = kmalloc(sizeof(struct route4_head), GFP_KERNEL); |
if (head == NULL) |
return -ENOBUFS; |
memset(head, 0, sizeof(struct route4_head)); |
|
tcf_tree_lock(tp); |
tp->root = head; |
tcf_tree_unlock(tp); |
} |
|
f = kmalloc(sizeof(struct route4_filter), GFP_KERNEL); |
if (f == NULL) |
return -ENOBUFS; |
|
memset(f, 0, sizeof(*f)); |
|
err = -EINVAL; |
f->handle = 0x8000; |
if (tb[TCA_ROUTE4_TO-1]) { |
if (handle&0x8000) |
goto errout; |
if (RTA_PAYLOAD(tb[TCA_ROUTE4_TO-1]) < 4) |
goto errout; |
f->id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_TO-1]); |
if (f->id > 0xFF) |
goto errout; |
f->handle = f->id; |
} |
if (tb[TCA_ROUTE4_FROM-1]) { |
u32 sid; |
if (tb[TCA_ROUTE4_IIF-1]) |
goto errout; |
if (RTA_PAYLOAD(tb[TCA_ROUTE4_FROM-1]) < 4) |
goto errout; |
sid = (*(u32*)RTA_DATA(tb[TCA_ROUTE4_FROM-1])); |
if (sid > 0xFF) |
goto errout; |
f->handle |= sid<<16; |
f->id |= sid<<16; |
} else if (tb[TCA_ROUTE4_IIF-1]) { |
if (RTA_PAYLOAD(tb[TCA_ROUTE4_IIF-1]) < 4) |
goto errout; |
f->iif = *(u32*)RTA_DATA(tb[TCA_ROUTE4_IIF-1]); |
if (f->iif > 0x7FFF) |
goto errout; |
f->handle |= (f->iif|0x8000)<<16; |
} else |
f->handle |= 0xFFFF<<16; |
|
if (handle) { |
f->handle |= handle&0x7F00; |
if (f->handle != handle) |
goto errout; |
} |
|
if (tb[TCA_ROUTE4_CLASSID-1]) { |
if (RTA_PAYLOAD(tb[TCA_ROUTE4_CLASSID-1]) < 4) |
goto errout; |
f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]); |
} |
|
h1 = to_hash(f->handle); |
if ((b = head->table[h1]) == NULL) { |
err = -ENOBUFS; |
b = kmalloc(sizeof(struct route4_bucket), GFP_KERNEL); |
if (b == NULL) |
goto errout; |
memset(b, 0, sizeof(*b)); |
|
tcf_tree_lock(tp); |
head->table[h1] = b; |
tcf_tree_unlock(tp); |
} |
f->bkt = b; |
|
err = -EEXIST; |
h2 = from_hash(f->handle>>16); |
for (ins_f = &b->ht[h2]; (f1=*ins_f) != NULL; ins_f = &f1->next) { |
if (f->handle < f1->handle) |
break; |
if (f1->handle == f->handle) |
goto errout; |
} |
|
cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); |
#ifdef CONFIG_NET_CLS_POLICE |
if (tb[TCA_ROUTE4_POLICE-1]) |
f->police = tcf_police_locate(tb[TCA_ROUTE4_POLICE-1], tca[TCA_RATE-1]); |
#endif |
|
f->next = f1; |
tcf_tree_lock(tp); |
*ins_f = f; |
tcf_tree_unlock(tp); |
|
route4_reset_fastmap(tp->q->dev, head, f->id); |
*arg = (unsigned long)f; |
return 0; |
|
errout: |
if (f) |
kfree(f); |
return err; |
} |
|
static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) |
{ |
struct route4_head *head = tp->root; |
unsigned h, h1; |
|
if (head == NULL) |
arg->stop = 1; |
|
if (arg->stop) |
return; |
|
for (h = 0; h <= 256; h++) { |
struct route4_bucket *b = head->table[h]; |
|
if (b) { |
for (h1 = 0; h1 <= 32; h1++) { |
struct route4_filter *f; |
|
for (f = b->ht[h1]; f; f = f->next) { |
if (arg->count < arg->skip) { |
arg->count++; |
continue; |
} |
if (arg->fn(tp, (unsigned long)f, arg) < 0) { |
arg->stop = 1; |
break; |
} |
arg->count++; |
} |
} |
} |
} |
} |
|
static int route4_dump(struct tcf_proto *tp, unsigned long fh, |
struct sk_buff *skb, struct tcmsg *t) |
{ |
struct route4_filter *f = (struct route4_filter*)fh; |
unsigned char *b = skb->tail; |
struct rtattr *rta; |
u32 id; |
|
if (f == NULL) |
return skb->len; |
|
t->tcm_handle = f->handle; |
|
rta = (struct rtattr*)b; |
RTA_PUT(skb, TCA_OPTIONS, 0, NULL); |
|
if (!(f->handle&0x8000)) { |
id = f->id&0xFF; |
RTA_PUT(skb, TCA_ROUTE4_TO, sizeof(id), &id); |
} |
if (f->handle&0x80000000) { |
if ((f->handle>>16) != 0xFFFF) |
RTA_PUT(skb, TCA_ROUTE4_IIF, sizeof(f->iif), &f->iif); |
} else { |
id = f->id>>16; |
RTA_PUT(skb, TCA_ROUTE4_FROM, sizeof(id), &id); |
} |
if (f->res.classid) |
RTA_PUT(skb, TCA_ROUTE4_CLASSID, 4, &f->res.classid); |
#ifdef CONFIG_NET_CLS_POLICE |
if (f->police) { |
struct rtattr * p_rta = (struct rtattr*)skb->tail; |
|
RTA_PUT(skb, TCA_ROUTE4_POLICE, 0, NULL); |
|
if (tcf_police_dump(skb, f->police) < 0) |
goto rtattr_failure; |
|
p_rta->rta_len = skb->tail - (u8*)p_rta; |
} |
#endif |
|
rta->rta_len = skb->tail - b; |
#ifdef CONFIG_NET_CLS_POLICE |
if (f->police) { |
if (qdisc_copy_stats(skb, &f->police->stats)) |
goto rtattr_failure; |
} |
#endif |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
struct tcf_proto_ops cls_route4_ops = { |
NULL, |
"route", |
route4_classify, |
route4_init, |
route4_destroy, |
|
route4_get, |
route4_put, |
route4_change, |
route4_delete, |
route4_walk, |
route4_dump |
}; |
|
#ifdef MODULE |
int init_module(void) |
{ |
return register_tcf_proto_ops(&cls_route4_ops); |
} |
|
void cleanup_module(void) |
{ |
unregister_tcf_proto_ops(&cls_route4_ops); |
} |
#endif |
MODULE_LICENSE("GPL"); |
/cls_rsvp.h
0,0 → 1,698
/* |
* net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers. |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
*/ |
|
/* |
Compared to the general packet classification problem, |
RSVP needs only several relatively simple rules: |
|
* (dst, protocol) are always specified, |
so that we are able to hash them. |
* src may be exact, or may be wildcard, so that |
we can keep a hash table plus one wildcard entry. |
* source port (or flow label) is important only if src is given. |
|
IMPLEMENTATION. |
|
We use a two level hash table: The top level is keyed by |
destination address and protocol ID, every bucket contains a list |
of "rsvp sessions", identified by destination address, protocol and |
DPI(="Destination Port ID"): triple (key, mask, offset). |
|
Every bucket has a smaller hash table keyed by source address |
(cf. RSVP flowspec) and one wildcard entry for wildcard reservations. |
Every bucket is again a list of "RSVP flows", selected by |
source address and SPI(="Source Port ID" here rather than |
"security parameter index"): triple (key, mask, offset). |
|
|
NOTE 1. All packets with IPv6 extension headers (except AH and ESP) |
and all fragmented packets go to the best-effort traffic class. |
|
|
NOTE 2. Two "port id"s seem redundant; rfc2207 requires |
only one "Generalized Port Identifier". So for classic |
AH, ESP (and UDP, TCP) both *pi should coincide, or one of them |
should be a wildcard. |
|
At first sight, this redundancy is just a waste of CPU |
resources. But DPI and SPI add the possibility to assign different |
priorities to GPIs. Look also at note 4 about tunnels below. |
|
|
NOTE 3. One complication is the case of tunneled packets. |
We implement it as follows: if the first lookup |
matches a special session with "tunnelhdr" value not zero, |
flowid doesn't contain the true flow ID, but the tunnel ID (1...255). |
In this case, we pull tunnelhdr bytes and restart lookup |
with tunnel ID added to the list of keys. Simple and stupid 8)8) |
It's enough for PIMREG and IPIP. |
|
|
NOTE 4. Two GPIs make it possible to parse even GRE packets. |
F.e. DPI can select ETH_P_IP (and necessary flags to make |
tunnelhdr correct) in GRE protocol field and SPI matches |
GRE key. Is it not nice? 8)8) |
|
|
Well, as a result, despite its simplicity, we get a pretty |
powerful classification engine. */ |
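 |
/* |
 * Editorial illustration, not part of the original source: a minimal |
 * sketch of how a Generalized Port Identifier triple (key, mask, |
 * offset) is matched against the transport header, mirroring the DPI |
 * and SPI tests in rsvp_classify() below: every bit selected by the |
 * mask in the 32-bit word at the given offset must equal the |
 * corresponding bit of the key.  The helper name is hypothetical. |
 */ |
#if 0 /* illustration only */ |
static int example_gpi_match(const u8 *xprt, u32 key, u32 mask, int offset) |
{ |
	return !(mask & (*(const u32 *)(xprt + offset) ^ key)); |
} |
#endif |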
|
#include <linux/config.h> |
|
struct rsvp_head |
{ |
u32 tmap[256/32]; |
u32 hgenerator; |
u8 tgenerator; |
struct rsvp_session *ht[256]; |
}; |
|
struct rsvp_session |
{ |
struct rsvp_session *next; |
u32 dst[RSVP_DST_LEN]; |
struct tc_rsvp_gpi dpi; |
u8 protocol; |
u8 tunnelid; |
/* 16 (src,sport) hash slots, and one wildcard source slot */ |
struct rsvp_filter *ht[16+1]; |
}; |
|
|
struct rsvp_filter |
{ |
struct rsvp_filter *next; |
u32 src[RSVP_DST_LEN]; |
struct tc_rsvp_gpi spi; |
u8 tunnelhdr; |
|
struct tcf_result res; |
#ifdef CONFIG_NET_CLS_POLICE |
struct tcf_police *police; |
#endif |
|
u32 handle; |
struct rsvp_session *sess; |
}; |
|
static __inline__ unsigned hash_dst(u32 *dst, u8 protocol, u8 tunnelid) |
{ |
unsigned h = dst[RSVP_DST_LEN-1]; |
h ^= h>>16; |
h ^= h>>8; |
return (h ^ protocol ^ tunnelid) & 0xFF; |
} |
|
static __inline__ unsigned hash_src(u32 *src) |
{ |
unsigned h = src[RSVP_DST_LEN-1]; |
h ^= h>>16; |
h ^= h>>8; |
h ^= h>>4; |
return h & 0xF; |
} |
|
#ifdef CONFIG_NET_CLS_POLICE |
#define RSVP_POLICE() \ |
if (f->police) { \ |
int pol_res = tcf_police(skb, f->police); \ |
if (pol_res < 0) continue; \ |
if (pol_res) return pol_res; \ |
} |
#else |
#define RSVP_POLICE() |
#endif |
|
|
static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp, |
struct tcf_result *res) |
{ |
struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; |
struct rsvp_session *s; |
struct rsvp_filter *f; |
unsigned h1, h2; |
u32 *dst, *src; |
u8 protocol; |
u8 tunnelid = 0; |
u8 *xprt; |
#if RSVP_DST_LEN == 4 |
struct ipv6hdr *nhptr = skb->nh.ipv6h; |
#else |
struct iphdr *nhptr = skb->nh.iph; |
#endif |
|
restart: |
|
#if RSVP_DST_LEN == 4 |
src = &nhptr->saddr.s6_addr32[0]; |
dst = &nhptr->daddr.s6_addr32[0]; |
protocol = nhptr->nexthdr; |
xprt = ((u8*)nhptr) + sizeof(struct ipv6hdr); |
#else |
src = &nhptr->saddr; |
dst = &nhptr->daddr; |
protocol = nhptr->protocol; |
xprt = ((u8*)nhptr) + (nhptr->ihl<<2); |
if (nhptr->frag_off&__constant_htons(IP_MF|IP_OFFSET)) |
return -1; |
#endif |
|
h1 = hash_dst(dst, protocol, tunnelid); |
h2 = hash_src(src); |
|
for (s = sht[h1]; s; s = s->next) { |
if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && |
protocol == s->protocol && |
!(s->dpi.mask & (*(u32*)(xprt+s->dpi.offset)^s->dpi.key)) |
#if RSVP_DST_LEN == 4 |
&& dst[0] == s->dst[0] |
&& dst[1] == s->dst[1] |
&& dst[2] == s->dst[2] |
#endif |
&& tunnelid == s->tunnelid) { |
|
for (f = s->ht[h2]; f; f = f->next) { |
if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] && |
!(f->spi.mask & (*(u32*)(xprt+f->spi.offset)^f->spi.key)) |
#if RSVP_DST_LEN == 4 |
&& src[0] == f->src[0] |
&& src[1] == f->src[1] |
&& src[2] == f->src[2] |
#endif |
) { |
*res = f->res; |
|
RSVP_POLICE(); |
|
matched: |
if (f->tunnelhdr == 0) |
return 0; |
|
tunnelid = f->res.classid; |
nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr)); |
goto restart; |
} |
} |
|
/* And wildcard bucket... */ |
for (f = s->ht[16]; f; f = f->next) { |
*res = f->res; |
RSVP_POLICE(); |
goto matched; |
} |
return -1; |
} |
} |
return -1; |
} |
|
static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) |
{ |
struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; |
struct rsvp_session *s; |
struct rsvp_filter *f; |
unsigned h1 = handle&0xFF; |
unsigned h2 = (handle>>8)&0xFF; |
|
if (h2 > 16) |
return 0; |
|
for (s = sht[h1]; s; s = s->next) { |
for (f = s->ht[h2]; f; f = f->next) { |
if (f->handle == handle) |
return (unsigned long)f; |
} |
} |
return 0; |
} |
|
static void rsvp_put(struct tcf_proto *tp, unsigned long f) |
{ |
} |
|
static int rsvp_init(struct tcf_proto *tp) |
{ |
struct rsvp_head *data; |
|
MOD_INC_USE_COUNT; |
data = kmalloc(sizeof(struct rsvp_head), GFP_KERNEL); |
if (data) { |
memset(data, 0, sizeof(struct rsvp_head)); |
tp->root = data; |
return 0; |
} |
MOD_DEC_USE_COUNT; |
return -ENOBUFS; |
} |
|
static void rsvp_destroy(struct tcf_proto *tp) |
{ |
struct rsvp_head *data = xchg(&tp->root, NULL); |
struct rsvp_session **sht; |
int h1, h2; |
|
if (data == NULL) |
return; |
|
sht = data->ht; |
|
for (h1=0; h1<256; h1++) { |
struct rsvp_session *s; |
|
while ((s = sht[h1]) != NULL) { |
sht[h1] = s->next; |
|
for (h2=0; h2<=16; h2++) { |
struct rsvp_filter *f; |
|
while ((f = s->ht[h2]) != NULL) { |
unsigned long cl; |
|
s->ht[h2] = f->next; |
if ((cl = __cls_set_class(&f->res.class, 0)) != 0) |
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); |
#ifdef CONFIG_NET_CLS_POLICE |
tcf_police_release(f->police); |
#endif |
kfree(f); |
} |
} |
kfree(s); |
} |
} |
kfree(data); |
MOD_DEC_USE_COUNT; |
} |
|
static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) |
{ |
struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg; |
unsigned h = f->handle; |
struct rsvp_session **sp; |
struct rsvp_session *s = f->sess; |
int i; |
|
for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) { |
if (*fp == f) { |
unsigned long cl; |
|
|
tcf_tree_lock(tp); |
*fp = f->next; |
tcf_tree_unlock(tp); |
|
if ((cl = cls_set_class(tp, &f->res.class, 0)) != 0) |
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); |
|
#ifdef CONFIG_NET_CLS_POLICE |
tcf_police_release(f->police); |
#endif |
|
kfree(f); |
|
/* Strip tree */ |
|
for (i=0; i<=16; i++) |
if (s->ht[i]) |
return 0; |
|
/* OK, session has no flows */ |
for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF]; |
*sp; sp = &(*sp)->next) { |
if (*sp == s) { |
tcf_tree_lock(tp); |
*sp = s->next; |
tcf_tree_unlock(tp); |
|
kfree(s); |
return 0; |
} |
} |
|
return 0; |
} |
} |
return 0; |
} |
|
static unsigned gen_handle(struct tcf_proto *tp, unsigned salt) |
{ |
struct rsvp_head *data = tp->root; |
int i = 0xFFFF; |
|
while (i-- > 0) { |
u32 h; |
if ((data->hgenerator += 0x10000) == 0) |
data->hgenerator = 0x10000; |
h = data->hgenerator|salt; |
if (rsvp_get(tp, h) == 0) |
return h; |
} |
return 0; |
} |
|
static int tunnel_bts(struct rsvp_head *data) |
{ |
int n = data->tgenerator>>5; |
u32 b = 1<<(data->tgenerator&0x1F); |
|
if (data->tmap[n]&b) |
return 0; |
data->tmap[n] |= b; |
return 1; |
} |
|
static void tunnel_recycle(struct rsvp_head *data) |
{ |
struct rsvp_session **sht = data->ht; |
u32 tmap[256/32]; |
int h1, h2; |
|
memset(tmap, 0, sizeof(tmap)); |
|
for (h1=0; h1<256; h1++) { |
struct rsvp_session *s; |
for (s = sht[h1]; s; s = s->next) { |
for (h2=0; h2<=16; h2++) { |
struct rsvp_filter *f; |
|
for (f = s->ht[h2]; f; f = f->next) { |
if (f->tunnelhdr == 0) |
continue; |
data->tgenerator = f->res.classid; |
tunnel_bts(data); |
} |
} |
} |
} |
|
memcpy(data->tmap, tmap, sizeof(tmap)); |
} |
|
static u32 gen_tunnel(struct rsvp_head *data) |
{ |
int i, k; |
|
for (k=0; k<2; k++) { |
for (i=255; i>0; i--) { |
if (++data->tgenerator == 0) |
data->tgenerator = 1; |
if (tunnel_bts(data)) |
return data->tgenerator; |
} |
tunnel_recycle(data); |
} |
return 0; |
} |
|
static int rsvp_change(struct tcf_proto *tp, unsigned long base, |
u32 handle, |
struct rtattr **tca, |
unsigned long *arg) |
{ |
struct rsvp_head *data = tp->root; |
struct rsvp_filter *f, **fp; |
struct rsvp_session *s, **sp; |
struct tc_rsvp_pinfo *pinfo = NULL; |
struct rtattr *opt = tca[TCA_OPTIONS-1]; |
struct rtattr *tb[TCA_RSVP_MAX]; |
unsigned h1, h2; |
u32 *dst; |
int err; |
|
if (opt == NULL) |
return handle ? -EINVAL : 0; |
|
if (rtattr_parse(tb, TCA_RSVP_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0) |
return -EINVAL; |
|
if ((f = (struct rsvp_filter*)*arg) != NULL) { |
/* Node exists: adjust only classid */ |
|
if (f->handle != handle && handle) |
return -EINVAL; |
if (tb[TCA_RSVP_CLASSID-1]) { |
unsigned long cl; |
|
f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); |
cl = cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); |
if (cl) |
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); |
} |
#ifdef CONFIG_NET_CLS_POLICE |
if (tb[TCA_RSVP_POLICE-1]) { |
struct tcf_police *police = tcf_police_locate(tb[TCA_RSVP_POLICE-1], tca[TCA_RATE-1]); |
|
tcf_tree_lock(tp); |
police = xchg(&f->police, police); |
tcf_tree_unlock(tp); |
|
tcf_police_release(police); |
} |
#endif |
return 0; |
} |
|
/* Now more serious part... */ |
if (handle) |
return -EINVAL; |
if (tb[TCA_RSVP_DST-1] == NULL) |
return -EINVAL; |
|
f = kmalloc(sizeof(struct rsvp_filter), GFP_KERNEL); |
if (f == NULL) |
return -ENOBUFS; |
|
memset(f, 0, sizeof(*f)); |
h2 = 16; |
if (tb[TCA_RSVP_SRC-1]) { |
err = -EINVAL; |
if (RTA_PAYLOAD(tb[TCA_RSVP_SRC-1]) != sizeof(f->src)) |
goto errout; |
memcpy(f->src, RTA_DATA(tb[TCA_RSVP_SRC-1]), sizeof(f->src)); |
h2 = hash_src(f->src); |
} |
if (tb[TCA_RSVP_PINFO-1]) { |
err = -EINVAL; |
if (RTA_PAYLOAD(tb[TCA_RSVP_PINFO-1]) < sizeof(struct tc_rsvp_pinfo)) |
goto errout; |
pinfo = RTA_DATA(tb[TCA_RSVP_PINFO-1]); |
f->spi = pinfo->spi; |
f->tunnelhdr = pinfo->tunnelhdr; |
} |
if (tb[TCA_RSVP_CLASSID-1]) { |
err = -EINVAL; |
if (RTA_PAYLOAD(tb[TCA_RSVP_CLASSID-1]) != 4) |
goto errout; |
f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); |
} |
|
err = -EINVAL; |
if (RTA_PAYLOAD(tb[TCA_RSVP_DST-1]) != sizeof(f->src)) |
goto errout; |
dst = RTA_DATA(tb[TCA_RSVP_DST-1]); |
h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0); |
|
err = -ENOMEM; |
if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0) |
goto errout; |
|
if (f->tunnelhdr) { |
err = -EINVAL; |
if (f->res.classid > 255) |
goto errout; |
|
err = -ENOMEM; |
if (f->res.classid == 0 && |
(f->res.classid = gen_tunnel(data)) == 0) |
goto errout; |
} |
|
for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) { |
if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && |
pinfo && pinfo->protocol == s->protocol && |
memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 |
#if RSVP_DST_LEN == 4 |
&& dst[0] == s->dst[0] |
&& dst[1] == s->dst[1] |
&& dst[2] == s->dst[2] |
#endif |
&& pinfo->tunnelid == s->tunnelid) { |
|
insert: |
/* OK, we found appropriate session */ |
|
fp = &s->ht[h2]; |
|
f->sess = s; |
if (f->tunnelhdr == 0) |
cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); |
#ifdef CONFIG_NET_CLS_POLICE |
if (tb[TCA_RSVP_POLICE-1]) |
f->police = tcf_police_locate(tb[TCA_RSVP_POLICE-1], tca[TCA_RATE-1]); |
#endif |
|
for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next) |
if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask) |
break; |
f->next = *fp; |
wmb(); |
*fp = f; |
|
*arg = (unsigned long)f; |
return 0; |
} |
} |
|
/* No session found. Create new one. */ |
|
err = -ENOBUFS; |
s = kmalloc(sizeof(struct rsvp_session), GFP_KERNEL); |
if (s == NULL) |
goto errout; |
memset(s, 0, sizeof(*s)); |
memcpy(s->dst, dst, sizeof(s->dst)); |
|
if (pinfo) { |
s->dpi = pinfo->dpi; |
s->protocol = pinfo->protocol; |
s->tunnelid = pinfo->tunnelid; |
} |
for (sp = &data->ht[h1]; *sp; sp = &(*sp)->next) { |
if (((*sp)->dpi.mask&s->dpi.mask) != s->dpi.mask) |
break; |
} |
s->next = *sp; |
wmb(); |
*sp = s; |
|
goto insert; |
|
errout: |
if (f) |
kfree(f); |
return err; |
} |
|
static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) |
{ |
struct rsvp_head *head = tp->root; |
unsigned h, h1; |
|
if (arg->stop) |
return; |
|
for (h = 0; h < 256; h++) { |
struct rsvp_session *s; |
|
for (s = head->ht[h]; s; s = s->next) { |
for (h1 = 0; h1 <= 16; h1++) { |
struct rsvp_filter *f; |
|
for (f = s->ht[h1]; f; f = f->next) { |
if (arg->count < arg->skip) { |
arg->count++; |
continue; |
} |
if (arg->fn(tp, (unsigned long)f, arg) < 0) { |
arg->stop = 1; |
break; |
} |
arg->count++; |
} |
} |
} |
} |
} |
|
static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, |
struct sk_buff *skb, struct tcmsg *t) |
{ |
struct rsvp_filter *f = (struct rsvp_filter*)fh; |
struct rsvp_session *s; |
unsigned char *b = skb->tail; |
struct rtattr *rta; |
struct tc_rsvp_pinfo pinfo; |
|
if (f == NULL) |
return skb->len; |
s = f->sess; |
|
t->tcm_handle = f->handle; |
|
|
rta = (struct rtattr*)b; |
RTA_PUT(skb, TCA_OPTIONS, 0, NULL); |
|
RTA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst); |
pinfo.dpi = s->dpi; |
pinfo.spi = f->spi; |
pinfo.protocol = s->protocol; |
pinfo.tunnelid = s->tunnelid; |
pinfo.tunnelhdr = f->tunnelhdr; |
RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); |
if (f->res.classid) |
RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid); |
if (((f->handle>>8)&0xFF) != 16) |
RTA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src); |
#ifdef CONFIG_NET_CLS_POLICE |
if (f->police) { |
struct rtattr * p_rta = (struct rtattr*)skb->tail; |
|
RTA_PUT(skb, TCA_RSVP_POLICE, 0, NULL); |
|
if (tcf_police_dump(skb, f->police) < 0) |
goto rtattr_failure; |
|
p_rta->rta_len = skb->tail - (u8*)p_rta; |
} |
#endif |
|
rta->rta_len = skb->tail - b; |
#ifdef CONFIG_NET_CLS_POLICE |
if (f->police) { |
if (qdisc_copy_stats(skb, &f->police->stats)) |
goto rtattr_failure; |
} |
#endif |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
struct tcf_proto_ops RSVP_OPS = { |
NULL, |
RSVP_ID, |
rsvp_classify, |
rsvp_init, |
rsvp_destroy, |
|
rsvp_get, |
rsvp_put, |
rsvp_change, |
rsvp_delete, |
rsvp_walk, |
rsvp_dump |
}; |
|
#ifdef MODULE |
int init_module(void) |
{ |
return register_tcf_proto_ops(&RSVP_OPS); |
} |
|
void cleanup_module(void) |
{ |
unregister_tcf_proto_ops(&RSVP_OPS); |
} |
#endif |
/sch_cbq.c
0,0 → 1,2115
/* |
* net/sched/sch_cbq.c Class-Based Queueing discipline. |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
* |
*/ |
|
#include <linux/config.h> |
#include <linux/module.h> |
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/if_ether.h> |
#include <linux/inet.h> |
#include <linux/netdevice.h> |
#include <linux/etherdevice.h> |
#include <linux/notifier.h> |
#include <net/ip.h> |
#include <net/route.h> |
#include <linux/skbuff.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
|
/* Class-Based Queueing (CBQ) algorithm. |
======================================= |
|
Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource |
Management Models for Packet Networks", |
IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 |
|
[2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995 |
|
[3] Sally Floyd, "Notes on Class-Based Queueing: Setting |
Parameters", 1996 |
|
[4] Sally Floyd and Michael Speer, "Experimental Results |
for Class-Based Queueing", 1998, not published. |
|
----------------------------------------------------------------------- |
|
Algorithm skeleton was taken from NS simulator cbq.cc. |
If someone wants to check this code against the LBL version, |
he should take into account that ONLY the skeleton was borrowed, |
the implementation is different. Particularly: |
|
--- The WRR algorithm is different. Our version looks more |
reasonable (I hope) and works when quanta are allowed to be |
less than MTU, which is always the case when real time classes |
have small rates. Note, that the statement of [3] is |
incomplete, delay may actually be estimated even if class |
per-round allotment is less than MTU. Namely, if per-round |
allotment is W*r_i, and r_1+...+r_k = r < 1 |
|
delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B |
|
In the worst case we have IntServ estimate with D = W*r+k*MTU |
and C = MTU*r. The proof (if correct at all) is trivial. |
|
|
--- It seems that cbq-2.0 is not very accurate. At least, I cannot |
interpret some places, which look like wrong translations |
from NS. Anyone is advised to find these differences |
and explain to me, why I am wrong 8). |
|
--- Linux has no EOI event, so that we cannot estimate true class |
idle time. Workaround is to consider the next dequeue event |
as sign that previous packet is finished. This is wrong because of |
internal device queueing, but on a permanently loaded link it is true. |
Moreover, combined with clock integrator, this scheme looks |
very close to an ideal solution. */ |
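 |
/* |
 * Editorial illustration, not part of the original source: a plain |
 * arithmetic sketch of the delay bound quoted above, with every input |
 * invented for the example.  mtu and bandwidth are in bytes and |
 * bytes/sec, w*r_i is the per-round allotment of class i, r is the sum |
 * of the relative rates r_1+...+r_k, and k is the number of classes. |
 * The [x] in the original formula is a ceiling, approximated here. |
 */ |
#if 0 /* illustration only */ |
static double example_cbq_delay_bound(double mtu, double bandwidth, |
				      double w, double r_i, double r, int k) |
{ |
	double rounds = mtu / (w * r_i);	/* [MTU/(W*r_i)] */ |
 |
	return (rounds * w * r + w * r + k * mtu) / bandwidth; |
} |
#endif |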
|
struct cbq_sched_data; |
|
|
struct cbq_class |
{ |
struct cbq_class *next; /* hash table link */ |
struct cbq_class *next_alive; /* next class with backlog in this priority band */ |
|
/* Parameters */ |
u32 classid; |
unsigned char priority; /* class priority */ |
unsigned char priority2; /* priority to be used after overlimit */ |
unsigned char ewma_log; /* time constant for idle time calculation */ |
unsigned char ovl_strategy; |
#ifdef CONFIG_NET_CLS_POLICE |
unsigned char police; |
#endif |
|
u32 defmap; |
|
/* Link-sharing scheduler parameters */ |
long maxidle; /* Class parameters: see below. */ |
long offtime; |
long minidle; |
u32 avpkt; |
struct qdisc_rate_table *R_tab; |
|
/* Overlimit strategy parameters */ |
void (*overlimit)(struct cbq_class *cl); |
long penalty; |
|
/* General scheduler (WRR) parameters */ |
long allot; |
long quantum; /* Allotment per WRR round */ |
long weight; /* Relative allotment: see below */ |
|
struct Qdisc *qdisc; /* Ptr to CBQ discipline */ |
struct cbq_class *split; /* Ptr to split node */ |
struct cbq_class *share; /* Ptr to LS parent in the class tree */ |
struct cbq_class *tparent; /* Ptr to tree parent in the class tree */ |
struct cbq_class *borrow; /* NULL if class is bandwidth limited; |
parent otherwise */ |
struct cbq_class *sibling; /* Sibling chain */ |
struct cbq_class *children; /* Pointer to children chain */ |
|
struct Qdisc *q; /* Elementary queueing discipline */ |
|
|
/* Variables */ |
unsigned char cpriority; /* Effective priority */ |
unsigned char delayed; |
unsigned char level; /* level of the class in hierarchy: |
0 for leaf classes, and maximal |
level of children + 1 for nodes. |
*/ |
|
psched_time_t last; /* Last end of service */ |
psched_time_t undertime; |
long avgidle; |
long deficit; /* Saved deficit for WRR */ |
unsigned long penalized; |
struct tc_stats stats; |
struct tc_cbq_xstats xstats; |
|
struct tcf_proto *filter_list; |
|
int refcnt; |
int filters; |
|
struct cbq_class *defaults[TC_PRIO_MAX+1]; |
}; |
|
struct cbq_sched_data |
{ |
struct cbq_class *classes[16]; /* Hash table of all classes */ |
int nclasses[TC_CBQ_MAXPRIO+1]; |
unsigned quanta[TC_CBQ_MAXPRIO+1]; |
|
struct cbq_class link; |
|
unsigned activemask; |
struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes |
with backlog */ |
|
#ifdef CONFIG_NET_CLS_POLICE |
struct cbq_class *rx_class; |
#endif |
struct cbq_class *tx_class; |
struct cbq_class *tx_borrowed; |
int tx_len; |
psched_time_t now; /* Cached timestamp */ |
psched_time_t now_rt; /* Cached real time */ |
unsigned pmask; |
|
struct timer_list delay_timer; |
struct timer_list wd_timer; /* Watchdog timer, |
started when CBQ has |
backlog, but cannot |
transmit just now */ |
long wd_expires; |
int toplevel; |
u32 hgenerator; |
}; |
|
|
#define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log]) |
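 |
/* |
 * Editorial note (not part of the original source), assuming the usual |
 * qdisc rate-table layout: L2T ("length to time") maps a packet length |
 * to its transmission time at the class rate.  The length, scaled down |
 * by the cell size (1 << cell_log), indexes a table of precomputed |
 * scheduler time units supplied from user space with the rate spec. |
 */ |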
|
|
static __inline__ unsigned cbq_hash(u32 h) |
{ |
h ^= h>>8; |
h ^= h>>4; |
return h&0xF; |
} |
|
static __inline__ struct cbq_class * |
cbq_class_lookup(struct cbq_sched_data *q, u32 classid) |
{ |
struct cbq_class *cl; |
|
for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next) |
if (cl->classid == classid) |
return cl; |
return NULL; |
} |
|
#ifdef CONFIG_NET_CLS_POLICE |
|
static struct cbq_class * |
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this) |
{ |
struct cbq_class *cl, *new; |
|
for (cl = this->tparent; cl; cl = cl->tparent) |
if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this) |
return new; |
|
return NULL; |
} |
|
#endif |
|
/* Classify packet. The procedure is pretty complicated, but |
it allows us to combine link sharing and priority scheduling |
transparently. |
|
Namely, you can put link sharing rules (f.e. route based) at root of CBQ, |
so that it resolves to split nodes. Then packets are classified |
by logical priority, or a more specific classifier may be attached |
to the split node. |
*/ |
|
static struct cbq_class * |
cbq_classify(struct sk_buff *skb, struct Qdisc *sch) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; |
struct cbq_class *head = &q->link; |
struct cbq_class **defmap; |
struct cbq_class *cl = NULL; |
u32 prio = skb->priority; |
struct tcf_result res; |
|
/* |
* Step 1. If skb->priority points to one of our classes, use it. |
*/ |
if (TC_H_MAJ(prio^sch->handle) == 0 && |
(cl = cbq_class_lookup(q, prio)) != NULL) |
return cl; |
|
for (;;) { |
int result = 0; |
|
defmap = head->defaults; |
|
/* |
* Step 2+n. Apply classifier. |
*/ |
if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0) |
goto fallback; |
|
if ((cl = (void*)res.class) == NULL) { |
if (TC_H_MAJ(res.classid)) |
cl = cbq_class_lookup(q, res.classid); |
else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL) |
cl = defmap[TC_PRIO_BESTEFFORT]; |
|
if (cl == NULL || cl->level >= head->level) |
goto fallback; |
} |
|
#ifdef CONFIG_NET_CLS_POLICE |
switch (result) { |
case TC_POLICE_RECLASSIFY: |
return cbq_reclassify(skb, cl); |
case TC_POLICE_SHOT: |
return NULL; |
default: |
break; |
} |
#endif |
if (cl->level == 0) |
return cl; |
|
/* |
* Step 3+n. If classifier selected a link sharing class, |
* apply agency specific classifier. |
* Repeat this procedure until we hit a leaf node. |
*/ |
head = cl; |
} |
|
fallback: |
cl = head; |
|
/* |
* Step 4. No success... |
*/ |
if (TC_H_MAJ(prio) == 0 && |
!(cl = head->defaults[prio&TC_PRIO_MAX]) && |
!(cl = head->defaults[TC_PRIO_BESTEFFORT])) |
return head; |
|
return cl; |
} |
|
/* |
A packet has just been enqueued on the empty class. |
cbq_activate_class adds it to the tail of active class list |
of its priority band. |
*/ |
|
static __inline__ void cbq_activate_class(struct cbq_class *cl) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; |
int prio = cl->cpriority; |
struct cbq_class *cl_tail; |
|
cl_tail = q->active[prio]; |
q->active[prio] = cl; |
|
if (cl_tail != NULL) { |
cl->next_alive = cl_tail->next_alive; |
cl_tail->next_alive = cl; |
} else { |
cl->next_alive = cl; |
q->activemask |= (1<<prio); |
} |
} |
|
/* |
Unlink class from active chain. |
Note that this same procedure is done directly in cbq_dequeue* |
during round-robin procedure. |
*/ |
|
static void cbq_deactivate_class(struct cbq_class *this) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; |
int prio = this->cpriority; |
struct cbq_class *cl; |
struct cbq_class *cl_prev = q->active[prio]; |
|
do { |
cl = cl_prev->next_alive; |
if (cl == this) { |
cl_prev->next_alive = cl->next_alive; |
cl->next_alive = NULL; |
|
if (cl == q->active[prio]) { |
q->active[prio] = cl_prev; |
if (cl == q->active[prio]) { |
q->active[prio] = NULL; |
q->activemask &= ~(1<<prio); |
return; |
} |
} |
|
cl = cl_prev->next_alive; |
return; |
} |
} while ((cl_prev = cl) != q->active[prio]); |
} |
|
static void |
cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) |
{ |
int toplevel = q->toplevel; |
|
if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) { |
psched_time_t now; |
psched_tdiff_t incr; |
|
PSCHED_GET_TIME(now); |
incr = PSCHED_TDIFF(now, q->now_rt); |
PSCHED_TADD2(q->now, incr, now); |
|
do { |
if (PSCHED_TLESS(cl->undertime, now)) { |
q->toplevel = cl->level; |
return; |
} |
} while ((cl=cl->borrow) != NULL && toplevel > cl->level); |
} |
} |
|
static int |
cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; |
struct cbq_class *cl = cbq_classify(skb, sch); |
int len = skb->len; |
int ret = NET_XMIT_POLICED; |
|
#ifdef CONFIG_NET_CLS_POLICE |
q->rx_class = cl; |
#endif |
if (cl) { |
#ifdef CONFIG_NET_CLS_POLICE |
cl->q->__parent = sch; |
#endif |
if ((ret = cl->q->enqueue(skb, cl->q)) == 0) { |
sch->q.qlen++; |
sch->stats.packets++; |
sch->stats.bytes+=len; |
cbq_mark_toplevel(q, cl); |
if (!cl->next_alive) |
cbq_activate_class(cl); |
return 0; |
} |
} |
|
sch->stats.drops++; |
if (cl == NULL) |
kfree_skb(skb); |
else { |
cbq_mark_toplevel(q, cl); |
cl->stats.drops++; |
} |
return ret; |
} |
|
static int |
cbq_requeue(struct sk_buff *skb, struct Qdisc *sch) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; |
struct cbq_class *cl; |
int ret; |
|
if ((cl = q->tx_class) == NULL) { |
kfree_skb(skb); |
sch->stats.drops++; |
return NET_XMIT_CN; |
} |
q->tx_class = NULL; |
|
cbq_mark_toplevel(q, cl); |
|
#ifdef CONFIG_NET_CLS_POLICE |
q->rx_class = cl; |
cl->q->__parent = sch; |
#endif |
if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) { |
sch->q.qlen++; |
if (!cl->next_alive) |
cbq_activate_class(cl); |
return 0; |
} |
sch->stats.drops++; |
cl->stats.drops++; |
return ret; |
} |
|
/* Overlimit actions */ |
|
/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */ |
|
static void cbq_ovl_classic(struct cbq_class *cl) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data; |
psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now); |
|
if (!cl->delayed) { |
delay += cl->offtime; |
|
/* |
Class goes to sleep, so that it will have no |
chance to work avgidle. Let's forgive it 8) |
|
BTW cbq-2.0 has a crap in this |
place, apparently they forgot to shift it by cl->ewma_log. |
*/ |
if (cl->avgidle < 0) |
delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); |
if (cl->avgidle < cl->minidle) |
cl->avgidle = cl->minidle; |
if (delay <= 0) |
delay = 1; |
PSCHED_TADD2(q->now, delay, cl->undertime); |
|
cl->xstats.overactions++; |
cl->delayed = 1; |
} |
if (q->wd_expires == 0 || q->wd_expires > delay) |
q->wd_expires = delay; |
|
/* Dirty work! We must schedule wakeups based on |
real available rate, rather than leaf rate, |
which may be tiny (even zero). |
*/ |
if (q->toplevel == TC_CBQ_MAXLEVEL) { |
struct cbq_class *b; |
psched_tdiff_t base_delay = q->wd_expires; |
|
for (b = cl->borrow; b; b = b->borrow) { |
delay = PSCHED_TDIFF(b->undertime, q->now); |
if (delay < base_delay) { |
if (delay <= 0) |
delay = 1; |
base_delay = delay; |
} |
} |
|
q->wd_expires = base_delay; |
} |
} |
|
/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when |
they go overlimit |
*/ |
|
static void cbq_ovl_rclassic(struct cbq_class *cl) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data; |
struct cbq_class *this = cl; |
|
do { |
if (cl->level > q->toplevel) { |
cl = NULL; |
break; |
} |
} while ((cl = cl->borrow) != NULL); |
|
if (cl == NULL) |
cl = this; |
cbq_ovl_classic(cl); |
} |
|
/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */ |
|
static void cbq_ovl_delay(struct cbq_class *cl) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data; |
psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now); |
|
if (!cl->delayed) { |
unsigned long sched = jiffies; |
|
delay += cl->offtime; |
if (cl->avgidle < 0) |
delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); |
if (cl->avgidle < cl->minidle) |
cl->avgidle = cl->minidle; |
PSCHED_TADD2(q->now, delay, cl->undertime); |
|
if (delay > 0) { |
sched += PSCHED_US2JIFFIE(delay) + cl->penalty; |
cl->penalized = sched; |
cl->cpriority = TC_CBQ_MAXPRIO; |
q->pmask |= (1<<TC_CBQ_MAXPRIO); |
if (del_timer(&q->delay_timer) && |
(long)(q->delay_timer.expires - sched) > 0) |
q->delay_timer.expires = sched; |
add_timer(&q->delay_timer); |
cl->delayed = 1; |
cl->xstats.overactions++; |
return; |
} |
delay = 1; |
} |
if (q->wd_expires == 0 || q->wd_expires > delay) |
q->wd_expires = delay; |
} |
|
/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */ |
|
static void cbq_ovl_lowprio(struct cbq_class *cl) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; |
|
cl->penalized = jiffies + cl->penalty; |
|
if (cl->cpriority != cl->priority2) { |
cl->cpriority = cl->priority2; |
q->pmask |= (1<<cl->cpriority); |
cl->xstats.overactions++; |
} |
cbq_ovl_classic(cl); |
} |
|
/* TC_CBQ_OVL_DROP: penalize class by dropping */ |
|
static void cbq_ovl_drop(struct cbq_class *cl) |
{ |
if (cl->q->ops->drop) |
if (cl->q->ops->drop(cl->q)) |
cl->qdisc->q.qlen--; |
cl->xstats.overactions++; |
cbq_ovl_classic(cl); |
} |
|
static void cbq_watchdog(unsigned long arg) |
{ |
struct Qdisc *sch = (struct Qdisc*)arg; |
|
sch->flags &= ~TCQ_F_THROTTLED; |
netif_schedule(sch->dev); |
} |
|
static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio) |
{ |
struct cbq_class *cl; |
struct cbq_class *cl_prev = q->active[prio]; |
unsigned long now = jiffies; |
unsigned long sched = now; |
|
if (cl_prev == NULL) |
return now; |
|
do { |
cl = cl_prev->next_alive; |
if ((long)(now - cl->penalized) > 0) { |
cl_prev->next_alive = cl->next_alive; |
cl->next_alive = NULL; |
cl->cpriority = cl->priority; |
cl->delayed = 0; |
cbq_activate_class(cl); |
|
if (cl == q->active[prio]) { |
q->active[prio] = cl_prev; |
if (cl == q->active[prio]) { |
q->active[prio] = NULL; |
return 0; |
} |
} |
|
cl = cl_prev->next_alive; |
} else if ((long)(sched - cl->penalized) > 0) |
sched = cl->penalized; |
} while ((cl_prev = cl) != q->active[prio]); |
|
return (long)(sched - now); |
} |
|
static void cbq_undelay(unsigned long arg) |
{ |
struct Qdisc *sch = (struct Qdisc*)arg; |
struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; |
long delay = 0; |
unsigned pmask; |
|
pmask = q->pmask; |
q->pmask = 0; |
|
while (pmask) { |
int prio = ffz(~pmask); |
long tmp; |
|
pmask &= ~(1<<prio); |
|
tmp = cbq_undelay_prio(q, prio); |
if (tmp > 0) { |
q->pmask |= 1<<prio; |
if (tmp < delay || delay == 0) |
delay = tmp; |
} |
} |
|
if (delay) { |
q->delay_timer.expires = jiffies + delay; |
add_timer(&q->delay_timer); |
} |
|
sch->flags &= ~TCQ_F_THROTTLED; |
netif_schedule(sch->dev); |
} |
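|
/* An illustrative aside (not from the original sources): both cbq_undelay() |
 * above and cbq_dequeue_1() below walk a priority bitmap with ffz(~mask). |
 * Complementing the mask turns "find first zero" into "find lowest set bit", |
 * i.e. the lowest-numbered band that is still pending. A minimal sketch of |
 * the idiom: |
 */ |
#if 0	/* illustration only, not compiled */ |
	while (mask) { |
		int prio = ffz(~mask);	/* index of the lowest set bit */ |
		mask &= ~(1<<prio);	/* clear it, then service band "prio" */ |
	} |
#endif |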
|
|
#ifdef CONFIG_NET_CLS_POLICE |
|
static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) |
{ |
int len = skb->len; |
struct Qdisc *sch = child->__parent; |
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; |
struct cbq_class *cl = q->rx_class; |
|
q->rx_class = NULL; |
|
if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) { |
|
cbq_mark_toplevel(q, cl); |
|
q->rx_class = cl; |
cl->q->__parent = sch; |
|
if (cl->q->enqueue(skb, cl->q) == 0) { |
sch->q.qlen++; |
sch->stats.packets++; |
sch->stats.bytes+=len; |
if (!cl->next_alive) |
cbq_activate_class(cl); |
return 0; |
} |
sch->stats.drops++; |
return 0; |
} |
|
sch->stats.drops++; |
return -1; |
} |
#endif |
|
/* |
   This is a mission-critical procedure. |
|
   We "regenerate" the toplevel cutoff if the transmitting class |
   has a backlog and is not regulated. This is not part of the |
   original CBQ description, but it looks more reasonable. |
   It may still be wrong; the question needs further investigation. |
*/ |
|
static __inline__ void |
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl, |
struct cbq_class *borrowed) |
{ |
if (cl && q->toplevel >= borrowed->level) { |
if (cl->q->q.qlen > 1) { |
do { |
if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) { |
q->toplevel = borrowed->level; |
return; |
} |
} while ((borrowed=borrowed->borrow) != NULL); |
} |
#if 0 |
/* It is not necessary now. Uncommenting it |
will save CPU cycles, but decrease fairness. |
*/ |
q->toplevel = TC_CBQ_MAXLEVEL; |
#endif |
} |
} |
|
static void |
cbq_update(struct cbq_sched_data *q) |
{ |
struct cbq_class *this = q->tx_class; |
struct cbq_class *cl = this; |
int len = q->tx_len; |
|
q->tx_class = NULL; |
|
for ( ; cl; cl = cl->share) { |
long avgidle = cl->avgidle; |
long idle; |
|
cl->stats.packets++; |
cl->stats.bytes += len; |
|
/* |
(now - last) is total time between packet right edges. |
(last_pktlen/rate) is "virtual" busy time, so that |
|
idle = (now - last) - last_pktlen/rate |
*/ |
|
idle = PSCHED_TDIFF(q->now, cl->last); |
if ((unsigned long)idle > 128*1024*1024) { |
avgidle = cl->maxidle; |
} else { |
idle -= L2T(cl, len); |
|
/* true_avgidle := (1-W)*true_avgidle + W*idle, |
where W=2^{-ewma_log}. But cl->avgidle is scaled: |
cl->avgidle == true_avgidle/W, |
hence: |
*/ |
avgidle += idle - (avgidle>>cl->ewma_log); |
} |
|
if (avgidle <= 0) { |
/* Overlimit or at-limit */ |
|
if (avgidle < cl->minidle) |
avgidle = cl->minidle; |
|
cl->avgidle = avgidle; |
|
/* Calculate the expected time when this class |
   will be allowed to send. |
   It will occur when: |
   (1-W)*true_avgidle + W*delay = 0, i.e. |
   idle = (1/W - 1)*(-true_avgidle) |
   or |
   idle = (1 - W)*(-cl->avgidle); |
 */ |
idle = (-avgidle) - ((-avgidle) >> cl->ewma_log); |
|
/* |
   That is not all. |
   To maintain the rate allocated to the class, |
   we add to undertime the virtual clock time |
   necessary to complete the transmitted packet. |
   (len/phys_bandwidth has already elapsed by |
   the moment cbq_update runs.) |
 */ |
|
idle -= L2T(&q->link, len); |
idle += L2T(cl, len); |
|
PSCHED_AUDIT_TDIFF(idle); |
|
PSCHED_TADD2(q->now, idle, cl->undertime); |
} else { |
/* Underlimit */ |
|
PSCHED_SET_PASTPERFECT(cl->undertime); |
if (avgidle > cl->maxidle) |
cl->avgidle = cl->maxidle; |
else |
cl->avgidle = avgidle; |
} |
cl->last = q->now; |
} |
|
cbq_update_toplevel(q, this, q->tx_borrowed); |
} |
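|
/* An illustrative aside (not from the original sources): with |
 * W = 2^{-ewma_log} and cl->avgidle stored in the scaled form |
 * true_avgidle/W, the EWMA update described above, |
 * true_avgidle := (1-W)*true_avgidle + W*idle, |
 * reduces to one shift and two adds. The helper name below is |
 * hypothetical; it only restates the arithmetic used in cbq_update(). |
 */ |
#if 0	/* illustration only, not compiled */ |
static long cbq_scaled_ewma(long avgidle, long idle, int ewma_log) |
{ |
	return avgidle + idle - (avgidle >> ewma_log); |
} |
#endif |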
|
static __inline__ struct cbq_class * |
cbq_under_limit(struct cbq_class *cl) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; |
struct cbq_class *this_cl = cl; |
|
if (cl->tparent == NULL) |
return cl; |
|
if (PSCHED_IS_PASTPERFECT(cl->undertime) || |
!PSCHED_TLESS(q->now, cl->undertime)) { |
cl->delayed = 0; |
return cl; |
} |
|
do { |
/* This is a very suspicious place. Currently the overlimit |
   action is generated for non-bounded classes |
   only if the link is completely congested. |
   Though this agrees with the ancestor-only paradigm, |
   it looks very stupid. In particular, |
   it means that this chunk of code will either |
   never be called or will strongly amplify |
   burstiness. Dangerous and silly, but |
   no other solution exists. |
 */ |
if ((cl = cl->borrow) == NULL) { |
this_cl->stats.overlimits++; |
this_cl->overlimit(this_cl); |
return NULL; |
} |
if (cl->level > q->toplevel) |
return NULL; |
} while (!PSCHED_IS_PASTPERFECT(cl->undertime) && |
PSCHED_TLESS(q->now, cl->undertime)); |
|
cl->delayed = 0; |
return cl; |
} |
|
static __inline__ struct sk_buff * |
cbq_dequeue_prio(struct Qdisc *sch, int prio) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; |
struct cbq_class *cl_tail, *cl_prev, *cl; |
struct sk_buff *skb; |
int deficit; |
|
cl_tail = cl_prev = q->active[prio]; |
cl = cl_prev->next_alive; |
|
do { |
deficit = 0; |
|
/* Start round */ |
do { |
struct cbq_class *borrow = cl; |
|
if (cl->q->q.qlen && |
(borrow = cbq_under_limit(cl)) == NULL) |
goto skip_class; |
|
if (cl->deficit <= 0) { |
/* Class exhausted its allotment per |
this round. Switch to the next one. |
*/ |
deficit = 1; |
cl->deficit += cl->quantum; |
goto next_class; |
} |
|
skb = cl->q->dequeue(cl->q); |
|
/* The class did not give us any skb :-( |
   This can occur even if cl->q->q.qlen != 0, |
   e.g. if cl->q == "tbf". |
 */ |
if (skb == NULL) |
goto skip_class; |
|
cl->deficit -= skb->len; |
q->tx_class = cl; |
q->tx_borrowed = borrow; |
if (borrow != cl) { |
#ifndef CBQ_XSTATS_BORROWS_BYTES |
borrow->xstats.borrows++; |
cl->xstats.borrows++; |
#else |
borrow->xstats.borrows += skb->len; |
cl->xstats.borrows += skb->len; |
#endif |
} |
q->tx_len = skb->len; |
|
if (cl->deficit <= 0) { |
q->active[prio] = cl; |
cl = cl->next_alive; |
cl->deficit += cl->quantum; |
} |
return skb; |
|
skip_class: |
if (cl->q->q.qlen == 0 || prio != cl->cpriority) { |
/* Class is empty or penalized. |
Unlink it from active chain. |
*/ |
cl_prev->next_alive = cl->next_alive; |
cl->next_alive = NULL; |
|
/* Did cl_tail point to it? */ |
if (cl == cl_tail) { |
/* Repair it! */ |
cl_tail = cl_prev; |
|
/* Was it the last class in this band? */ |
if (cl == cl_tail) { |
/* Kill the band! */ |
q->active[prio] = NULL; |
q->activemask &= ~(1<<prio); |
if (cl->q->q.qlen) |
cbq_activate_class(cl); |
return NULL; |
} |
|
q->active[prio] = cl_tail; |
} |
if (cl->q->q.qlen) |
cbq_activate_class(cl); |
|
cl = cl_prev; |
} |
|
next_class: |
cl_prev = cl; |
cl = cl->next_alive; |
} while (cl_prev != cl_tail); |
} while (deficit); |
|
q->active[prio] = cl_prev; |
|
return NULL; |
} |
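|
/* An illustrative aside (not from the original sources): the dequeue loop |
 * above is a weighted round-robin with deficits. A class may send while its |
 * deficit is positive; once the deficit is exhausted it is topped up by the |
 * class quantum and the scheduler moves on. The helper below is a |
 * hypothetical restatement of that accounting, nothing more. |
 */ |
#if 0	/* illustration only, not compiled */ |
static int cbq_deficit_step(long *deficit, long quantum, unsigned int pktlen) |
{ |
	if (*deficit <= 0) { |
		*deficit += quantum;	/* new allotment for the next round */ |
		return 0;		/* caller switches to the next class */ |
	} |
	*deficit -= pktlen;		/* charge this packet against the allotment */ |
	return 1;			/* packet may be sent now */ |
} |
#endif |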
|
static __inline__ struct sk_buff * |
cbq_dequeue_1(struct Qdisc *sch) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; |
struct sk_buff *skb; |
unsigned activemask; |
|
activemask = q->activemask&0xFF; |
while (activemask) { |
int prio = ffz(~activemask); |
activemask &= ~(1<<prio); |
skb = cbq_dequeue_prio(sch, prio); |
if (skb) |
return skb; |
} |
return NULL; |
} |
|
static struct sk_buff * |
cbq_dequeue(struct Qdisc *sch) |
{ |
struct sk_buff *skb; |
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; |
psched_time_t now; |
psched_tdiff_t incr; |
|
PSCHED_GET_TIME(now); |
incr = PSCHED_TDIFF(now, q->now_rt); |
|
if (q->tx_class) { |
psched_tdiff_t incr2; |
/* Time integrator. We calculate EOS time |
   by adding the expected packet transmission time. |
   If real time is greater, we warp the artificial clock, |
   so that: |
|
cbq_time = max(real_time, work); |
*/ |
incr2 = L2T(&q->link, q->tx_len); |
PSCHED_TADD(q->now, incr2); |
cbq_update(q); |
if ((incr -= incr2) < 0) |
incr = 0; |
} |
PSCHED_TADD(q->now, incr); |
q->now_rt = now; |
|
for (;;) { |
q->wd_expires = 0; |
|
skb = cbq_dequeue_1(sch); |
if (skb) { |
sch->q.qlen--; |
sch->flags &= ~TCQ_F_THROTTLED; |
return skb; |
} |
|
/* All the classes are overlimit. |
|
   This is possible if: |
|
   1. The scheduler is empty. |
   2. The toplevel cutoff inhibited borrowing. |
   3. The root class is overlimit. |
|
   Reset the 2nd and 3rd conditions and retry. |
|
   Note that NS and cbq-2.0 are buggy here: peeking at |
   an arbitrary class is appropriate for ancestor-only |
   sharing, but not for the toplevel algorithm. |
|
   Our version is better but slower, because it requires |
   two passes; that is unavoidable with top-level sharing. |
 */ |
|
if (q->toplevel == TC_CBQ_MAXLEVEL && |
PSCHED_IS_PASTPERFECT(q->link.undertime)) |
break; |
|
q->toplevel = TC_CBQ_MAXLEVEL; |
PSCHED_SET_PASTPERFECT(q->link.undertime); |
} |
|
/* No packets in the scheduler, or nobody wants to give them to us :-( |
   Sigh... start the watchdog timer in the latter case. */ |
|
if (sch->q.qlen) { |
sch->stats.overlimits++; |
if (q->wd_expires && !netif_queue_stopped(sch->dev)) { |
long delay = PSCHED_US2JIFFIE(q->wd_expires); |
if (delay <= 0) |
delay = 1; |
mod_timer(&q->wd_timer, jiffies + delay); |
sch->flags |= TCQ_F_THROTTLED; |
} |
} |
return NULL; |
} |
|
/* CBQ class maintenance routines */ |
|
static void cbq_adjust_levels(struct cbq_class *this) |
{ |
if (this == NULL) |
return; |
|
do { |
int level = 0; |
struct cbq_class *cl; |
|
if ((cl = this->children) != NULL) { |
do { |
if (cl->level > level) |
level = cl->level; |
} while ((cl = cl->sibling) != this->children); |
} |
this->level = level+1; |
} while ((this = this->tparent) != NULL); |
} |
|
static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) |
{ |
struct cbq_class *cl; |
unsigned h; |
|
if (q->quanta[prio] == 0) |
return; |
|
for (h=0; h<16; h++) { |
for (cl = q->classes[h]; cl; cl = cl->next) { |
/* BUGGGG... Beware! This expression suffers from |
   arithmetic overflows! |
 */ |
if (cl->priority == prio) { |
cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ |
q->quanta[prio]; |
} |
if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) { |
printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum); |
cl->quantum = cl->qdisc->dev->mtu/2 + 1; |
} |
} |
} |
} |
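|
/* An illustrative aside (not from the original sources): the overflow the |
 * comment above warns about could be avoided by doing the multiplication in |
 * 64 bits, e.g. with do_div() from <asm/div64.h>. The helper name is |
 * hypothetical and only sketches the idea. |
 */ |
#if 0	/* illustration only, not compiled */ |
static long cbq_quantum64(u32 weight, u32 allot, u32 nclasses, u32 quanta) |
{ |
	u64 tmp = (u64)weight * allot * nclasses; |
|
	do_div(tmp, quanta);	/* 64-by-32 divide; tmp holds the quotient */ |
	return (long)tmp; |
} |
#endif |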
|
static void cbq_sync_defmap(struct cbq_class *cl) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; |
struct cbq_class *split = cl->split; |
unsigned h; |
int i; |
|
if (split == NULL) |
return; |
|
for (i=0; i<=TC_PRIO_MAX; i++) { |
if (split->defaults[i] == cl && !(cl->defmap&(1<<i))) |
split->defaults[i] = NULL; |
} |
|
for (i=0; i<=TC_PRIO_MAX; i++) { |
int level = split->level; |
|
if (split->defaults[i]) |
continue; |
|
for (h=0; h<16; h++) { |
struct cbq_class *c; |
|
for (c = q->classes[h]; c; c = c->next) { |
if (c->split == split && c->level < level && |
c->defmap&(1<<i)) { |
split->defaults[i] = c; |
level = c->level; |
} |
} |
} |
} |
} |
|
static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask) |
{ |
struct cbq_class *split = NULL; |
|
if (splitid == 0) { |
if ((split = cl->split) == NULL) |
return; |
splitid = split->classid; |
} |
|
if (split == NULL || split->classid != splitid) { |
for (split = cl->tparent; split; split = split->tparent) |
if (split->classid == splitid) |
break; |
} |
|
if (split == NULL) |
return; |
|
if (cl->split != split) { |
cl->defmap = 0; |
cbq_sync_defmap(cl); |
cl->split = split; |
cl->defmap = def&mask; |
} else |
cl->defmap = (cl->defmap&~mask)|(def&mask); |
|
cbq_sync_defmap(cl); |
} |
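|
/* A descriptive aside (not from the original sources): cl->defmap is a |
 * bitmap indexed by TC_PRIO_* values. A set bit i marks this class as a |
 * candidate default destination for priority i under its split node; |
 * cbq_sync_defmap() then picks, for each priority, the lowest-level such |
 * candidate below the split. |
 */ |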
|
static void cbq_unlink_class(struct cbq_class *this) |
{ |
struct cbq_class *cl, **clp; |
struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; |
|
for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) { |
if (cl == this) { |
*clp = cl->next; |
cl->next = NULL; |
break; |
} |
} |
|
if (this->tparent) { |
clp=&this->sibling; |
cl = *clp; |
do { |
if (cl == this) { |
*clp = cl->sibling; |
break; |
} |
clp = &cl->sibling; |
} while ((cl = *clp) != this->sibling); |
|
if (this->tparent->children == this) { |
this->tparent->children = this->sibling; |
if (this->sibling == this) |
this->tparent->children = NULL; |
} |
} else { |
BUG_TRAP(this->sibling == this); |
} |
} |
|
static void cbq_link_class(struct cbq_class *this) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; |
unsigned h = cbq_hash(this->classid); |
struct cbq_class *parent = this->tparent; |
|
this->sibling = this; |
this->next = q->classes[h]; |
q->classes[h] = this; |
|
if (parent == NULL) |
return; |
|
if (parent->children == NULL) { |
parent->children = this; |
} else { |
this->sibling = parent->children->sibling; |
parent->children->sibling = this; |
} |
} |
|
static unsigned int cbq_drop(struct Qdisc* sch) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; |
struct cbq_class *cl, *cl_head; |
int prio; |
unsigned int len; |
|
for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { |
if ((cl_head = q->active[prio]) == NULL) |
continue; |
|
cl = cl_head; |
do { |
if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) { |
sch->q.qlen--; |
return len; |
} |
} while ((cl = cl->next_alive) != cl_head); |
} |
return 0; |
} |
|
static void |
cbq_reset(struct Qdisc* sch) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; |
struct cbq_class *cl; |
int prio; |
unsigned h; |
|
q->activemask = 0; |
q->pmask = 0; |
q->tx_class = NULL; |
q->tx_borrowed = NULL; |
del_timer(&q->wd_timer); |
del_timer(&q->delay_timer); |
q->toplevel = TC_CBQ_MAXLEVEL; |
PSCHED_GET_TIME(q->now); |
q->now_rt = q->now; |
|
for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) |
q->active[prio] = NULL; |
|
for (h = 0; h < 16; h++) { |
for (cl = q->classes[h]; cl; cl = cl->next) { |
qdisc_reset(cl->q); |
|
cl->next_alive = NULL; |
PSCHED_SET_PASTPERFECT(cl->undertime); |
cl->avgidle = cl->maxidle; |
cl->deficit = cl->quantum; |
cl->cpriority = cl->priority; |
} |
} |
sch->q.qlen = 0; |
} |
|
|
static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) |
{ |
if (lss->change&TCF_CBQ_LSS_FLAGS) { |
cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; |
cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent; |
} |
if (lss->change&TCF_CBQ_LSS_EWMA) |
cl->ewma_log = lss->ewma_log; |
if (lss->change&TCF_CBQ_LSS_AVPKT) |
cl->avpkt = lss->avpkt; |
if (lss->change&TCF_CBQ_LSS_MINIDLE) |
cl->minidle = -(long)lss->minidle; |
if (lss->change&TCF_CBQ_LSS_MAXIDLE) { |
cl->maxidle = lss->maxidle; |
cl->avgidle = lss->maxidle; |
} |
if (lss->change&TCF_CBQ_LSS_OFFTIME) |
cl->offtime = lss->offtime; |
return 0; |
} |
|
static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl) |
{ |
q->nclasses[cl->priority]--; |
q->quanta[cl->priority] -= cl->weight; |
cbq_normalize_quanta(q, cl->priority); |
} |
|
static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl) |
{ |
q->nclasses[cl->priority]++; |
q->quanta[cl->priority] += cl->weight; |
cbq_normalize_quanta(q, cl->priority); |
} |
|
static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data; |
|
if (wrr->allot) |
cl->allot = wrr->allot; |
if (wrr->weight) |
cl->weight = wrr->weight; |
if (wrr->priority) { |
cl->priority = wrr->priority-1; |
cl->cpriority = cl->priority; |
if (cl->priority >= cl->priority2) |
cl->priority2 = TC_CBQ_MAXPRIO-1; |
} |
|
cbq_addprio(q, cl); |
return 0; |
} |
|
static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) |
{ |
switch (ovl->strategy) { |
case TC_CBQ_OVL_CLASSIC: |
cl->overlimit = cbq_ovl_classic; |
break; |
case TC_CBQ_OVL_DELAY: |
cl->overlimit = cbq_ovl_delay; |
break; |
case TC_CBQ_OVL_LOWPRIO: |
if (ovl->priority2-1 >= TC_CBQ_MAXPRIO || |
ovl->priority2-1 <= cl->priority) |
return -EINVAL; |
cl->priority2 = ovl->priority2-1; |
cl->overlimit = cbq_ovl_lowprio; |
break; |
case TC_CBQ_OVL_DROP: |
cl->overlimit = cbq_ovl_drop; |
break; |
case TC_CBQ_OVL_RCLASSIC: |
cl->overlimit = cbq_ovl_rclassic; |
break; |
default: |
return -EINVAL; |
} |
cl->penalty = (ovl->penalty*HZ)/1000; |
return 0; |
} |
|
#ifdef CONFIG_NET_CLS_POLICE |
static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p) |
{ |
cl->police = p->police; |
|
if (cl->q->handle) { |
if (p->police == TC_POLICE_RECLASSIFY) |
cl->q->reshape_fail = cbq_reshape_fail; |
else |
cl->q->reshape_fail = NULL; |
} |
return 0; |
} |
#endif |
|
static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) |
{ |
cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange); |
return 0; |
} |
|
static int cbq_init(struct Qdisc *sch, struct rtattr *opt) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; |
struct rtattr *tb[TCA_CBQ_MAX]; |
struct tc_ratespec *r; |
|
if (rtattr_parse(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0 || |
tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || |
RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) |
return -EINVAL; |
|
if (tb[TCA_CBQ_LSSOPT-1] && |
RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) |
return -EINVAL; |
|
r = RTA_DATA(tb[TCA_CBQ_RATE-1]); |
|
MOD_INC_USE_COUNT; |
if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL) { |
MOD_DEC_USE_COUNT; |
return -EINVAL; |
} |
|
q->link.refcnt = 1; |
q->link.sibling = &q->link; |
q->link.classid = sch->handle; |
q->link.qdisc = sch; |
if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) |
q->link.q = &noop_qdisc; |
|
q->link.priority = TC_CBQ_MAXPRIO-1; |
q->link.priority2 = TC_CBQ_MAXPRIO-1; |
q->link.cpriority = TC_CBQ_MAXPRIO-1; |
q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC; |
q->link.overlimit = cbq_ovl_classic; |
q->link.allot = psched_mtu(sch->dev); |
q->link.quantum = q->link.allot; |
q->link.weight = q->link.R_tab->rate.rate; |
|
q->link.ewma_log = TC_CBQ_DEF_EWMA; |
q->link.avpkt = q->link.allot/2; |
q->link.minidle = -0x7FFFFFFF; |
q->link.stats.lock = &sch->dev->queue_lock; |
|
init_timer(&q->wd_timer); |
q->wd_timer.data = (unsigned long)sch; |
q->wd_timer.function = cbq_watchdog; |
init_timer(&q->delay_timer); |
q->delay_timer.data = (unsigned long)sch; |
q->delay_timer.function = cbq_undelay; |
q->toplevel = TC_CBQ_MAXLEVEL; |
PSCHED_GET_TIME(q->now); |
q->now_rt = q->now; |
|
cbq_link_class(&q->link); |
|
if (tb[TCA_CBQ_LSSOPT-1]) |
cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); |
|
cbq_addprio(q, &q->link); |
return 0; |
} |
|
static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) |
{ |
unsigned char *b = skb->tail; |
|
RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) |
{ |
unsigned char *b = skb->tail; |
struct tc_cbq_lssopt opt; |
|
opt.flags = 0; |
if (cl->borrow == NULL) |
opt.flags |= TCF_CBQ_LSS_BOUNDED; |
if (cl->share == NULL) |
opt.flags |= TCF_CBQ_LSS_ISOLATED; |
opt.ewma_log = cl->ewma_log; |
opt.level = cl->level; |
opt.avpkt = cl->avpkt; |
opt.maxidle = cl->maxidle; |
opt.minidle = (u32)(-cl->minidle); |
opt.offtime = cl->offtime; |
opt.change = ~0; |
RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) |
{ |
unsigned char *b = skb->tail; |
struct tc_cbq_wrropt opt; |
|
opt.flags = 0; |
opt.allot = cl->allot; |
opt.priority = cl->priority+1; |
opt.cpriority = cl->cpriority+1; |
opt.weight = cl->weight; |
RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) |
{ |
unsigned char *b = skb->tail; |
struct tc_cbq_ovl opt; |
|
opt.strategy = cl->ovl_strategy; |
opt.priority2 = cl->priority2+1; |
opt.penalty = (cl->penalty*1000)/HZ; |
RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) |
{ |
unsigned char *b = skb->tail; |
struct tc_cbq_fopt opt; |
|
if (cl->split || cl->defmap) { |
opt.split = cl->split ? cl->split->classid : 0; |
opt.defmap = cl->defmap; |
opt.defchange = ~0; |
RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); |
} |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
#ifdef CONFIG_NET_CLS_POLICE |
static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) |
{ |
unsigned char *b = skb->tail; |
struct tc_cbq_police opt; |
|
if (cl->police) { |
opt.police = cl->police; |
RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); |
} |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
#endif |
|
static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) |
{ |
if (cbq_dump_lss(skb, cl) < 0 || |
cbq_dump_rate(skb, cl) < 0 || |
cbq_dump_wrr(skb, cl) < 0 || |
cbq_dump_ovl(skb, cl) < 0 || |
#ifdef CONFIG_NET_CLS_POLICE |
cbq_dump_police(skb, cl) < 0 || |
#endif |
cbq_dump_fopt(skb, cl) < 0) |
return -1; |
return 0; |
} |
|
int cbq_copy_xstats(struct sk_buff *skb, struct tc_cbq_xstats *st) |
{ |
RTA_PUT(skb, TCA_XSTATS, sizeof(*st), st); |
return 0; |
|
rtattr_failure: |
return -1; |
} |
|
|
static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; |
unsigned char *b = skb->tail; |
struct rtattr *rta; |
|
rta = (struct rtattr*)b; |
RTA_PUT(skb, TCA_OPTIONS, 0, NULL); |
if (cbq_dump_attr(skb, &q->link) < 0) |
goto rtattr_failure; |
rta->rta_len = skb->tail - b; |
spin_lock_bh(&sch->dev->queue_lock); |
q->link.xstats.avgidle = q->link.avgidle; |
if (cbq_copy_xstats(skb, &q->link.xstats)) { |
spin_unlock_bh(&sch->dev->queue_lock); |
goto rtattr_failure; |
} |
spin_unlock_bh(&sch->dev->queue_lock); |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static int |
cbq_dump_class(struct Qdisc *sch, unsigned long arg, |
struct sk_buff *skb, struct tcmsg *tcm) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; |
struct cbq_class *cl = (struct cbq_class*)arg; |
unsigned char *b = skb->tail; |
struct rtattr *rta; |
|
if (cl->tparent) |
tcm->tcm_parent = cl->tparent->classid; |
else |
tcm->tcm_parent = TC_H_ROOT; |
tcm->tcm_handle = cl->classid; |
tcm->tcm_info = cl->q->handle; |
|
rta = (struct rtattr*)b; |
RTA_PUT(skb, TCA_OPTIONS, 0, NULL); |
if (cbq_dump_attr(skb, cl) < 0) |
goto rtattr_failure; |
rta->rta_len = skb->tail - b; |
cl->stats.qlen = cl->q->q.qlen; |
if (qdisc_copy_stats(skb, &cl->stats)) |
goto rtattr_failure; |
spin_lock_bh(&sch->dev->queue_lock); |
cl->xstats.avgidle = cl->avgidle; |
cl->xstats.undertime = 0; |
if (!PSCHED_IS_PASTPERFECT(cl->undertime)) |
cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now); |
q->link.xstats.avgidle = q->link.avgidle; |
if (cbq_copy_xstats(skb, &cl->xstats)) { |
spin_unlock_bh(&sch->dev->queue_lock); |
goto rtattr_failure; |
} |
spin_unlock_bh(&sch->dev->queue_lock); |
|
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, |
struct Qdisc **old) |
{ |
struct cbq_class *cl = (struct cbq_class*)arg; |
|
if (cl) { |
if (new == NULL) { |
if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)) == NULL) |
return -ENOBUFS; |
} else { |
#ifdef CONFIG_NET_CLS_POLICE |
if (cl->police == TC_POLICE_RECLASSIFY) |
new->reshape_fail = cbq_reshape_fail; |
#endif |
} |
sch_tree_lock(sch); |
*old = cl->q; |
cl->q = new; |
sch->q.qlen -= (*old)->q.qlen; |
qdisc_reset(*old); |
sch_tree_unlock(sch); |
|
return 0; |
} |
return -ENOENT; |
} |
|
static struct Qdisc * |
cbq_leaf(struct Qdisc *sch, unsigned long arg) |
{ |
struct cbq_class *cl = (struct cbq_class*)arg; |
|
return cl ? cl->q : NULL; |
} |
|
static unsigned long cbq_get(struct Qdisc *sch, u32 classid) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; |
struct cbq_class *cl = cbq_class_lookup(q, classid); |
|
if (cl) { |
cl->refcnt++; |
return (unsigned long)cl; |
} |
return 0; |
} |
|
static void cbq_destroy_filters(struct cbq_class *cl) |
{ |
struct tcf_proto *tp; |
|
while ((tp = cl->filter_list) != NULL) { |
cl->filter_list = tp->next; |
tcf_destroy(tp); |
} |
} |
|
static void cbq_destroy_class(struct cbq_class *cl) |
{ |
cbq_destroy_filters(cl); |
qdisc_destroy(cl->q); |
qdisc_put_rtab(cl->R_tab); |
#ifdef CONFIG_NET_ESTIMATOR |
qdisc_kill_estimator(&cl->stats); |
#endif |
kfree(cl); |
} |
|
static void |
cbq_destroy(struct Qdisc* sch) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; |
struct cbq_class *cl; |
unsigned h; |
|
#ifdef CONFIG_NET_CLS_POLICE |
q->rx_class = NULL; |
#endif |
for (h = 0; h < 16; h++) { |
for (cl = q->classes[h]; cl; cl = cl->next) |
cbq_destroy_filters(cl); |
} |
|
for (h = 0; h < 16; h++) { |
struct cbq_class *next; |
|
for (cl = q->classes[h]; cl; cl = next) { |
next = cl->next; |
if (cl != &q->link) |
cbq_destroy_class(cl); |
} |
} |
|
qdisc_put_rtab(q->link.R_tab); |
MOD_DEC_USE_COUNT; |
} |
|
static void cbq_put(struct Qdisc *sch, unsigned long arg) |
{ |
struct cbq_class *cl = (struct cbq_class*)arg; |
|
if (--cl->refcnt == 0) { |
#ifdef CONFIG_NET_CLS_POLICE |
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; |
|
spin_lock_bh(&sch->dev->queue_lock); |
if (q->rx_class == cl) |
q->rx_class = NULL; |
spin_unlock_bh(&sch->dev->queue_lock); |
#endif |
|
cbq_destroy_class(cl); |
} |
} |
|
static int |
cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca, |
unsigned long *arg) |
{ |
int err; |
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; |
struct cbq_class *cl = (struct cbq_class*)*arg; |
struct rtattr *opt = tca[TCA_OPTIONS-1]; |
struct rtattr *tb[TCA_CBQ_MAX]; |
struct cbq_class *parent; |
struct qdisc_rate_table *rtab = NULL; |
|
if (opt==NULL || |
rtattr_parse(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt))) |
return -EINVAL; |
|
if (tb[TCA_CBQ_OVL_STRATEGY-1] && |
RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl)) |
return -EINVAL; |
|
if (tb[TCA_CBQ_FOPT-1] && |
RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt)) |
return -EINVAL; |
|
if (tb[TCA_CBQ_RATE-1] && |
RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) |
return -EINVAL; |
|
if (tb[TCA_CBQ_LSSOPT-1] && |
RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) |
return -EINVAL; |
|
if (tb[TCA_CBQ_WRROPT-1] && |
RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt)) |
return -EINVAL; |
|
#ifdef CONFIG_NET_CLS_POLICE |
if (tb[TCA_CBQ_POLICE-1] && |
RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police)) |
return -EINVAL; |
#endif |
|
if (cl) { |
/* Check parent */ |
if (parentid) { |
if (cl->tparent && cl->tparent->classid != parentid) |
return -EINVAL; |
if (!cl->tparent && parentid != TC_H_ROOT) |
return -EINVAL; |
} |
|
if (tb[TCA_CBQ_RATE-1]) { |
rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); |
if (rtab == NULL) |
return -EINVAL; |
} |
|
/* Change class parameters */ |
sch_tree_lock(sch); |
|
if (cl->next_alive != NULL) |
cbq_deactivate_class(cl); |
|
if (rtab) { |
rtab = xchg(&cl->R_tab, rtab); |
qdisc_put_rtab(rtab); |
} |
|
if (tb[TCA_CBQ_LSSOPT-1]) |
cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); |
|
if (tb[TCA_CBQ_WRROPT-1]) { |
cbq_rmprio(q, cl); |
cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); |
} |
|
if (tb[TCA_CBQ_OVL_STRATEGY-1]) |
cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); |
|
#ifdef CONFIG_NET_CLS_POLICE |
if (tb[TCA_CBQ_POLICE-1]) |
cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); |
#endif |
|
if (tb[TCA_CBQ_FOPT-1]) |
cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); |
|
if (cl->q->q.qlen) |
cbq_activate_class(cl); |
|
sch_tree_unlock(sch); |
|
#ifdef CONFIG_NET_ESTIMATOR |
if (tca[TCA_RATE-1]) { |
qdisc_kill_estimator(&cl->stats); |
qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]); |
} |
#endif |
return 0; |
} |
|
if (parentid == TC_H_ROOT) |
return -EINVAL; |
|
if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || |
tb[TCA_CBQ_LSSOPT-1] == NULL) |
return -EINVAL; |
|
rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); |
if (rtab == NULL) |
return -EINVAL; |
|
if (classid) { |
err = -EINVAL; |
if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid)) |
goto failure; |
} else { |
int i; |
classid = TC_H_MAKE(sch->handle,0x8000); |
|
for (i=0; i<0x8000; i++) { |
if (++q->hgenerator >= 0x8000) |
q->hgenerator = 1; |
if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) |
break; |
} |
err = -ENOSR; |
if (i >= 0x8000) |
goto failure; |
classid = classid|q->hgenerator; |
} |
|
parent = &q->link; |
if (parentid) { |
parent = cbq_class_lookup(q, parentid); |
err = -EINVAL; |
if (parent == NULL) |
goto failure; |
} |
|
err = -ENOBUFS; |
cl = kmalloc(sizeof(*cl), GFP_KERNEL); |
if (cl == NULL) |
goto failure; |
memset(cl, 0, sizeof(*cl)); |
cl->R_tab = rtab; |
rtab = NULL; |
cl->refcnt = 1; |
if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) |
cl->q = &noop_qdisc; |
cl->classid = classid; |
cl->tparent = parent; |
cl->qdisc = sch; |
cl->allot = parent->allot; |
cl->quantum = cl->allot; |
cl->weight = cl->R_tab->rate.rate; |
cl->stats.lock = &sch->dev->queue_lock; |
|
sch_tree_lock(sch); |
cbq_link_class(cl); |
cl->borrow = cl->tparent; |
if (cl->tparent != &q->link) |
cl->share = cl->tparent; |
cbq_adjust_levels(parent); |
cl->minidle = -0x7FFFFFFF; |
cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); |
cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); |
if (cl->ewma_log==0) |
cl->ewma_log = q->link.ewma_log; |
if (cl->maxidle==0) |
cl->maxidle = q->link.maxidle; |
if (cl->avpkt==0) |
cl->avpkt = q->link.avpkt; |
cl->overlimit = cbq_ovl_classic; |
if (tb[TCA_CBQ_OVL_STRATEGY-1]) |
cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); |
#ifdef CONFIG_NET_CLS_POLICE |
if (tb[TCA_CBQ_POLICE-1]) |
cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); |
#endif |
if (tb[TCA_CBQ_FOPT-1]) |
cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); |
sch_tree_unlock(sch); |
|
#ifdef CONFIG_NET_ESTIMATOR |
if (tca[TCA_RATE-1]) |
qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]); |
#endif |
|
*arg = (unsigned long)cl; |
return 0; |
|
failure: |
qdisc_put_rtab(rtab); |
return err; |
} |
|
static int cbq_delete(struct Qdisc *sch, unsigned long arg) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; |
struct cbq_class *cl = (struct cbq_class*)arg; |
|
if (cl->filters || cl->children || cl == &q->link) |
return -EBUSY; |
|
sch_tree_lock(sch); |
|
if (cl->next_alive) |
cbq_deactivate_class(cl); |
|
if (q->tx_borrowed == cl) |
q->tx_borrowed = q->tx_class; |
if (q->tx_class == cl) { |
q->tx_class = NULL; |
q->tx_borrowed = NULL; |
} |
#ifdef CONFIG_NET_CLS_POLICE |
if (q->rx_class == cl) |
q->rx_class = NULL; |
#endif |
|
cbq_unlink_class(cl); |
cbq_adjust_levels(cl->tparent); |
cl->defmap = 0; |
cbq_sync_defmap(cl); |
|
cbq_rmprio(q, cl); |
sch_tree_unlock(sch); |
|
if (--cl->refcnt == 0) |
cbq_destroy_class(cl); |
|
return 0; |
} |
|
static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; |
struct cbq_class *cl = (struct cbq_class *)arg; |
|
if (cl == NULL) |
cl = &q->link; |
|
return &cl->filter_list; |
} |
|
static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, |
u32 classid) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; |
struct cbq_class *p = (struct cbq_class*)parent; |
struct cbq_class *cl = cbq_class_lookup(q, classid); |
|
if (cl) { |
if (p && p->level <= cl->level) |
return 0; |
cl->filters++; |
return (unsigned long)cl; |
} |
return 0; |
} |
|
static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) |
{ |
struct cbq_class *cl = (struct cbq_class*)arg; |
|
cl->filters--; |
} |
|
static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) |
{ |
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; |
unsigned h; |
|
if (arg->stop) |
return; |
|
for (h = 0; h < 16; h++) { |
struct cbq_class *cl; |
|
for (cl = q->classes[h]; cl; cl = cl->next) { |
if (arg->count < arg->skip) { |
arg->count++; |
continue; |
} |
if (arg->fn(sch, (unsigned long)cl, arg) < 0) { |
arg->stop = 1; |
return; |
} |
arg->count++; |
} |
} |
} |
|
static struct Qdisc_class_ops cbq_class_ops = |
{ |
cbq_graft, |
cbq_leaf, |
cbq_get, |
cbq_put, |
cbq_change_class, |
cbq_delete, |
cbq_walk, |
|
cbq_find_tcf, |
cbq_bind_filter, |
cbq_unbind_filter, |
|
cbq_dump_class, |
}; |
|
struct Qdisc_ops cbq_qdisc_ops = |
{ |
NULL, |
&cbq_class_ops, |
"cbq", |
sizeof(struct cbq_sched_data), |
|
cbq_enqueue, |
cbq_dequeue, |
cbq_requeue, |
cbq_drop, |
|
cbq_init, |
cbq_reset, |
cbq_destroy, |
NULL /* cbq_change */, |
|
cbq_dump, |
}; |
|
#ifdef MODULE |
int init_module(void) |
{ |
return register_qdisc(&cbq_qdisc_ops); |
} |
|
void cleanup_module(void) |
{ |
unregister_qdisc(&cbq_qdisc_ops); |
} |
#endif |
MODULE_LICENSE("GPL"); |
/sch_teql.c
0,0 → 1,496
/* net/sched/sch_teql.c "True" (or "trivial") link equalizer. |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
*/ |
|
#include <linux/module.h> |
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/if_ether.h> |
#include <linux/inet.h> |
#include <linux/netdevice.h> |
#include <linux/etherdevice.h> |
#include <linux/notifier.h> |
#include <linux/init.h> |
#include <net/ip.h> |
#include <net/route.h> |
#include <linux/skbuff.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
/* |
How to set it up. |
---------------- |
|
After loading this module you will find a new device teqlN |
and a new qdisc with the same name. To join a slave to the equalizer, |
just set this qdisc as the root qdisc on a device, e.g. |
|
# tc qdisc add dev eth0 root teql0 |
# tc qdisc add dev eth1 root teql0 |
|
That's all. Full PnP 8) |
|
Applicability. |
-------------- |
|
1. Slave devices MUST be active devices, i.e., they must raise the tbusy |
signal and generate EOI events. If you want to equalize virtual devices |
like tunnels, use a normal eql device. |
2. This device puts no limitations on physical slave characteristics, |
e.g. it will equalize a 9600 baud line and 100Mb ethernet perfectly :-) |
Certainly, a large difference in link speeds will make the resulting |
equalized link unusable, because of huge packet reordering. |
I estimate the upper useful difference at ~10 times. |
3. If the slave requires address resolution, only protocols using |
neighbour cache (IPv4/IPv6) will work over the equalized link. |
Other protocols are still allowed to use the slave device directly, |
which will not break load balancing, though native slave |
traffic will have the highest priority. */ |
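|
/* A usage note (not from the original sources): once the slaves have been |
   attached as shown above, the teql0 master still has to be brought up and |
   given an address like any other interface, for example: |
|
   # ip link set dev teql0 up |
   # ip addr add 10.0.0.1/24 dev teql0 |
|
   The address here is only a placeholder for the illustration. |
 */ |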
|
struct teql_master |
{ |
struct Qdisc_ops qops; |
struct net_device dev; |
struct Qdisc *slaves; |
struct net_device_stats stats; |
}; |
|
struct teql_sched_data |
{ |
struct Qdisc *next; |
struct teql_master *m; |
struct neighbour *ncache; |
struct sk_buff_head q; |
}; |
|
#define NEXT_SLAVE(q) (((struct teql_sched_data*)((q)->data))->next) |
|
#define FMASK (IFF_BROADCAST|IFF_POINTOPOINT|IFF_BROADCAST) |
|
/* "teql*" qdisc routines */ |
|
static int |
teql_enqueue(struct sk_buff *skb, struct Qdisc* sch) |
{ |
struct net_device *dev = sch->dev; |
struct teql_sched_data *q = (struct teql_sched_data *)sch->data; |
|
__skb_queue_tail(&q->q, skb); |
if (q->q.qlen <= dev->tx_queue_len) { |
sch->stats.bytes += skb->len; |
sch->stats.packets++; |
return 0; |
} |
|
__skb_unlink(skb, &q->q); |
kfree_skb(skb); |
sch->stats.drops++; |
return NET_XMIT_DROP; |
} |
|
static int |
teql_requeue(struct sk_buff *skb, struct Qdisc* sch) |
{ |
struct teql_sched_data *q = (struct teql_sched_data *)sch->data; |
|
__skb_queue_head(&q->q, skb); |
return 0; |
} |
|
static struct sk_buff * |
teql_dequeue(struct Qdisc* sch) |
{ |
struct teql_sched_data *dat = (struct teql_sched_data *)sch->data; |
struct sk_buff *skb; |
|
skb = __skb_dequeue(&dat->q); |
if (skb == NULL) { |
struct net_device *m = dat->m->dev.qdisc->dev; |
if (m) { |
dat->m->slaves = sch; |
netif_wake_queue(m); |
} |
} |
sch->q.qlen = dat->q.qlen + dat->m->dev.qdisc->q.qlen; |
return skb; |
} |
|
static __inline__ void |
teql_neigh_release(struct neighbour *n) |
{ |
if (n) |
neigh_release(n); |
} |
|
static void |
teql_reset(struct Qdisc* sch) |
{ |
struct teql_sched_data *dat = (struct teql_sched_data *)sch->data; |
|
skb_queue_purge(&dat->q); |
sch->q.qlen = 0; |
teql_neigh_release(xchg(&dat->ncache, NULL)); |
} |
|
static void |
teql_destroy(struct Qdisc* sch) |
{ |
struct Qdisc *q, *prev; |
struct teql_sched_data *dat = (struct teql_sched_data *)sch->data; |
struct teql_master *master = dat->m; |
|
if ((prev = master->slaves) != NULL) { |
do { |
q = NEXT_SLAVE(prev); |
if (q == sch) { |
NEXT_SLAVE(prev) = NEXT_SLAVE(q); |
if (q == master->slaves) { |
master->slaves = NEXT_SLAVE(q); |
if (q == master->slaves) { |
master->slaves = NULL; |
spin_lock_bh(&master->dev.queue_lock); |
qdisc_reset(master->dev.qdisc); |
spin_unlock_bh(&master->dev.queue_lock); |
} |
} |
skb_queue_purge(&dat->q); |
teql_neigh_release(xchg(&dat->ncache, NULL)); |
break; |
} |
|
} while ((prev = q) != master->slaves); |
} |
|
MOD_DEC_USE_COUNT; |
} |
|
static int teql_qdisc_init(struct Qdisc *sch, struct rtattr *opt) |
{ |
struct net_device *dev = sch->dev; |
struct teql_master *m = (struct teql_master*)sch->ops; |
struct teql_sched_data *q = (struct teql_sched_data *)sch->data; |
|
if (dev->hard_header_len > m->dev.hard_header_len) |
return -EINVAL; |
|
if (&m->dev == dev) |
return -ELOOP; |
|
q->m = m; |
|
skb_queue_head_init(&q->q); |
|
if (m->slaves) { |
if (m->dev.flags & IFF_UP) { |
if ((m->dev.flags&IFF_POINTOPOINT && !(dev->flags&IFF_POINTOPOINT)) |
|| (m->dev.flags&IFF_BROADCAST && !(dev->flags&IFF_BROADCAST)) |
|| (m->dev.flags&IFF_MULTICAST && !(dev->flags&IFF_MULTICAST)) |
|| dev->mtu < m->dev.mtu) |
return -EINVAL; |
} else { |
if (!(dev->flags&IFF_POINTOPOINT)) |
m->dev.flags &= ~IFF_POINTOPOINT; |
if (!(dev->flags&IFF_BROADCAST)) |
m->dev.flags &= ~IFF_BROADCAST; |
if (!(dev->flags&IFF_MULTICAST)) |
m->dev.flags &= ~IFF_MULTICAST; |
if (dev->mtu < m->dev.mtu) |
m->dev.mtu = dev->mtu; |
} |
q->next = NEXT_SLAVE(m->slaves); |
NEXT_SLAVE(m->slaves) = sch; |
} else { |
q->next = sch; |
m->slaves = sch; |
m->dev.mtu = dev->mtu; |
m->dev.flags = (m->dev.flags&~FMASK)|(dev->flags&FMASK); |
} |
|
MOD_INC_USE_COUNT; |
return 0; |
} |
|
/* "teql*" netdevice routines */ |
|
static int |
__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev) |
{ |
struct teql_sched_data *q = (void*)dev->qdisc->data; |
struct neighbour *mn = skb->dst->neighbour; |
struct neighbour *n = q->ncache; |
|
if (mn->tbl == NULL) |
return -EINVAL; |
if (n && n->tbl == mn->tbl && |
memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) { |
atomic_inc(&n->refcnt); |
} else { |
n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev); |
if (IS_ERR(n)) |
return PTR_ERR(n); |
} |
if (neigh_event_send(n, skb_res) == 0) { |
int err; |
read_lock(&n->lock); |
err = dev->hard_header(skb, dev, ntohs(skb->protocol), n->ha, NULL, skb->len); |
read_unlock(&n->lock); |
if (err < 0) { |
neigh_release(n); |
return -EINVAL; |
} |
teql_neigh_release(xchg(&q->ncache, n)); |
return 0; |
} |
neigh_release(n); |
return (skb_res == NULL) ? -EAGAIN : 1; |
} |
|
static __inline__ int |
teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev) |
{ |
if (dev->hard_header == NULL || |
skb->dst == NULL || |
skb->dst->neighbour == NULL) |
return 0; |
return __teql_resolve(skb, skb_res, dev); |
} |
|
static int teql_master_xmit(struct sk_buff *skb, struct net_device *dev) |
{ |
struct teql_master *master = (void*)dev->priv; |
struct Qdisc *start, *q; |
int busy; |
int nores; |
int len = skb->len; |
struct sk_buff *skb_res = NULL; |
|
start = master->slaves; |
|
restart: |
nores = 0; |
busy = 0; |
|
if ((q = start) == NULL) |
goto drop; |
|
do { |
struct net_device *slave = q->dev; |
|
if (slave->qdisc_sleeping != q) |
continue; |
if (netif_queue_stopped(slave) || ! netif_running(slave)) { |
busy = 1; |
continue; |
} |
|
switch (teql_resolve(skb, skb_res, slave)) { |
case 0: |
if (spin_trylock(&slave->xmit_lock)) { |
slave->xmit_lock_owner = smp_processor_id(); |
if (!netif_queue_stopped(slave) && |
slave->hard_start_xmit(skb, slave) == 0) { |
slave->xmit_lock_owner = -1; |
spin_unlock(&slave->xmit_lock); |
master->slaves = NEXT_SLAVE(q); |
netif_wake_queue(dev); |
master->stats.tx_packets++; |
master->stats.tx_bytes += len; |
return 0; |
} |
slave->xmit_lock_owner = -1; |
spin_unlock(&slave->xmit_lock); |
} |
if (netif_queue_stopped(dev)) |
busy = 1; |
break; |
case 1: |
master->slaves = NEXT_SLAVE(q); |
return 0; |
default: |
nores = 1; |
break; |
} |
__skb_pull(skb, skb->nh.raw - skb->data); |
} while ((q = NEXT_SLAVE(q)) != start); |
|
if (nores && skb_res == NULL) { |
skb_res = skb; |
goto restart; |
} |
|
if (busy) { |
netif_stop_queue(dev); |
return 1; |
} |
master->stats.tx_errors++; |
|
drop: |
master->stats.tx_dropped++; |
dev_kfree_skb(skb); |
return 0; |
} |
|
static int teql_master_open(struct net_device *dev) |
{ |
struct Qdisc * q; |
struct teql_master *m = (void*)dev->priv; |
int mtu = 0xFFFE; |
unsigned flags = IFF_NOARP|IFF_MULTICAST; |
|
if (m->slaves == NULL) |
return -EUNATCH; |
|
flags = FMASK; |
|
q = m->slaves; |
do { |
struct net_device *slave = q->dev; |
|
if (slave == NULL) |
return -EUNATCH; |
|
if (slave->mtu < mtu) |
mtu = slave->mtu; |
if (slave->hard_header_len > LL_MAX_HEADER) |
return -EINVAL; |
|
/* If all the slaves are BROADCAST, the master is BROADCAST. |
   If all the slaves are PtP, the master is PtP. |
   Otherwise, the master is NBMA. |
*/ |
if (!(slave->flags&IFF_POINTOPOINT)) |
flags &= ~IFF_POINTOPOINT; |
if (!(slave->flags&IFF_BROADCAST)) |
flags &= ~IFF_BROADCAST; |
if (!(slave->flags&IFF_MULTICAST)) |
flags &= ~IFF_MULTICAST; |
} while ((q = NEXT_SLAVE(q)) != m->slaves); |
|
m->dev.mtu = mtu; |
m->dev.flags = (m->dev.flags&~FMASK) | flags; |
netif_start_queue(&m->dev); |
MOD_INC_USE_COUNT; |
return 0; |
} |
|
static int teql_master_close(struct net_device *dev) |
{ |
netif_stop_queue(dev); |
MOD_DEC_USE_COUNT; |
return 0; |
} |
|
static struct net_device_stats *teql_master_stats(struct net_device *dev) |
{ |
struct teql_master *m = (void*)dev->priv; |
return &m->stats; |
} |
|
static int teql_master_mtu(struct net_device *dev, int new_mtu) |
{ |
struct teql_master *m = (void*)dev->priv; |
struct Qdisc *q; |
|
if (new_mtu < 68) |
return -EINVAL; |
|
q = m->slaves; |
if (q) { |
do { |
if (new_mtu > q->dev->mtu) |
return -EINVAL; |
} while ((q=NEXT_SLAVE(q)) != m->slaves); |
} |
|
dev->mtu = new_mtu; |
return 0; |
} |
|
static int teql_master_init(struct net_device *dev) |
{ |
dev->open = teql_master_open; |
dev->hard_start_xmit = teql_master_xmit; |
dev->stop = teql_master_close; |
dev->get_stats = teql_master_stats; |
dev->change_mtu = teql_master_mtu; |
dev->type = ARPHRD_VOID; |
dev->mtu = 1500; |
dev->tx_queue_len = 100; |
dev->flags = IFF_NOARP; |
dev->hard_header_len = LL_MAX_HEADER; |
return 0; |
} |
|
static struct teql_master the_master = { |
{ |
NULL, |
NULL, |
"", |
sizeof(struct teql_sched_data), |
|
teql_enqueue, |
teql_dequeue, |
teql_requeue, |
NULL, |
|
teql_qdisc_init, |
teql_reset, |
teql_destroy, |
NULL, |
},}; |
|
|
#ifdef MODULE |
int init_module(void) |
#else |
int __init teql_init(void) |
#endif |
{ |
int err; |
|
rtnl_lock(); |
|
the_master.dev.priv = (void*)&the_master; |
err = dev_alloc_name(&the_master.dev, "teql%d"); |
if (err < 0) |
return err; |
memcpy(the_master.qops.id, the_master.dev.name, IFNAMSIZ); |
the_master.dev.init = teql_master_init; |
|
err = register_netdevice(&the_master.dev); |
if (err == 0) { |
err = register_qdisc(&the_master.qops); |
if (err) |
unregister_netdevice(&the_master.dev); |
} |
rtnl_unlock(); |
return err; |
} |
|
#ifdef MODULE |
void cleanup_module(void) |
{ |
rtnl_lock(); |
unregister_qdisc(&the_master.qops); |
unregister_netdevice(&the_master.dev); |
rtnl_unlock(); |
} |
#endif |
MODULE_LICENSE("GPL"); |
/sch_api.c
0,0 → 1,1256
/* |
* net/sched/sch_api.c Packet scheduler API. |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
* |
* Fixes: |
* |
* Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired. |
* Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support |
* Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support |
*/ |
|
#include <linux/config.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/netdevice.h> |
#include <linux/skbuff.h> |
#include <linux/rtnetlink.h> |
#include <linux/init.h> |
#include <linux/proc_fs.h> |
#include <linux/kmod.h> |
|
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
#include <asm/processor.h> |
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
|
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, |
struct Qdisc *old, struct Qdisc *new); |
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, |
struct Qdisc *q, unsigned long cl, int event); |
|
/* |
|
Short review. |
------------- |
|
This file consists of two interrelated parts: |
|
1. queueing disciplines manager frontend. |
2. traffic classes manager frontend. |
|
Generally, a queueing discipline ("qdisc") is a black box |
that is able to enqueue packets and to dequeue them (when |
the device is ready to send something) in an order and at times |
determined by the algorithm hidden inside it. |
|
qdiscs are divided into two categories: |
- "queues", which have no internal structure visible from outside. |
- "schedulers", which split all the packets into "traffic classes", |
using "packet classifiers" (see cls_api.c). |
|
In turn, classes may have child qdiscs (as a rule, queues) |
attached to them, etc. etc. etc. |
|
The goal of the routines in this file is to translate |
the information supplied by the user in the form of handles |
into a form more intelligible to the kernel, to perform some sanity |
checks and the part of the work that is common to all qdiscs, |
and to provide rtnetlink notifications. |
|
All the really intelligent work is done inside the qdisc modules. |
|
|
|
Every discipline has two major routines: enqueue and dequeue. |
|
---dequeue |
|
dequeue usually returns an skb to send. It is allowed to return NULL, |
but that does not mean the queue is empty; it just means that the |
discipline does not want to send anything at this time. |
The queue is really empty only if q->q.qlen == 0. |
For complicated disciplines with multiple queues, q->q is not the |
real packet queue, but q->q.qlen must nevertheless be valid. |
|
---enqueue |
|
enqueue returns 0 if the packet was enqueued successfully. |
If a packet (this one or another one) was dropped, it returns a |
non-zero error code: |
NET_XMIT_DROP - this packet was dropped. |
Expected action: do not back off, but wait until the queue clears. |
NET_XMIT_CN - this packet was probably enqueued, but another one was dropped. |
Expected action: back off or ignore. |
NET_XMIT_POLICED - dropped by the policer. |
Expected action: back off or report an error to real-time apps. |
|
Auxiliary routines: |
|
---requeue |
|
requeues a packet that has already been dequeued once. It is used for |
non-standard or simply buggy devices, which can defer output even if |
dev->tbusy=0. |
|
---reset |
|
returns the qdisc to its initial state: purges all buffers, clears all |
timers, counters (except statistics), etc. |
|
---init |
|
initializes a newly created qdisc. |
|
---destroy |
|
destroys resources allocated by init and during the lifetime of the qdisc. |
|
---change |
|
changes qdisc parameters. |
*/ |
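|
/* An illustrative sketch (not from the original sources) of the enqueue and |
   dequeue conventions described above, in the shape of a trivial FIFO. |
   The function names are hypothetical; only helpers already used elsewhere |
   in this tree (__skb_queue_tail, __skb_dequeue, sch->stats) appear here. |
 */ |
#if 0	/* illustration only, not compiled */ |
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch) |
{ |
	__skb_queue_tail(&sch->q, skb);		/* keeps sch->q.qlen valid */ |
	sch->stats.packets++; |
	sch->stats.bytes += skb->len; |
	return 0;				/* enqueued successfully */ |
} |
|
static struct sk_buff *example_dequeue(struct Qdisc *sch) |
{ |
	/* NULL only means "nothing to send right now"; the queue is truly |
	   empty only when sch->q.qlen == 0. |
	 */ |
	return __skb_dequeue(&sch->q); |
} |
#endif |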
|
/* Protects list of registered TC modules. It is pure SMP lock. */ |
static rwlock_t qdisc_mod_lock = RW_LOCK_UNLOCKED; |
|
|
/************************************************ |
* Queueing disciplines manipulation. * |
************************************************/ |
|
|
/* The list of all installed queueing disciplines. */ |
|
static struct Qdisc_ops *qdisc_base = NULL; |
|
/* Register/unregister queueing discipline */ |
|
int register_qdisc(struct Qdisc_ops *qops) |
{ |
struct Qdisc_ops *q, **qp; |
|
write_lock(&qdisc_mod_lock); |
for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) { |
if (strcmp(qops->id, q->id) == 0) { |
write_unlock(&qdisc_mod_lock); |
return -EEXIST; |
} |
} |
|
if (qops->enqueue == NULL) |
qops->enqueue = noop_qdisc_ops.enqueue; |
if (qops->requeue == NULL) |
qops->requeue = noop_qdisc_ops.requeue; |
if (qops->dequeue == NULL) |
qops->dequeue = noop_qdisc_ops.dequeue; |
|
qops->next = NULL; |
*qp = qops; |
write_unlock(&qdisc_mod_lock); |
return 0; |
} |
|
int unregister_qdisc(struct Qdisc_ops *qops) |
{ |
struct Qdisc_ops *q, **qp; |
int err = -ENOENT; |
|
write_lock(&qdisc_mod_lock); |
for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) |
if (q == qops) |
break; |
if (q) { |
*qp = q->next; |
q->next = NULL; |
err = 0; |
} |
write_unlock(&qdisc_mod_lock); |
return err; |
} |
|
/* We know the handle. Find the qdisc among all qdiscs attached to the device |
   (root qdisc, all its children, children of children, etc.) |
 */ |
|
struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) |
{ |
struct Qdisc *q; |
|
for (q = dev->qdisc_list; q; q = q->next) { |
if (q->handle == handle) |
return q; |
} |
return NULL; |
} |
|
struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) |
{ |
unsigned long cl; |
struct Qdisc *leaf; |
struct Qdisc_class_ops *cops = p->ops->cl_ops; |
|
if (cops == NULL) |
return NULL; |
cl = cops->get(p, classid); |
|
if (cl == 0) |
return NULL; |
leaf = cops->leaf(p, cl); |
cops->put(p, cl); |
return leaf; |
} |
|
/* Find queueing discipline by name */ |
|
struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind) |
{ |
struct Qdisc_ops *q = NULL; |
|
if (kind) { |
read_lock(&qdisc_mod_lock); |
for (q = qdisc_base; q; q = q->next) { |
if (rtattr_strcmp(kind, q->id) == 0) |
break; |
} |
read_unlock(&qdisc_mod_lock); |
} |
return q; |
} |
|
static struct qdisc_rate_table *qdisc_rtab_list; |
|
struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab) |
{ |
struct qdisc_rate_table *rtab; |
|
for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { |
if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) { |
rtab->refcnt++; |
return rtab; |
} |
} |
|
if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024) |
return NULL; |
|
rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); |
if (rtab) { |
rtab->rate = *r; |
rtab->refcnt = 1; |
memcpy(rtab->data, RTA_DATA(tab), 1024); |
rtab->next = qdisc_rtab_list; |
qdisc_rtab_list = rtab; |
} |
return rtab; |
} |
|
void qdisc_put_rtab(struct qdisc_rate_table *tab) |
{ |
struct qdisc_rate_table *rtab, **rtabp; |
|
if (!tab || --tab->refcnt) |
return; |
|
for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) { |
if (rtab == tab) { |
*rtabp = rtab->next; |
kfree(rtab); |
return; |
} |
} |
} |
|
|
/* Allocate a unique handle from the space managed by the kernel */ |
|
u32 qdisc_alloc_handle(struct net_device *dev) |
{ |
int i = 0x10000; |
static u32 autohandle = TC_H_MAKE(0x80000000U, 0); |
|
do { |
autohandle += TC_H_MAKE(0x10000U, 0); |
if (autohandle == TC_H_MAKE(TC_H_ROOT, 0)) |
autohandle = TC_H_MAKE(0x80000000U, 0); |
} while (qdisc_lookup(dev, autohandle) && --i > 0); |
|
return i>0 ? autohandle : 0; |
} |
|
/* Attach toplevel qdisc to device dev */ |
|
static struct Qdisc * |
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc) |
{ |
struct Qdisc *oqdisc; |
|
if (dev->flags & IFF_UP) |
dev_deactivate(dev); |
|
write_lock(&qdisc_tree_lock); |
spin_lock_bh(&dev->queue_lock); |
if (qdisc && qdisc->flags&TCQ_F_INGRES) { |
oqdisc = dev->qdisc_ingress; |
/* Prune old scheduler */ |
if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) { |
/* delete */ |
qdisc_reset(oqdisc); |
dev->qdisc_ingress = NULL; |
} else { /* new */ |
dev->qdisc_ingress = qdisc; |
} |
|
} else { |
|
oqdisc = dev->qdisc_sleeping; |
|
/* Prune old scheduler */ |
if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) |
qdisc_reset(oqdisc); |
|
/* ... and graft new one */ |
if (qdisc == NULL) |
qdisc = &noop_qdisc; |
dev->qdisc_sleeping = qdisc; |
dev->qdisc = &noop_qdisc; |
} |
|
spin_unlock_bh(&dev->queue_lock); |
write_unlock(&qdisc_tree_lock); |
|
if (dev->flags & IFF_UP) |
dev_activate(dev); |
|
return oqdisc; |
} |
|
|
/* Graft qdisc "new" to class "classid" of qdisc "parent" or |
   to device "dev". |
|
   The old qdisc is not destroyed but is returned in *old. |
 */ |
|
int qdisc_graft(struct net_device *dev, struct Qdisc *parent, u32 classid, |
struct Qdisc *new, struct Qdisc **old) |
{ |
int err = 0; |
struct Qdisc *q = *old; |
|
|
if (parent == NULL) { |
if (q && q->flags&TCQ_F_INGRES) { |
*old = dev_graft_qdisc(dev, q); |
} else { |
*old = dev_graft_qdisc(dev, new); |
} |
} else { |
struct Qdisc_class_ops *cops = parent->ops->cl_ops; |
|
err = -EINVAL; |
|
if (cops) { |
unsigned long cl = cops->get(parent, classid); |
if (cl) { |
err = cops->graft(parent, cl, new, old); |
cops->put(parent, cl); |
} |
} |
} |
return err; |
} |
|
/* |
Allocate and initialize a new qdisc. |
|
Parameters are passed via opt. |
*/ |
|
static struct Qdisc * |
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) |
{ |
int err; |
struct rtattr *kind = tca[TCA_KIND-1]; |
struct Qdisc *sch = NULL; |
struct Qdisc_ops *ops; |
int size; |
|
ops = qdisc_lookup_ops(kind); |
#ifdef CONFIG_KMOD |
if (ops==NULL && tca[TCA_KIND-1] != NULL) { |
char module_name[4 + IFNAMSIZ + 1]; |
|
if (RTA_PAYLOAD(kind) <= IFNAMSIZ) { |
sprintf(module_name, "sch_%s", (char*)RTA_DATA(kind)); |
request_module (module_name); |
ops = qdisc_lookup_ops(kind); |
} |
} |
#endif |
|
err = -EINVAL; |
if (ops == NULL) |
goto err_out; |
|
size = sizeof(*sch) + ops->priv_size; |
|
sch = kmalloc(size, GFP_KERNEL); |
err = -ENOBUFS; |
if (!sch) |
goto err_out; |
|
/* Grrr... Resolve race condition with module unload */ |
|
err = -EINVAL; |
if (ops != qdisc_lookup_ops(kind)) |
goto err_out; |
|
memset(sch, 0, size); |
|
skb_queue_head_init(&sch->q); |
|
if (handle == TC_H_INGRESS) |
sch->flags |= TCQ_F_INGRES; |
|
sch->ops = ops; |
sch->enqueue = ops->enqueue; |
sch->dequeue = ops->dequeue; |
sch->dev = dev; |
atomic_set(&sch->refcnt, 1); |
sch->stats.lock = &dev->queue_lock; |
if (handle == 0) { |
handle = qdisc_alloc_handle(dev); |
err = -ENOMEM; |
if (handle == 0) |
goto err_out; |
} |
|
if (handle == TC_H_INGRESS) |
sch->handle =TC_H_MAKE(TC_H_INGRESS, 0); |
else |
sch->handle = handle; |
|
if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { |
write_lock(&qdisc_tree_lock); |
sch->next = dev->qdisc_list; |
dev->qdisc_list = sch; |
write_unlock(&qdisc_tree_lock); |
#ifdef CONFIG_NET_ESTIMATOR |
if (tca[TCA_RATE-1]) |
qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]); |
#endif |
return sch; |
} |
|
err_out: |
*errp = err; |
if (sch) |
kfree(sch); |
return NULL; |
} |
|
static int qdisc_change(struct Qdisc *sch, struct rtattr **tca) |
{ |
if (tca[TCA_OPTIONS-1]) { |
int err; |
|
if (sch->ops->change == NULL) |
return -EINVAL; |
err = sch->ops->change(sch, tca[TCA_OPTIONS-1]); |
if (err) |
return err; |
} |
#ifdef CONFIG_NET_ESTIMATOR |
if (tca[TCA_RATE-1]) { |
qdisc_kill_estimator(&sch->stats); |
qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]); |
} |
#endif |
return 0; |
} |
|
struct check_loop_arg |
{ |
struct qdisc_walker w; |
struct Qdisc *p; |
int depth; |
}; |
|
static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w); |
|
static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth) |
{ |
struct check_loop_arg arg; |
|
if (q->ops->cl_ops == NULL) |
return 0; |
|
arg.w.stop = arg.w.skip = arg.w.count = 0; |
arg.w.fn = check_loop_fn; |
arg.depth = depth; |
arg.p = p; |
q->ops->cl_ops->walk(q, &arg.w); |
return arg.w.stop ? -ELOOP : 0; |
} |
|
static int |
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) |
{ |
struct Qdisc *leaf; |
struct Qdisc_class_ops *cops = q->ops->cl_ops; |
struct check_loop_arg *arg = (struct check_loop_arg *)w; |
|
leaf = cops->leaf(q, cl); |
if (leaf) { |
if (leaf == arg->p || arg->depth > 7) |
return -ELOOP; |
return check_loop(leaf, arg->p, arg->depth + 1); |
} |
return 0; |
} |
|
/* |
* Delete/get qdisc. |
*/ |
|
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) |
{ |
struct tcmsg *tcm = NLMSG_DATA(n); |
struct rtattr **tca = arg; |
struct net_device *dev; |
u32 clid = tcm->tcm_parent; |
struct Qdisc *q = NULL; |
struct Qdisc *p = NULL; |
int err; |
|
if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) |
return -ENODEV; |
|
if (clid) { |
if (clid != TC_H_ROOT) { |
if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { |
if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) |
return -ENOENT; |
q = qdisc_leaf(p, clid); |
} else { /* ingress */ |
q = dev->qdisc_ingress; |
} |
} else { |
q = dev->qdisc_sleeping; |
} |
if (!q) |
return -ENOENT; |
|
if (tcm->tcm_handle && q->handle != tcm->tcm_handle) |
return -EINVAL; |
} else { |
if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) |
return -ENOENT; |
} |
|
if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) |
return -EINVAL; |
|
if (n->nlmsg_type == RTM_DELQDISC) { |
if (!clid) |
return -EINVAL; |
if (q->handle == 0) |
return -ENOENT; |
if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0) |
return err; |
if (q) { |
qdisc_notify(skb, n, clid, q, NULL); |
spin_lock_bh(&dev->queue_lock); |
qdisc_destroy(q); |
spin_unlock_bh(&dev->queue_lock); |
} |
} else { |
qdisc_notify(skb, n, clid, NULL, q); |
} |
return 0; |
} |
|
/* |
Create/change qdisc. |
*/ |
|
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) |
{ |
struct tcmsg *tcm = NLMSG_DATA(n); |
struct rtattr **tca = arg; |
struct net_device *dev; |
u32 clid = tcm->tcm_parent; |
struct Qdisc *q = NULL; |
struct Qdisc *p = NULL; |
int err; |
|
if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) |
return -ENODEV; |
|
if (clid) { |
if (clid != TC_H_ROOT) { |
if (clid != TC_H_INGRESS) { |
if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) |
return -ENOENT; |
q = qdisc_leaf(p, clid); |
} else { /*ingress */ |
q = dev->qdisc_ingress; |
} |
} else { |
q = dev->qdisc_sleeping; |
} |
|
		/* It may be the default qdisc; ignore it */ |
if (q && q->handle == 0) |
q = NULL; |
|
if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) { |
if (tcm->tcm_handle) { |
if (q && !(n->nlmsg_flags&NLM_F_REPLACE)) |
return -EEXIST; |
if (TC_H_MIN(tcm->tcm_handle)) |
return -EINVAL; |
if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) |
goto create_n_graft; |
if (n->nlmsg_flags&NLM_F_EXCL) |
return -EEXIST; |
if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) |
return -EINVAL; |
if (q == p || |
(p && check_loop(q, p, 0))) |
return -ELOOP; |
atomic_inc(&q->refcnt); |
goto graft; |
} else { |
if (q == NULL) |
goto create_n_graft; |
|
				/* This magic test requires explanation. |
				 * |
				 * We know that some child qdisc is already |
				 * attached to this parent and we have a choice: |
				 * either to change it or to create/graft a new one. |
				 * |
				 * 1. We are allowed to create/graft only |
				 * if CREATE and REPLACE flags are set. |
				 * |
				 * 2. If EXCL is set, the requestor wanted to say |
				 * that the qdisc tcm_handle is not expected |
				 * to exist, so we choose create/graft too. |
				 * |
				 * 3. The last case is when no flags are set. |
				 * Alas, it is a sort of hole in the API: we |
				 * cannot decide what to do unambiguously. |
				 * For now we select create/graft if the |
				 * user gave KIND and it does not match the existing one. |
				 */ |
if ((n->nlmsg_flags&NLM_F_CREATE) && |
(n->nlmsg_flags&NLM_F_REPLACE) && |
((n->nlmsg_flags&NLM_F_EXCL) || |
(tca[TCA_KIND-1] && |
rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)))) |
goto create_n_graft; |
} |
} |
} else { |
if (!tcm->tcm_handle) |
return -EINVAL; |
q = qdisc_lookup(dev, tcm->tcm_handle); |
} |
|
/* Change qdisc parameters */ |
if (q == NULL) |
return -ENOENT; |
if (n->nlmsg_flags&NLM_F_EXCL) |
return -EEXIST; |
if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) |
return -EINVAL; |
err = qdisc_change(q, tca); |
if (err == 0) |
qdisc_notify(skb, n, clid, NULL, q); |
return err; |
|
create_n_graft: |
if (!(n->nlmsg_flags&NLM_F_CREATE)) |
return -ENOENT; |
if (clid == TC_H_INGRESS) |
q = qdisc_create(dev, tcm->tcm_parent, tca, &err); |
else |
q = qdisc_create(dev, tcm->tcm_handle, tca, &err); |
if (q == NULL) |
return err; |
|
graft: |
if (1) { |
struct Qdisc *old_q = NULL; |
err = qdisc_graft(dev, p, clid, q, &old_q); |
if (err) { |
if (q) { |
spin_lock_bh(&dev->queue_lock); |
qdisc_destroy(q); |
spin_unlock_bh(&dev->queue_lock); |
} |
return err; |
} |
qdisc_notify(skb, n, clid, old_q, q); |
if (old_q) { |
spin_lock_bh(&dev->queue_lock); |
qdisc_destroy(old_q); |
spin_unlock_bh(&dev->queue_lock); |
} |
} |
return 0; |
} |
|
int qdisc_copy_stats(struct sk_buff *skb, struct tc_stats *st) |
{ |
spin_lock_bh(st->lock); |
RTA_PUT(skb, TCA_STATS, (char*)&st->lock - (char*)st, st); |
spin_unlock_bh(st->lock); |
return 0; |
|
rtattr_failure: |
spin_unlock_bh(st->lock); |
return -1; |
} |
|
|
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, |
u32 pid, u32 seq, unsigned flags, int event) |
{ |
struct tcmsg *tcm; |
struct nlmsghdr *nlh; |
unsigned char *b = skb->tail; |
|
nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); |
nlh->nlmsg_flags = flags; |
tcm = NLMSG_DATA(nlh); |
tcm->tcm_family = AF_UNSPEC; |
tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0; |
tcm->tcm_parent = clid; |
tcm->tcm_handle = q->handle; |
tcm->tcm_info = atomic_read(&q->refcnt); |
RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); |
if (q->ops->dump && q->ops->dump(q, skb) < 0) |
goto rtattr_failure; |
q->stats.qlen = q->q.qlen; |
if (qdisc_copy_stats(skb, &q->stats)) |
goto rtattr_failure; |
nlh->nlmsg_len = skb->tail - b; |
return skb->len; |
|
nlmsg_failure: |
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, |
u32 clid, struct Qdisc *old, struct Qdisc *new) |
{ |
struct sk_buff *skb; |
u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; |
|
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); |
if (!skb) |
return -ENOBUFS; |
|
if (old && old->handle) { |
if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0) |
goto err_out; |
} |
if (new) { |
if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) |
goto err_out; |
} |
|
if (skb->len) |
return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); |
|
err_out: |
kfree_skb(skb); |
return -EINVAL; |
} |
|
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) |
{ |
int idx, q_idx; |
int s_idx, s_q_idx; |
struct net_device *dev; |
struct Qdisc *q; |
|
s_idx = cb->args[0]; |
s_q_idx = q_idx = cb->args[1]; |
read_lock(&dev_base_lock); |
for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { |
if (idx < s_idx) |
continue; |
if (idx > s_idx) |
s_q_idx = 0; |
read_lock(&qdisc_tree_lock); |
for (q = dev->qdisc_list, q_idx = 0; q; |
q = q->next, q_idx++) { |
if (q_idx < s_q_idx) |
continue; |
if (tc_fill_qdisc(skb, q, 0, NETLINK_CB(cb->skb).pid, |
cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) { |
read_unlock(&qdisc_tree_lock); |
goto done; |
} |
} |
read_unlock(&qdisc_tree_lock); |
} |
|
done: |
read_unlock(&dev_base_lock); |
|
cb->args[0] = idx; |
cb->args[1] = q_idx; |
|
return skb->len; |
} |
|
|
|
/************************************************ |
* Traffic classes manipulation. * |
************************************************/ |
|
|
|
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) |
{ |
struct tcmsg *tcm = NLMSG_DATA(n); |
struct rtattr **tca = arg; |
struct net_device *dev; |
struct Qdisc *q = NULL; |
struct Qdisc_class_ops *cops; |
unsigned long cl = 0; |
unsigned long new_cl; |
u32 pid = tcm->tcm_parent; |
u32 clid = tcm->tcm_handle; |
u32 qid = TC_H_MAJ(clid); |
int err; |
|
if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) |
return -ENODEV; |
|
/* |
parent == TC_H_UNSPEC - unspecified parent. |
parent == TC_H_ROOT - class is root, which has no parent. |
parent == X:0 - parent is root class. |
parent == X:Y - parent is a node in hierarchy. |
parent == 0:Y - parent is X:Y, where X:0 is qdisc. |
|
handle == 0:0 - generate handle from kernel pool. |
handle == 0:Y - class is X:Y, where X:0 is qdisc. |
handle == X:Y - clear. |
handle == X:0 - root class. |
*/ |
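 |
	/* Illustrative example (not part of the original source): with |
	   tcm_parent = 1:0 (0x00010000) and tcm_handle = 0:3 (0x00000003), |
	   Step 1 below picks qid = 0x00010000 from the parent, and the |
	   class handle is later completed to TC_H_MAKE(qid, 3) = 1:3. */ |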
|
/* Step 1. Determine qdisc handle X:0 */ |
|
if (pid != TC_H_ROOT) { |
u32 qid1 = TC_H_MAJ(pid); |
|
if (qid && qid1) { |
/* If both majors are known, they must be identical. */ |
if (qid != qid1) |
return -EINVAL; |
} else if (qid1) { |
qid = qid1; |
} else if (qid == 0) |
qid = dev->qdisc_sleeping->handle; |
|
		/* Now qid is a genuine qdisc handle consistent |
		   with both parent and child. |
|
TC_H_MAJ(pid) still may be unspecified, complete it now. |
*/ |
if (pid) |
pid = TC_H_MAKE(qid, pid); |
} else { |
if (qid == 0) |
qid = dev->qdisc_sleeping->handle; |
} |
|
/* OK. Locate qdisc */ |
if ((q = qdisc_lookup(dev, qid)) == NULL) |
return -ENOENT; |
|
	/* And check that it supports classes */ |
cops = q->ops->cl_ops; |
if (cops == NULL) |
return -EINVAL; |
|
/* Now try to get class */ |
if (clid == 0) { |
if (pid == TC_H_ROOT) |
clid = qid; |
} else |
clid = TC_H_MAKE(qid, clid); |
|
if (clid) |
cl = cops->get(q, clid); |
|
if (cl == 0) { |
err = -ENOENT; |
if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE)) |
goto out; |
} else { |
switch (n->nlmsg_type) { |
case RTM_NEWTCLASS: |
err = -EEXIST; |
if (n->nlmsg_flags&NLM_F_EXCL) |
goto out; |
break; |
case RTM_DELTCLASS: |
err = cops->delete(q, cl); |
if (err == 0) |
tclass_notify(skb, n, q, cl, RTM_DELTCLASS); |
goto out; |
case RTM_GETTCLASS: |
err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS); |
goto out; |
default: |
err = -EINVAL; |
goto out; |
} |
} |
|
new_cl = cl; |
err = cops->change(q, clid, pid, tca, &new_cl); |
if (err == 0) |
tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS); |
|
out: |
if (cl) |
cops->put(q, cl); |
|
return err; |
} |
|
|
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, |
unsigned long cl, |
u32 pid, u32 seq, unsigned flags, int event) |
{ |
struct tcmsg *tcm; |
struct nlmsghdr *nlh; |
unsigned char *b = skb->tail; |
|
nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); |
nlh->nlmsg_flags = flags; |
tcm = NLMSG_DATA(nlh); |
tcm->tcm_family = AF_UNSPEC; |
tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0; |
tcm->tcm_parent = q->handle; |
tcm->tcm_handle = q->handle; |
tcm->tcm_info = 0; |
RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); |
if (q->ops->cl_ops->dump && q->ops->cl_ops->dump(q, cl, skb, tcm) < 0) |
goto rtattr_failure; |
nlh->nlmsg_len = skb->tail - b; |
return skb->len; |
|
nlmsg_failure: |
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, |
struct Qdisc *q, unsigned long cl, int event) |
{ |
struct sk_buff *skb; |
u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; |
|
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); |
if (!skb) |
return -ENOBUFS; |
|
if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) { |
kfree_skb(skb); |
return -EINVAL; |
} |
|
return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); |
} |
|
struct qdisc_dump_args |
{ |
struct qdisc_walker w; |
struct sk_buff *skb; |
struct netlink_callback *cb; |
}; |
|
static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) |
{ |
struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; |
|
return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid, |
a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS); |
} |
|
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) |
{ |
int t; |
int s_t; |
struct net_device *dev; |
struct Qdisc *q; |
struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); |
struct qdisc_dump_args arg; |
|
if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) |
return 0; |
if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) |
return 0; |
|
s_t = cb->args[0]; |
|
read_lock(&qdisc_tree_lock); |
for (q=dev->qdisc_list, t=0; q; q = q->next, t++) { |
if (t < s_t) continue; |
if (!q->ops->cl_ops) continue; |
if (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle) |
continue; |
if (t > s_t) |
memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); |
arg.w.fn = qdisc_class_dump; |
arg.skb = skb; |
arg.cb = cb; |
arg.w.stop = 0; |
arg.w.skip = cb->args[1]; |
arg.w.count = 0; |
q->ops->cl_ops->walk(q, &arg.w); |
cb->args[1] = arg.w.count; |
if (arg.w.stop) |
break; |
} |
read_unlock(&qdisc_tree_lock); |
|
cb->args[0] = t; |
|
dev_put(dev); |
return skb->len; |
} |
|
int psched_us_per_tick = 1; |
int psched_tick_per_us = 1; |
|
#ifdef CONFIG_PROC_FS |
static int psched_read_proc(char *buffer, char **start, off_t offset, |
int length, int *eof, void *data) |
{ |
int len; |
|
len = sprintf(buffer, "%08x %08x %08x %08x\n", |
psched_tick_per_us, psched_us_per_tick, |
1000000, HZ); |
|
len -= offset; |
|
if (len > length) |
len = length; |
if(len < 0) |
len = 0; |
|
*start = buffer + offset; |
*eof = 1; |
|
return len; |
} |
#endif |
|
#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY |
int psched_tod_diff(int delta_sec, int bound) |
{ |
int delta; |
|
if (bound <= 1000000 || delta_sec > (0x7FFFFFFF/1000000)-1) |
return bound; |
delta = delta_sec * 1000000; |
if (delta > bound) |
delta = bound; |
return delta; |
} |
#endif |
|
psched_time_t psched_time_base; |
|
#if PSCHED_CLOCK_SOURCE == PSCHED_CPU |
psched_tdiff_t psched_clock_per_hz; |
int psched_clock_scale; |
#endif |
|
#ifdef PSCHED_WATCHER |
PSCHED_WATCHER psched_time_mark; |
|
static void psched_tick(unsigned long); |
|
static struct timer_list psched_timer = |
{ function: psched_tick }; |
|
static void psched_tick(unsigned long dummy) |
{ |
#if PSCHED_CLOCK_SOURCE == PSCHED_CPU |
psched_time_t dummy_stamp; |
PSCHED_GET_TIME(dummy_stamp); |
/* It is OK up to 4GHz cpu */ |
psched_timer.expires = jiffies + 1*HZ; |
#else |
unsigned long now = jiffies; |
psched_time_base += ((u64)(now-psched_time_mark))<<PSCHED_JSCALE; |
psched_time_mark = now; |
psched_timer.expires = now + 60*60*HZ; |
#endif |
add_timer(&psched_timer); |
} |
#endif |
|
#if PSCHED_CLOCK_SOURCE == PSCHED_CPU |
int __init psched_calibrate_clock(void) |
{ |
psched_time_t stamp, stamp1; |
struct timeval tv, tv1; |
psched_tdiff_t delay; |
long rdelay; |
unsigned long stop; |
|
#ifdef PSCHED_WATCHER |
psched_tick(0); |
#endif |
stop = jiffies + HZ/10; |
PSCHED_GET_TIME(stamp); |
do_gettimeofday(&tv); |
while (time_before(jiffies, stop)) { |
barrier(); |
cpu_relax(); |
} |
PSCHED_GET_TIME(stamp1); |
do_gettimeofday(&tv1); |
|
delay = PSCHED_TDIFF(stamp1, stamp); |
rdelay = tv1.tv_usec - tv.tv_usec; |
rdelay += (tv1.tv_sec - tv.tv_sec)*1000000; |
if (rdelay > delay) |
return -1; |
delay /= rdelay; |
psched_tick_per_us = delay; |
while ((delay>>=1) != 0) |
psched_clock_scale++; |
psched_us_per_tick = 1<<psched_clock_scale; |
psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale; |
return 0; |
} |
#endif |
|
int __init pktsched_init(void) |
{ |
struct rtnetlink_link *link_p; |
|
#if PSCHED_CLOCK_SOURCE == PSCHED_CPU |
if (psched_calibrate_clock() < 0) |
return -1; |
#elif PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES |
psched_tick_per_us = HZ<<PSCHED_JSCALE; |
psched_us_per_tick = 1000000; |
#ifdef PSCHED_WATCHER |
psched_tick(0); |
#endif |
#endif |
|
link_p = rtnetlink_links[PF_UNSPEC]; |
|
	/* Set up rtnetlink links. It is done here to avoid |
	   exporting a large number of public symbols. |
	 */ |
|
if (link_p) { |
link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc; |
link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc; |
link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc; |
link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc; |
link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass; |
link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass; |
link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass; |
link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass; |
} |
|
#define INIT_QDISC(name) { \ |
extern struct Qdisc_ops name##_qdisc_ops; \ |
register_qdisc(& name##_qdisc_ops); \ |
} |
|
INIT_QDISC(pfifo); |
INIT_QDISC(bfifo); |
|
#ifdef CONFIG_NET_SCH_CBQ |
INIT_QDISC(cbq); |
#endif |
#ifdef CONFIG_NET_SCH_HTB |
INIT_QDISC(htb); |
#endif |
#ifdef CONFIG_NET_SCH_CSZ |
INIT_QDISC(csz); |
#endif |
#ifdef CONFIG_NET_SCH_HPFQ |
INIT_QDISC(hpfq); |
#endif |
#ifdef CONFIG_NET_SCH_HFSC |
INIT_QDISC(hfsc); |
#endif |
#ifdef CONFIG_NET_SCH_RED |
INIT_QDISC(red); |
#endif |
#ifdef CONFIG_NET_SCH_GRED |
INIT_QDISC(gred); |
#endif |
#ifdef CONFIG_NET_SCH_INGRESS |
INIT_QDISC(ingress); |
#endif |
#ifdef CONFIG_NET_SCH_DSMARK |
INIT_QDISC(dsmark); |
#endif |
#ifdef CONFIG_NET_SCH_SFQ |
INIT_QDISC(sfq); |
#endif |
#ifdef CONFIG_NET_SCH_TBF |
INIT_QDISC(tbf); |
#endif |
#ifdef CONFIG_NET_SCH_TEQL |
teql_init(); |
#endif |
#ifdef CONFIG_NET_SCH_PRIO |
INIT_QDISC(prio); |
#endif |
#ifdef CONFIG_NET_SCH_ATM |
INIT_QDISC(atm); |
#endif |
#ifdef CONFIG_NET_CLS |
tc_filter_init(); |
#endif |
|
#ifdef CONFIG_PROC_FS |
create_proc_read_entry("net/psched", 0, 0, psched_read_proc, NULL); |
#endif |
|
return 0; |
} |
/sch_prio.c
0,0 → 1,424
/* |
* net/sched/sch_prio.c Simple 3-band priority "scheduler". |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
* Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>: |
* Init -- EINVAL when opt undefined |
*/ |
|
#include <linux/config.h> |
#include <linux/module.h> |
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/if_ether.h> |
#include <linux/inet.h> |
#include <linux/netdevice.h> |
#include <linux/etherdevice.h> |
#include <linux/notifier.h> |
#include <net/ip.h> |
#include <net/route.h> |
#include <linux/skbuff.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
|
struct prio_sched_data |
{ |
int bands; |
struct tcf_proto *filter_list; |
u8 prio2band[TC_PRIO_MAX+1]; |
struct Qdisc *queues[TCQ_PRIO_BANDS]; |
}; |
|
|
static __inline__ unsigned prio_classify(struct sk_buff *skb, struct Qdisc *sch) |
{ |
struct prio_sched_data *q = (struct prio_sched_data *)sch->data; |
struct tcf_result res; |
u32 band; |
|
band = skb->priority; |
if (TC_H_MAJ(skb->priority) != sch->handle) { |
if (!q->filter_list || tc_classify(skb, q->filter_list, &res)) { |
if (TC_H_MAJ(band)) |
band = 0; |
return q->prio2band[band&TC_PRIO_MAX]; |
} |
band = res.classid; |
} |
band = TC_H_MIN(band) - 1; |
return band < q->bands ? band : q->prio2band[0]; |
} |
|
static int |
prio_enqueue(struct sk_buff *skb, struct Qdisc* sch) |
{ |
struct prio_sched_data *q = (struct prio_sched_data *)sch->data; |
struct Qdisc *qdisc; |
int ret; |
|
qdisc = q->queues[prio_classify(skb, sch)]; |
|
if ((ret = qdisc->enqueue(skb, qdisc)) == 0) { |
sch->stats.bytes += skb->len; |
sch->stats.packets++; |
sch->q.qlen++; |
return 0; |
} |
sch->stats.drops++; |
return ret; |
} |
|
|
static int |
prio_requeue(struct sk_buff *skb, struct Qdisc* sch) |
{ |
struct prio_sched_data *q = (struct prio_sched_data *)sch->data; |
struct Qdisc *qdisc; |
int ret; |
|
qdisc = q->queues[prio_classify(skb, sch)]; |
|
if ((ret = qdisc->ops->requeue(skb, qdisc)) == 0) { |
sch->q.qlen++; |
return 0; |
} |
sch->stats.drops++; |
return ret; |
} |
|
|
static struct sk_buff * |
prio_dequeue(struct Qdisc* sch) |
{ |
struct sk_buff *skb; |
struct prio_sched_data *q = (struct prio_sched_data *)sch->data; |
int prio; |
struct Qdisc *qdisc; |
|
for (prio = 0; prio < q->bands; prio++) { |
qdisc = q->queues[prio]; |
skb = qdisc->dequeue(qdisc); |
if (skb) { |
sch->q.qlen--; |
return skb; |
} |
} |
return NULL; |
|
} |
|
static unsigned int prio_drop(struct Qdisc* sch) |
{ |
struct prio_sched_data *q = (struct prio_sched_data *)sch->data; |
int prio; |
unsigned int len; |
struct Qdisc *qdisc; |
|
for (prio = q->bands-1; prio >= 0; prio--) { |
qdisc = q->queues[prio]; |
if ((len = qdisc->ops->drop(qdisc)) != 0) { |
sch->q.qlen--; |
return len; |
} |
} |
return 0; |
} |
|
|
static void |
prio_reset(struct Qdisc* sch) |
{ |
int prio; |
struct prio_sched_data *q = (struct prio_sched_data *)sch->data; |
|
for (prio=0; prio<q->bands; prio++) |
qdisc_reset(q->queues[prio]); |
sch->q.qlen = 0; |
} |
|
static void |
prio_destroy(struct Qdisc* sch) |
{ |
int prio; |
struct prio_sched_data *q = (struct prio_sched_data *)sch->data; |
struct tcf_proto *tp; |
|
while ((tp = q->filter_list) != NULL) { |
q->filter_list = tp->next; |
tcf_destroy(tp); |
} |
|
for (prio=0; prio<q->bands; prio++) { |
qdisc_destroy(q->queues[prio]); |
q->queues[prio] = &noop_qdisc; |
} |
MOD_DEC_USE_COUNT; |
} |
|
static int prio_tune(struct Qdisc *sch, struct rtattr *opt) |
{ |
struct prio_sched_data *q = (struct prio_sched_data *)sch->data; |
struct tc_prio_qopt *qopt = RTA_DATA(opt); |
int i; |
|
if (opt->rta_len < RTA_LENGTH(sizeof(*qopt))) |
return -EINVAL; |
if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2) |
return -EINVAL; |
|
for (i=0; i<=TC_PRIO_MAX; i++) { |
if (qopt->priomap[i] >= qopt->bands) |
return -EINVAL; |
} |
|
sch_tree_lock(sch); |
q->bands = qopt->bands; |
memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); |
|
for (i=q->bands; i<TCQ_PRIO_BANDS; i++) { |
struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc); |
if (child != &noop_qdisc) |
qdisc_destroy(child); |
} |
sch_tree_unlock(sch); |
|
for (i=0; i<=TC_PRIO_MAX; i++) { |
int band = q->prio2band[i]; |
if (q->queues[band] == &noop_qdisc) { |
struct Qdisc *child; |
child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); |
if (child) { |
sch_tree_lock(sch); |
child = xchg(&q->queues[band], child); |
|
if (child != &noop_qdisc) |
qdisc_destroy(child); |
sch_tree_unlock(sch); |
} |
} |
} |
return 0; |
} |
|
static int prio_init(struct Qdisc *sch, struct rtattr *opt) |
{ |
struct prio_sched_data *q = (struct prio_sched_data *)sch->data; |
int i; |
|
for (i=0; i<TCQ_PRIO_BANDS; i++) |
q->queues[i] = &noop_qdisc; |
|
if (opt == NULL) { |
return -EINVAL; |
} else { |
int err; |
|
if ((err= prio_tune(sch, opt)) != 0) |
return err; |
} |
MOD_INC_USE_COUNT; |
return 0; |
} |
|
static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) |
{ |
struct prio_sched_data *q = (struct prio_sched_data *)sch->data; |
unsigned char *b = skb->tail; |
struct tc_prio_qopt opt; |
|
opt.bands = q->bands; |
memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1); |
RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, |
struct Qdisc **old) |
{ |
struct prio_sched_data *q = (struct prio_sched_data *)sch->data; |
unsigned long band = arg - 1; |
|
if (band >= q->bands) |
return -EINVAL; |
|
if (new == NULL) |
new = &noop_qdisc; |
|
sch_tree_lock(sch); |
*old = q->queues[band]; |
q->queues[band] = new; |
sch->q.qlen -= (*old)->q.qlen; |
qdisc_reset(*old); |
sch_tree_unlock(sch); |
|
return 0; |
} |
|
static struct Qdisc * |
prio_leaf(struct Qdisc *sch, unsigned long arg) |
{ |
struct prio_sched_data *q = (struct prio_sched_data *)sch->data; |
unsigned long band = arg - 1; |
|
if (band >= q->bands) |
return NULL; |
|
return q->queues[band]; |
} |
|
static unsigned long prio_get(struct Qdisc *sch, u32 classid) |
{ |
struct prio_sched_data *q = (struct prio_sched_data *)sch->data; |
unsigned long band = TC_H_MIN(classid); |
|
if (band - 1 >= q->bands) |
return 0; |
return band; |
} |
|
static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid) |
{ |
return prio_get(sch, classid); |
} |
|
|
static void prio_put(struct Qdisc *q, unsigned long cl) |
{ |
return; |
} |
|
static int prio_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg) |
{ |
unsigned long cl = *arg; |
struct prio_sched_data *q = (struct prio_sched_data *)sch->data; |
|
if (cl - 1 > q->bands) |
return -ENOENT; |
return 0; |
} |
|
static int prio_delete(struct Qdisc *sch, unsigned long cl) |
{ |
struct prio_sched_data *q = (struct prio_sched_data *)sch->data; |
if (cl - 1 > q->bands) |
return -ENOENT; |
return 0; |
} |
|
|
static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, |
struct tcmsg *tcm) |
{ |
struct prio_sched_data *q = (struct prio_sched_data *)sch->data; |
|
if (cl - 1 > q->bands) |
return -ENOENT; |
tcm->tcm_handle |= TC_H_MIN(cl); |
if (q->queues[cl-1]) |
tcm->tcm_info = q->queues[cl-1]->handle; |
return 0; |
} |
|
static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) |
{ |
struct prio_sched_data *q = (struct prio_sched_data *)sch->data; |
int prio; |
|
if (arg->stop) |
return; |
|
for (prio = 0; prio < q->bands; prio++) { |
if (arg->count < arg->skip) { |
arg->count++; |
continue; |
} |
if (arg->fn(sch, prio+1, arg) < 0) { |
arg->stop = 1; |
break; |
} |
arg->count++; |
} |
} |
|
static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl) |
{ |
struct prio_sched_data *q = (struct prio_sched_data *)sch->data; |
|
if (cl) |
return NULL; |
return &q->filter_list; |
} |
|
static struct Qdisc_class_ops prio_class_ops = |
{ |
prio_graft, |
prio_leaf, |
|
prio_get, |
prio_put, |
prio_change, |
prio_delete, |
prio_walk, |
|
prio_find_tcf, |
prio_bind, |
prio_put, |
|
prio_dump_class, |
}; |
|
struct Qdisc_ops prio_qdisc_ops = |
{ |
NULL, |
&prio_class_ops, |
"prio", |
sizeof(struct prio_sched_data), |
|
prio_enqueue, |
prio_dequeue, |
prio_requeue, |
prio_drop, |
|
prio_init, |
prio_reset, |
prio_destroy, |
prio_tune, |
|
prio_dump, |
}; |
|
#ifdef MODULE |
|
int init_module(void) |
{ |
return register_qdisc(&prio_qdisc_ops); |
} |
|
void cleanup_module(void) |
{ |
unregister_qdisc(&prio_qdisc_ops); |
} |
|
#endif |
MODULE_LICENSE("GPL"); |
/estimator.c
0,0 → 1,197
/* |
* net/sched/estimator.c Simple rate estimator. |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
*/ |
|
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/netdevice.h> |
#include <linux/skbuff.h> |
#include <linux/rtnetlink.h> |
#include <linux/init.h> |
#include <linux/proc_fs.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
/* |
This code is NOT intended to be used for statistics collection, |
its purpose is to provide a base for statistical multiplexing |
for controlled load service. |
If you need only statistics, run a user level daemon which |
periodically reads byte counters. |
|
Unfortunately, rate estimation is not a very easy task. |
F.e. I did not find a simple way to estimate the current peak rate |
and even failed to formulate the problem 8)8) |
|
   So I preferred not to build an estimator into the scheduler, |
   but to run this task separately. |
   Ideally, it should be kernel thread(s), but for now it runs |
   from timers, which puts an apparent upper bound on the number of rated |
   flows, has minimal overhead when that number is small, and is enough |
   to handle the controlled load service and sets of aggregates. |
|
We measure rate over A=(1<<interval) seconds and evaluate EWMA: |
|
avrate = avrate*(1-W) + rate*W |
|
where W is chosen as negative power of 2: W = 2^(-ewma_log) |
|
The resulting time constant is: |
|
T = A/(-ln(1-W)) |
|
|
NOTES. |
|
* The stored value for avbps is scaled by 2^5, so that maximal |
rate is ~1Gbit, avpps is scaled by 2^10. |
|
* Minimal interval is HZ/4=250msec (it is the greatest common divisor |
for HZ=100 and HZ=1024 8)), maximal interval |
is (HZ/4)*2^EST_MAX_INTERVAL = 8sec. Shorter intervals |
are too expensive, longer ones can be implemented |
at user level painlessly. |
*/ |
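 |
/* |
 * Illustrative sketch (not part of the original file): a user-space model of |
 * the fixed-point EWMA update that est_timer() below performs.  The helper |
 * name and its parameters are assumptions made for illustration; the scaled |
 * kernel fields (avbps, avpps, ewma_log) are the authoritative versions. |
 */ |
#if 0 |
static unsigned long ewma_update(unsigned long avrate,	/* running average, scaled */ |
				 unsigned long rate,	/* rate measured this interval, same scaling */ |
				 int ewma_log)		/* W = 2^(-ewma_log) */ |
{ |
	/* avrate = avrate*(1-W) + rate*W, with W a negative power of two, |
	   reduces to one shift and one add in fixed point: */ |
	avrate += ((long)rate - (long)avrate) >> ewma_log; |
	return avrate; |
} |
#endif |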
|
#if (HZ%4) != 0 |
#error Bad HZ value. |
#endif |
|
#define EST_MAX_INTERVAL 5 |
|
struct qdisc_estimator |
{ |
struct qdisc_estimator *next; |
struct tc_stats *stats; |
unsigned interval; |
int ewma_log; |
u64 last_bytes; |
u32 last_packets; |
u32 avpps; |
u32 avbps; |
}; |
|
struct qdisc_estimator_head |
{ |
struct timer_list timer; |
struct qdisc_estimator *list; |
}; |
|
static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1]; |
|
/* Estimator array lock */ |
static rwlock_t est_lock = RW_LOCK_UNLOCKED; |
|
static void est_timer(unsigned long arg) |
{ |
int idx = (int)arg; |
struct qdisc_estimator *e; |
|
read_lock(&est_lock); |
for (e = elist[idx].list; e; e = e->next) { |
struct tc_stats *st = e->stats; |
u64 nbytes; |
u32 npackets; |
u32 rate; |
|
spin_lock(st->lock); |
nbytes = st->bytes; |
npackets = st->packets; |
rate = (nbytes - e->last_bytes)<<(7 - idx); |
e->last_bytes = nbytes; |
e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log; |
st->bps = (e->avbps+0xF)>>5; |
|
rate = (npackets - e->last_packets)<<(12 - idx); |
e->last_packets = npackets; |
e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log; |
e->stats->pps = (e->avpps+0x1FF)>>10; |
spin_unlock(st->lock); |
} |
|
mod_timer(&elist[idx].timer, jiffies + ((HZ/4)<<idx)); |
read_unlock(&est_lock); |
} |
|
int qdisc_new_estimator(struct tc_stats *stats, struct rtattr *opt) |
{ |
struct qdisc_estimator *est; |
struct tc_estimator *parm = RTA_DATA(opt); |
|
if (RTA_PAYLOAD(opt) < sizeof(*parm)) |
return -EINVAL; |
|
if (parm->interval < -2 || parm->interval > 3) |
return -EINVAL; |
|
est = kmalloc(sizeof(*est), GFP_KERNEL); |
if (est == NULL) |
return -ENOBUFS; |
|
memset(est, 0, sizeof(*est)); |
est->interval = parm->interval + 2; |
est->stats = stats; |
est->ewma_log = parm->ewma_log; |
est->last_bytes = stats->bytes; |
est->avbps = stats->bps<<5; |
est->last_packets = stats->packets; |
est->avpps = stats->pps<<10; |
|
est->next = elist[est->interval].list; |
if (est->next == NULL) { |
init_timer(&elist[est->interval].timer); |
elist[est->interval].timer.data = est->interval; |
elist[est->interval].timer.expires = jiffies + ((HZ/4)<<est->interval); |
elist[est->interval].timer.function = est_timer; |
add_timer(&elist[est->interval].timer); |
} |
write_lock_bh(&est_lock); |
elist[est->interval].list = est; |
write_unlock_bh(&est_lock); |
return 0; |
} |
|
void qdisc_kill_estimator(struct tc_stats *stats) |
{ |
int idx; |
struct qdisc_estimator *est, **pest; |
|
for (idx=0; idx <= EST_MAX_INTERVAL; idx++) { |
int killed = 0; |
pest = &elist[idx].list; |
while ((est=*pest) != NULL) { |
if (est->stats != stats) { |
pest = &est->next; |
continue; |
} |
|
write_lock_bh(&est_lock); |
*pest = est->next; |
write_unlock_bh(&est_lock); |
|
kfree(est); |
killed++; |
} |
if (killed && elist[idx].list == NULL) |
del_timer(&elist[idx].timer); |
} |
} |
|
/sch_red.c
0,0 → 1,481
/* |
* net/sched/sch_red.c Random Early Detection queue. |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
* |
* Changes: |
* J Hadi Salim <hadi@nortel.com> 980914: computation fixes |
* Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly. |
* J Hadi Salim <hadi@nortelnetworks.com> 980816: ECN support |
*/ |
|
#include <linux/config.h> |
#include <linux/module.h> |
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/if_ether.h> |
#include <linux/inet.h> |
#include <linux/netdevice.h> |
#include <linux/etherdevice.h> |
#include <linux/notifier.h> |
#include <net/ip.h> |
#include <net/route.h> |
#include <linux/skbuff.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
#include <net/inet_ecn.h> |
|
|
/* Random Early Detection (RED) algorithm. |
======================================= |
|
Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways |
for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking. |
|
This file codes a "divisionless" version of RED algorithm |
as written down in Fig.17 of the paper. |
|
Short description. |
------------------ |
|
When a new packet arrives we calculate the average queue length: |
|
avg = (1-W)*avg + W*current_queue_len, |
|
	W is the filter time constant (chosen as 2^(-Wlog)); it controls |
the inertia of the algorithm. To allow larger bursts, W should be |
decreased. |
|
if (avg > th_max) -> packet marked (dropped). |
if (avg < th_min) -> packet passes. |
if (th_min < avg < th_max) we calculate probability: |
|
Pb = max_P * (avg - th_min)/(th_max-th_min) |
|
and mark (drop) packet with this probability. |
Pb changes from 0 (at avg==th_min) to max_P (avg==th_max). |
	max_P should be small (not 1); usually 0.01..0.02 is a good value. |
 |
	max_P is chosen as a number such that max_P/(th_max-th_min) |
	is a negative power of two, so that the arithmetic contains |
	only shifts. |
|
|
Parameters, settable by user: |
----------------------------- |
|
limit - bytes (must be > qth_max + burst) |
|
Hard limit on queue length, should be chosen >qth_max |
to allow packet bursts. This parameter does not |
	affect the algorithm's behaviour and can be chosen |
	arbitrarily high (well, less than RAM size). |
	Really, this limit will never be reached |
if RED works correctly. |
|
qth_min - bytes (should be < qth_max/2) |
	qth_max - bytes (should be at least 2*qth_min and less than limit) |
Wlog - bits (<32) log(1/W). |
Plog - bits (<32) |
|
Plog is related to max_P by formula: |
|
max_P = (qth_max-qth_min)/2^Plog; |
|
F.e. if qth_max=128K and qth_min=32K, then Plog=22 |
corresponds to max_P=0.02 |
|
Scell_log |
Stab |
|
	Lookup table for log((1-W)^(t/t_ave)). |
|
|
NOTES: |
|
Upper bound on W. |
----------------- |
|
If you want to allow bursts of L packets of size S, |
you should choose W: |
|
L + 1 - th_min/S < (1-(1-W)^L)/W |
|
th_min/S = 32 th_min/S = 4 |
|
log(W) L |
-1 33 |
-2 35 |
-3 39 |
-4 46 |
-5 57 |
-6 75 |
-7 101 |
-8 135 |
-9 190 |
etc. |
*/ |
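 |
/* |
 * Illustrative sketch (not part of the original file): a user-space model of |
 * the divisionless marking test described above.  The function and parameter |
 * names are assumptions for illustration only; qave is assumed to carry its |
 * fixed point at Wlog, exactly as in red_enqueue() below. |
 */ |
#if 0 |
#include <stdlib.h> |
 |
static int red_would_mark(unsigned long qave,		/* average queue, fixed point at Wlog */ |
			  unsigned long qth_min,	/* min threshold, same scaling */ |
			  int Wlog, int Plog, |
			  int qcount)			/* packets since the last random draw */ |
{ |
	unsigned long qR = rand() & ((1UL << Plog) - 1);	/* uniform in 0..2^Plog-1 */ |
 |
	/* Floating point equivalent (with max_P = (qth_max-qth_min)/2^Plog): |
	   max_P*(qave - qth_min)/(qth_max - qth_min) >= rnd/qcount */ |
	return ((qave - qth_min) >> Wlog) * qcount >= qR; |
} |
#endif |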
|
struct red_sched_data |
{ |
/* Parameters */ |
u32 limit; /* HARD maximal queue length */ |
u32 qth_min; /* Min average length threshold: A scaled */ |
u32 qth_max; /* Max average length threshold: A scaled */ |
u32 Rmask; |
u32 Scell_max; |
unsigned char flags; |
char Wlog; /* log(W) */ |
char Plog; /* random number bits */ |
char Scell_log; |
u8 Stab[256]; |
|
/* Variables */ |
unsigned long qave; /* Average queue length: A scaled */ |
int qcount; /* Packets since last random number generation */ |
u32 qR; /* Cached random number */ |
|
psched_time_t qidlestart; /* Start of idle period */ |
struct tc_red_xstats st; |
}; |
|
static int red_ecn_mark(struct sk_buff *skb) |
{ |
if (skb->nh.raw + 20 > skb->tail) |
return 0; |
|
switch (skb->protocol) { |
case __constant_htons(ETH_P_IP): |
if (!INET_ECN_is_capable(skb->nh.iph->tos)) |
return 0; |
if (INET_ECN_is_not_ce(skb->nh.iph->tos)) |
IP_ECN_set_ce(skb->nh.iph); |
return 1; |
case __constant_htons(ETH_P_IPV6): |
if (!INET_ECN_is_capable(ip6_get_dsfield(skb->nh.ipv6h))) |
return 0; |
IP6_ECN_set_ce(skb->nh.ipv6h); |
return 1; |
default: |
return 0; |
} |
} |
|
static int |
red_enqueue(struct sk_buff *skb, struct Qdisc* sch) |
{ |
struct red_sched_data *q = (struct red_sched_data *)sch->data; |
|
psched_time_t now; |
|
if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { |
long us_idle; |
int shift; |
|
PSCHED_GET_TIME(now); |
us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max, 0); |
PSCHED_SET_PASTPERFECT(q->qidlestart); |
|
/* |
		   The problem: ideally, average queue length recalculation should |
		   be done over constant clock intervals. This is too expensive, so |
		   the calculation is driven by outgoing packets. |
When the queue is idle we have to model this clock by hand. |
|
SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth) |
dummy packets as a burst after idle time, i.e. |
|
q->qave *= (1-W)^m |
|
		   This is an apparently overcomplicated solution (f.e. we have to precompute |
		   a table to make this calculation in reasonable time). |
		   I believe that a simpler model may be used here, |
		   but it is a field for experiments. |
*/ |
shift = q->Stab[us_idle>>q->Scell_log]; |
|
if (shift) { |
q->qave >>= shift; |
} else { |
/* Approximate initial part of exponent |
with linear function: |
(1-W)^m ~= 1-mW + ... |
|
			   Seems it is the best solution to |
			   the problem of too coarse exponent tabulation. |
*/ |
|
us_idle = (q->qave * us_idle)>>q->Scell_log; |
if (us_idle < q->qave/2) |
q->qave -= us_idle; |
else |
q->qave >>= 1; |
} |
} else { |
q->qave += sch->stats.backlog - (q->qave >> q->Wlog); |
/* NOTE: |
		   q->qave is a fixed point number with the point at Wlog. |
		   The formula above is equivalent to the floating point |
		   version: |
|
qave = qave*(1-W) + sch->stats.backlog*W; |
--ANK (980924) |
*/ |
} |
|
if (q->qave < q->qth_min) { |
q->qcount = -1; |
enqueue: |
if (sch->stats.backlog + skb->len <= q->limit) { |
__skb_queue_tail(&sch->q, skb); |
sch->stats.backlog += skb->len; |
sch->stats.bytes += skb->len; |
sch->stats.packets++; |
return NET_XMIT_SUCCESS; |
} else { |
q->st.pdrop++; |
} |
kfree_skb(skb); |
sch->stats.drops++; |
return NET_XMIT_DROP; |
} |
if (q->qave >= q->qth_max) { |
q->qcount = -1; |
sch->stats.overlimits++; |
mark: |
if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) { |
q->st.early++; |
goto drop; |
} |
q->st.marked++; |
goto enqueue; |
} |
|
if (++q->qcount) { |
		/* The formula used below raises questions. |
 |
		   OK. qR is a random number in the interval 0..Rmask, |
		   i.e. 0..(2^Plog - 1). If we used floating point |
		   arithmetic, it would be (2^Plog)*rnd_num, |
		   where rnd_num is less than 1. |
 |
		   Taking into account that qave has its fixed |
		   point at Wlog, and that Plog is related to max_P by |
		   max_P = (qth_max-qth_min)/2^Plog, the two lines |
		   below have the following floating point equivalent: |
|
max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount |
|
Any questions? --ANK (980924) |
*/ |
if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR) |
goto enqueue; |
q->qcount = 0; |
q->qR = net_random()&q->Rmask; |
sch->stats.overlimits++; |
goto mark; |
} |
q->qR = net_random()&q->Rmask; |
goto enqueue; |
|
drop: |
kfree_skb(skb); |
sch->stats.drops++; |
return NET_XMIT_CN; |
} |
|
static int |
red_requeue(struct sk_buff *skb, struct Qdisc* sch) |
{ |
struct red_sched_data *q = (struct red_sched_data *)sch->data; |
|
PSCHED_SET_PASTPERFECT(q->qidlestart); |
|
__skb_queue_head(&sch->q, skb); |
sch->stats.backlog += skb->len; |
return 0; |
} |
|
static struct sk_buff * |
red_dequeue(struct Qdisc* sch) |
{ |
struct sk_buff *skb; |
struct red_sched_data *q = (struct red_sched_data *)sch->data; |
|
skb = __skb_dequeue(&sch->q); |
if (skb) { |
sch->stats.backlog -= skb->len; |
return skb; |
} |
PSCHED_GET_TIME(q->qidlestart); |
return NULL; |
} |
|
static unsigned int red_drop(struct Qdisc* sch) |
{ |
struct sk_buff *skb; |
struct red_sched_data *q = (struct red_sched_data *)sch->data; |
|
skb = __skb_dequeue_tail(&sch->q); |
if (skb) { |
unsigned int len = skb->len; |
sch->stats.backlog -= len; |
sch->stats.drops++; |
q->st.other++; |
kfree_skb(skb); |
return len; |
} |
PSCHED_GET_TIME(q->qidlestart); |
return 0; |
} |
|
static void red_reset(struct Qdisc* sch) |
{ |
struct red_sched_data *q = (struct red_sched_data *)sch->data; |
|
__skb_queue_purge(&sch->q); |
sch->stats.backlog = 0; |
PSCHED_SET_PASTPERFECT(q->qidlestart); |
q->qave = 0; |
q->qcount = -1; |
} |
|
static int red_change(struct Qdisc *sch, struct rtattr *opt) |
{ |
struct red_sched_data *q = (struct red_sched_data *)sch->data; |
struct rtattr *tb[TCA_RED_STAB]; |
struct tc_red_qopt *ctl; |
|
if (opt == NULL || |
rtattr_parse(tb, TCA_RED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) || |
tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 || |
RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) || |
RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256) |
return -EINVAL; |
|
ctl = RTA_DATA(tb[TCA_RED_PARMS-1]); |
|
sch_tree_lock(sch); |
q->flags = ctl->flags; |
q->Wlog = ctl->Wlog; |
q->Plog = ctl->Plog; |
q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; |
q->Scell_log = ctl->Scell_log; |
q->Scell_max = (255<<q->Scell_log); |
q->qth_min = ctl->qth_min<<ctl->Wlog; |
q->qth_max = ctl->qth_max<<ctl->Wlog; |
q->limit = ctl->limit; |
memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256); |
|
q->qcount = -1; |
if (skb_queue_len(&sch->q) == 0) |
PSCHED_SET_PASTPERFECT(q->qidlestart); |
sch_tree_unlock(sch); |
return 0; |
} |
|
static int red_init(struct Qdisc* sch, struct rtattr *opt) |
{ |
int err; |
|
MOD_INC_USE_COUNT; |
|
if ((err = red_change(sch, opt)) != 0) { |
MOD_DEC_USE_COUNT; |
} |
return err; |
} |
|
|
int red_copy_xstats(struct sk_buff *skb, struct tc_red_xstats *st) |
{ |
RTA_PUT(skb, TCA_XSTATS, sizeof(*st), st); |
return 0; |
|
rtattr_failure: |
return 1; |
} |
|
static int red_dump(struct Qdisc *sch, struct sk_buff *skb) |
{ |
struct red_sched_data *q = (struct red_sched_data *)sch->data; |
unsigned char *b = skb->tail; |
struct rtattr *rta; |
struct tc_red_qopt opt; |
|
rta = (struct rtattr*)b; |
RTA_PUT(skb, TCA_OPTIONS, 0, NULL); |
opt.limit = q->limit; |
opt.qth_min = q->qth_min>>q->Wlog; |
opt.qth_max = q->qth_max>>q->Wlog; |
opt.Wlog = q->Wlog; |
opt.Plog = q->Plog; |
opt.Scell_log = q->Scell_log; |
opt.flags = q->flags; |
RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); |
rta->rta_len = skb->tail - b; |
|
if (red_copy_xstats(skb, &q->st)) |
goto rtattr_failure; |
|
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static void red_destroy(struct Qdisc *sch) |
{ |
MOD_DEC_USE_COUNT; |
} |
|
struct Qdisc_ops red_qdisc_ops = |
{ |
NULL, |
NULL, |
"red", |
sizeof(struct red_sched_data), |
|
red_enqueue, |
red_dequeue, |
red_requeue, |
red_drop, |
|
red_init, |
red_reset, |
red_destroy, |
red_change, |
|
red_dump, |
}; |
|
|
#ifdef MODULE |
int init_module(void) |
{ |
return register_qdisc(&red_qdisc_ops); |
} |
|
void cleanup_module(void) |
{ |
unregister_qdisc(&red_qdisc_ops); |
} |
#endif |
MODULE_LICENSE("GPL"); |
/sch_ingress.c
0,0 → 1,386
/* net/sched/sch_ingress.c - Ingress qdisc |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Jamal Hadi Salim 1999 |
*/ |
|
#include <linux/config.h> |
#include <linux/module.h> |
#include <linux/types.h> |
#include <linux/skbuff.h> |
#include <linux/netdevice.h> |
#include <linux/rtnetlink.h> |
#include <linux/netfilter_ipv4.h> |
#include <linux/netfilter.h> |
#include <linux/smp.h> |
#include <net/pkt_sched.h> |
#include <asm/byteorder.h> |
#include <asm/uaccess.h> |
#include <linux/kmod.h> |
#include <linux/stat.h> |
#include <linux/interrupt.h> |
#include <linux/list.h> |
|
|
#undef DEBUG_INGRESS |
|
#ifdef DEBUG_INGRESS /* control */ |
#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) |
#else |
#define DPRINTK(format,args...) |
#endif |
|
#if 0 /* data */ |
#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) |
#else |
#define D2PRINTK(format,args...) |
#endif |
|
|
#define PRIV(sch) ((struct ingress_qdisc_data *) (sch)->data) |
|
|
/* Thanks to Doron Oz for this hack |
*/ |
static int nf_registered = 0; |
|
struct ingress_qdisc_data { |
struct Qdisc *q; |
struct tcf_proto *filter_list; |
}; |
|
|
/* ------------------------- Class/flow operations ------------------------- */ |
|
|
static int ingress_graft(struct Qdisc *sch,unsigned long arg, |
struct Qdisc *new,struct Qdisc **old) |
{ |
#ifdef DEBUG_INGRESS |
struct ingress_qdisc_data *p = PRIV(sch); |
#endif |
|
DPRINTK("ingress_graft(sch %p,[qdisc %p],new %p,old %p)\n", |
sch, p, new, old); |
DPRINTK("\n ingress_graft: You cannot add qdiscs to classes"); |
return 1; |
} |
|
|
static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) |
{ |
return NULL; |
} |
|
|
static unsigned long ingress_get(struct Qdisc *sch,u32 classid) |
{ |
#ifdef DEBUG_INGRESS |
struct ingress_qdisc_data *p = PRIV(sch); |
#endif |
DPRINTK("ingress_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid); |
return TC_H_MIN(classid) + 1; |
} |
|
|
static unsigned long ingress_bind_filter(struct Qdisc *sch, |
unsigned long parent, u32 classid) |
{ |
return ingress_get(sch, classid); |
} |
|
|
static void ingress_put(struct Qdisc *sch, unsigned long cl) |
{ |
} |
|
|
static int ingress_change(struct Qdisc *sch, u32 classid, u32 parent, |
struct rtattr **tca, unsigned long *arg) |
{ |
#ifdef DEBUG_INGRESS |
struct ingress_qdisc_data *p = PRIV(sch); |
#endif |
DPRINTK("ingress_change(sch %p,[qdisc %p],classid %x,parent %x)," |
"arg 0x%lx\n", sch, p, classid, parent, *arg); |
DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment"); |
return 0; |
} |
|
|
|
static void ingress_walk(struct Qdisc *sch,struct qdisc_walker *walker) |
{ |
#ifdef DEBUG_INGRESS |
struct ingress_qdisc_data *p = PRIV(sch); |
#endif |
DPRINTK("ingress_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); |
DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment"); |
} |
|
|
static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch,unsigned long cl) |
{ |
struct ingress_qdisc_data *p = PRIV(sch); |
|
return &p->filter_list; |
} |
|
|
/* --------------------------- Qdisc operations ---------------------------- */ |
|
|
static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch) |
{ |
struct ingress_qdisc_data *p = PRIV(sch); |
struct tcf_result res; |
int result; |
|
D2PRINTK("ingress_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); |
result = tc_classify(skb, p->filter_list, &res); |
D2PRINTK("result %d class 0x%04x\n", result, res.classid); |
/* |
	 * Unlike normal "enqueue" functions, ingress_enqueue returns a |
	 * netfilter verdict (NF_DROP / NF_ACCEPT) rather than NET_XMIT_*. |
*/ |
#ifdef CONFIG_NET_CLS_POLICE |
switch (result) { |
case TC_POLICE_SHOT: |
result = NF_DROP; |
sch->stats.drops++; |
break; |
case TC_POLICE_RECLASSIFY: /* DSCP remarking here ? */ |
case TC_POLICE_OK: |
case TC_POLICE_UNSPEC: |
default: |
sch->stats.packets++; |
sch->stats.bytes += skb->len; |
result = NF_ACCEPT; |
break; |
}; |
#else |
sch->stats.packets++; |
sch->stats.bytes += skb->len; |
#endif |
|
skb->tc_index = TC_H_MIN(res.classid); |
return result; |
} |
|
|
static struct sk_buff *ingress_dequeue(struct Qdisc *sch) |
{ |
/* |
struct ingress_qdisc_data *p = PRIV(sch); |
D2PRINTK("ingress_dequeue(sch %p,[qdisc %p])\n",sch,PRIV(p)); |
*/ |
return NULL; |
} |
|
|
static int ingress_requeue(struct sk_buff *skb,struct Qdisc *sch) |
{ |
/* |
struct ingress_qdisc_data *p = PRIV(sch); |
D2PRINTK("ingress_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,PRIV(p)); |
*/ |
return 0; |
} |
|
static unsigned int ingress_drop(struct Qdisc *sch) |
{ |
#ifdef DEBUG_INGRESS |
struct ingress_qdisc_data *p = PRIV(sch); |
#endif |
DPRINTK("ingress_drop(sch %p,[qdisc %p])\n", sch, p); |
return 0; |
} |
|
static unsigned int |
ing_hook(unsigned int hook, struct sk_buff **pskb, |
const struct net_device *indev, |
const struct net_device *outdev, |
int (*okfn)(struct sk_buff *)) |
{ |
|
struct Qdisc *q; |
struct sk_buff *skb = *pskb; |
struct net_device *dev = skb->dev; |
int fwres=NF_ACCEPT; |
|
DPRINTK("ing_hook: skb %s dev=%s len=%u\n", |
skb->sk ? "(owned)" : "(unowned)", |
skb->dev ? (*pskb)->dev->name : "(no dev)", |
skb->len); |
|
/* |
	revisit later: Use a private lock, since dev->queue_lock is also |
	used on the egress path (might slow things for an iota) |
*/ |
|
if (dev->qdisc_ingress) { |
spin_lock(&dev->queue_lock); |
if ((q = dev->qdisc_ingress) != NULL) |
fwres = q->enqueue(skb, q); |
spin_unlock(&dev->queue_lock); |
} |
|
return fwres; |
} |
|
/* after ipt_filter */ |
static struct nf_hook_ops ing_ops = |
{ |
{ NULL, NULL}, |
ing_hook, |
PF_INET, |
NF_IP_PRE_ROUTING, |
NF_IP_PRI_FILTER + 1 |
}; |
|
int ingress_init(struct Qdisc *sch,struct rtattr *opt) |
{ |
struct ingress_qdisc_data *p = PRIV(sch); |
|
if (!nf_registered) { |
if (nf_register_hook(&ing_ops) < 0) { |
printk("ingress qdisc registration error \n"); |
goto error; |
} |
nf_registered++; |
} |
|
DPRINTK("ingress_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); |
memset(p, 0, sizeof(*p)); |
p->filter_list = NULL; |
p->q = &noop_qdisc; |
MOD_INC_USE_COUNT; |
return 0; |
error: |
return -EINVAL; |
} |
|
|
static void ingress_reset(struct Qdisc *sch) |
{ |
struct ingress_qdisc_data *p = PRIV(sch); |
|
DPRINTK("ingress_reset(sch %p,[qdisc %p])\n", sch, p); |
|
/* |
#if 0 |
*/ |
/* for future use */ |
qdisc_reset(p->q); |
/* |
#endif |
*/ |
} |
|
/* ------------------------------------------------------------- */ |
|
|
/* ------------------------------------------------------------- */ |
|
static void ingress_destroy(struct Qdisc *sch) |
{ |
struct ingress_qdisc_data *p = PRIV(sch); |
struct tcf_proto *tp; |
|
DPRINTK("ingress_destroy(sch %p,[qdisc %p])\n", sch, p); |
while (p->filter_list) { |
tp = p->filter_list; |
p->filter_list = tp->next; |
tcf_destroy(tp); |
} |
memset(p, 0, sizeof(*p)); |
p->filter_list = NULL; |
|
#if 0 |
/* for future use */ |
qdisc_destroy(p->q); |
#endif |
|
MOD_DEC_USE_COUNT; |
|
} |
|
|
static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) |
{ |
unsigned char *b = skb->tail; |
struct rtattr *rta; |
|
rta = (struct rtattr *) b; |
RTA_PUT(skb, TCA_OPTIONS, 0, NULL); |
rta->rta_len = skb->tail - b; |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static struct Qdisc_class_ops ingress_class_ops = |
{ |
ingress_graft, /* graft */ |
ingress_leaf, /* leaf */ |
ingress_get, /* get */ |
ingress_put, /* put */ |
ingress_change, /* change */ |
NULL, /* delete */ |
ingress_walk, /* walk */ |
|
ingress_find_tcf, /* tcf_chain */ |
ingress_bind_filter, /* bind_tcf */ |
ingress_put, /* unbind_tcf */ |
|
NULL, /* dump */ |
}; |
|
struct Qdisc_ops ingress_qdisc_ops = |
{ |
NULL, /* next */ |
&ingress_class_ops, /* cl_ops */ |
"ingress", |
sizeof(struct ingress_qdisc_data), |
|
ingress_enqueue, /* enqueue */ |
ingress_dequeue, /* dequeue */ |
ingress_requeue, /* requeue */ |
ingress_drop, /* drop */ |
|
ingress_init, /* init */ |
ingress_reset, /* reset */ |
ingress_destroy, /* destroy */ |
NULL, /* change */ |
|
ingress_dump, /* dump */ |
}; |
|
|
#ifdef MODULE |
int init_module(void) |
{ |
int ret = 0; |
|
if ((ret = register_qdisc(&ingress_qdisc_ops)) < 0) { |
printk("Unable to register Ingress qdisc\n"); |
return ret; |
} |
|
return ret; |
} |
|
|
void cleanup_module(void) |
{ |
unregister_qdisc(&ingress_qdisc_ops); |
if (nf_registered) |
nf_unregister_hook(&ing_ops); |
} |
#endif |
MODULE_LICENSE("GPL"); |
/sch_tbf.c
0,0 → 1,550
/* |
* net/sched/sch_tbf.c Token Bucket Filter queue. |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
* Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs - |
* original idea by Martin Devera |
* |
*/ |
|
#include <linux/config.h> |
#include <linux/module.h> |
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/if_ether.h> |
#include <linux/inet.h> |
#include <linux/netdevice.h> |
#include <linux/etherdevice.h> |
#include <linux/notifier.h> |
#include <net/ip.h> |
#include <net/route.h> |
#include <linux/skbuff.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
|
/* Simple Token Bucket Filter. |
======================================= |
|
SOURCE. |
------- |
|
None. |
|
Description. |
------------ |
|
A data flow obeys TBF with rate R and depth B, if for any |
time interval t_i...t_f the number of transmitted bits |
does not exceed B + R*(t_f-t_i). |
|
Packetized version of this definition: |
The sequence of packets of sizes s_i served at moments t_i |
obeys TBF, if for any i<=k: |
|
s_i+....+s_k <= B + R*(t_k - t_i) |
|
Algorithm. |
---------- |
|
Let N(t_i) be B/R initially and N(t) grow continuously with time as: |
|
N(t+delta) = min{B/R, N(t) + delta} |
|
If the first packet in queue has length S, it may be |
transmitted only at the time t_* when S/R <= N(t_*), |
and in this case N(t) jumps: |
|
N(t_* + 0) = N(t_* - 0) - S/R. |
|
|
|
Actually, QoS requires two TBF to be applied to a data stream. |
One of them controls steady state burst size, another |
one with rate P (peak rate) and depth M (equal to link MTU) |
limits bursts at a smaller time scale. |
|
It is easy to see that P>R, and B>M. If P is infinity, this double |
TBF is equivalent to a single one. |
|
When TBF works in reshaping mode, latency is estimated as: |
|
lat = max ((L-B)/R, (L-M)/P) |
|
|
NOTES. |
------ |
|
If TBF throttles, it starts a watchdog timer, which will wake it up |
when it is ready to transmit. |
Note that the minimal timer resolution is 1/HZ. |
If no new packets arrive during this period, |
	or if the device is not awakened by EOI for some previous packet, |
TBF can stop its activity for 1/HZ. |
|
|
	This means that with depth B, the maximal rate is |
|
R_crit = B*HZ |
|
F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes. |
|
	Note that the peak rate TBF is much tougher: with MTU 1500 |
P_crit = 150Kbytes/sec. So, if you need greater peak |
rates, use alpha with HZ=1000 :-) |
|
With classful TBF, limit is just kept for backwards compatibility. |
It is passed to the default bfifo qdisc - if the inner qdisc is |
changed the limit is not effective anymore. |
*/ |
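 |
/* |
 * Illustrative sketch (not part of the original file): a minimal user-space |
 * model of the double token bucket test performed by tbf_dequeue() below. |
 * The structure and all names here are assumptions for illustration; tokens |
 * are kept, as in the real code, in units of transmission time. |
 */ |
#if 0 |
struct tbf_model { |
	long tokens;	/* current burst-bucket tokens (time units) */ |
	long ptokens;	/* current peak-bucket tokens (time units) */ |
	long buffer;	/* burst bucket depth, B/R */ |
	long mtu;	/* peak bucket depth, M/P */ |
	int use_peak;	/* is the second (peak) bucket configured? */ |
}; |
 |
/* elapsed: time since the last conformant packet; cost/pcost: time needed to |
   send this packet at rate R and at peak rate P (L2T()/L2T_P() in the real |
   code).  Returns 1 if the packet conforms and may be sent now. */ |
static int tbf_conforms(struct tbf_model *t, long elapsed, long cost, long pcost) |
{ |
	long toks = t->tokens + elapsed; |
	long ptoks = 0; |
 |
	if (toks > t->buffer) |
		toks = t->buffer; |
	toks -= cost; |
 |
	if (t->use_peak) { |
		ptoks = t->ptokens + elapsed; |
		if (ptoks > t->mtu) |
			ptoks = t->mtu; |
		ptoks -= pcost; |
	} |
 |
	if ((toks | ptoks) >= 0) {	/* both buckets stay non-negative */ |
		t->tokens = toks; |
		t->ptokens = ptoks; |
		return 1; |
	} |
	return 0;			/* otherwise throttle until tokens accrue */ |
} |
#endif |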
|
struct tbf_sched_data |
{ |
/* Parameters */ |
u32 limit; /* Maximal length of backlog: bytes */ |
u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ |
u32 mtu; |
u32 max_size; |
struct qdisc_rate_table *R_tab; |
struct qdisc_rate_table *P_tab; |
|
/* Variables */ |
long tokens; /* Current number of B tokens */ |
long ptokens; /* Current number of P tokens */ |
psched_time_t t_c; /* Time check-point */ |
struct timer_list wd_timer; /* Watchdog timer */ |
struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */ |
}; |
|
#define L2T(q,L) ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log]) |
#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log]) |
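/*
 * Editor's note: the L2T/L2T_P macros above turn a packet length L into
 * its token cost (transmission time in scheduler ticks) by indexing the
 * rate table with L >> cell_log; the 256-entry tables themselves are
 * built by the tc tool and handed over via qdisc_get_rtab() in
 * tbf_change() below.
 */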
|
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) |
{ |
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; |
int ret; |
|
if (skb->len > q->max_size) { |
sch->stats.drops++; |
#ifdef CONFIG_NET_CLS_POLICE |
if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch)) |
#endif |
kfree_skb(skb); |
|
return NET_XMIT_DROP; |
} |
|
if ((ret = q->qdisc->enqueue(skb, q->qdisc)) != 0) { |
sch->stats.drops++; |
return ret; |
} |
|
sch->q.qlen++; |
sch->stats.bytes += skb->len; |
sch->stats.packets++; |
return 0; |
} |
|
static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch) |
{ |
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; |
int ret; |
|
if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) |
sch->q.qlen++; |
|
return ret; |
} |
|
static unsigned int tbf_drop(struct Qdisc* sch) |
{ |
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; |
unsigned int len; |
|
if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) { |
sch->q.qlen--; |
sch->stats.drops++; |
} |
return len; |
} |
|
static void tbf_watchdog(unsigned long arg) |
{ |
struct Qdisc *sch = (struct Qdisc*)arg; |
|
sch->flags &= ~TCQ_F_THROTTLED; |
netif_schedule(sch->dev); |
} |
|
static struct sk_buff *tbf_dequeue(struct Qdisc* sch) |
{ |
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; |
struct sk_buff *skb; |
|
skb = q->qdisc->dequeue(q->qdisc); |
|
if (skb) { |
psched_time_t now; |
long toks; |
long ptoks = 0; |
unsigned int len = skb->len; |
|
PSCHED_GET_TIME(now); |
|
toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer, 0); |
|
if (q->P_tab) { |
ptoks = toks + q->ptokens; |
if (ptoks > (long)q->mtu) |
ptoks = q->mtu; |
ptoks -= L2T_P(q, len); |
} |
toks += q->tokens; |
if (toks > (long)q->buffer) |
toks = q->buffer; |
toks -= L2T(q, len); |
|
if ((toks|ptoks) >= 0) { |
q->t_c = now; |
q->tokens = toks; |
q->ptokens = ptoks; |
sch->q.qlen--; |
sch->flags &= ~TCQ_F_THROTTLED; |
return skb; |
} |
|
if (!netif_queue_stopped(sch->dev)) { |
long delay = PSCHED_US2JIFFIE(max_t(long, -toks, -ptoks)); |
|
if (delay == 0) |
delay = 1; |
|
mod_timer(&q->wd_timer, jiffies+delay); |
} |
|
/* Maybe we have a shorter packet in the queue, |
which can be sent now. It sounds cool, |
	   but it is wrong in principle. |
We MUST NOT reorder packets under these circumstances. |
|
Really, if we split the flow into independent |
subflows, it would be a very good solution. |
This is the main idea of all FQ algorithms |
(cf. CSZ, HPFQ, HFSC) |
*/ |
|
if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) { |
/* When requeue fails skb is dropped */ |
sch->q.qlen--; |
sch->stats.drops++; |
} |
|
sch->flags |= TCQ_F_THROTTLED; |
sch->stats.overlimits++; |
} |
return NULL; |
} |
|
static void tbf_reset(struct Qdisc* sch) |
{ |
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; |
|
qdisc_reset(q->qdisc); |
sch->q.qlen = 0; |
PSCHED_GET_TIME(q->t_c); |
q->tokens = q->buffer; |
q->ptokens = q->mtu; |
sch->flags &= ~TCQ_F_THROTTLED; |
del_timer(&q->wd_timer); |
} |
|
static struct Qdisc *tbf_create_dflt_qdisc(struct net_device *dev, u32 limit) |
{ |
struct Qdisc *q = qdisc_create_dflt(dev, &bfifo_qdisc_ops); |
struct rtattr *rta; |
int ret; |
|
if (q) { |
rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); |
if (rta) { |
rta->rta_type = RTM_NEWQDISC; |
rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt)); |
((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit; |
|
ret = q->ops->change(q, rta); |
kfree(rta); |
|
if (ret == 0) |
return q; |
} |
qdisc_destroy(q); |
} |
|
return NULL; |
} |
|
static int tbf_change(struct Qdisc* sch, struct rtattr *opt) |
{ |
int err = -EINVAL; |
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; |
struct rtattr *tb[TCA_TBF_PTAB]; |
struct tc_tbf_qopt *qopt; |
struct qdisc_rate_table *rtab = NULL; |
struct qdisc_rate_table *ptab = NULL; |
struct Qdisc *child = NULL; |
int max_size,n; |
|
if (rtattr_parse(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) || |
tb[TCA_TBF_PARMS-1] == NULL || |
RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt)) |
goto done; |
|
qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]); |
rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]); |
if (rtab == NULL) |
goto done; |
|
if (qopt->peakrate.rate) { |
if (qopt->peakrate.rate > qopt->rate.rate) |
ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]); |
if (ptab == NULL) |
goto done; |
} |
|
for (n = 0; n < 256; n++) |
if (rtab->data[n] > qopt->buffer) break; |
max_size = (n << qopt->rate.cell_log)-1; |
if (ptab) { |
int size; |
|
for (n = 0; n < 256; n++) |
if (ptab->data[n] > qopt->mtu) break; |
size = (n << qopt->peakrate.cell_log)-1; |
if (size < max_size) max_size = size; |
} |
if (max_size < 0) |
goto done; |
|
if (q->qdisc == &noop_qdisc) { |
if ((child = tbf_create_dflt_qdisc(sch->dev, qopt->limit)) == NULL) |
goto done; |
} |
|
sch_tree_lock(sch); |
if (child) q->qdisc = child; |
q->limit = qopt->limit; |
q->mtu = qopt->mtu; |
q->max_size = max_size; |
q->buffer = qopt->buffer; |
q->tokens = q->buffer; |
q->ptokens = q->mtu; |
rtab = xchg(&q->R_tab, rtab); |
ptab = xchg(&q->P_tab, ptab); |
sch_tree_unlock(sch); |
err = 0; |
done: |
if (rtab) |
qdisc_put_rtab(rtab); |
if (ptab) |
qdisc_put_rtab(ptab); |
return err; |
} |
|
static int tbf_init(struct Qdisc* sch, struct rtattr *opt) |
{ |
int err; |
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; |
|
if (opt == NULL) |
return -EINVAL; |
|
MOD_INC_USE_COUNT; |
|
PSCHED_GET_TIME(q->t_c); |
init_timer(&q->wd_timer); |
q->wd_timer.function = tbf_watchdog; |
q->wd_timer.data = (unsigned long)sch; |
|
q->qdisc = &noop_qdisc; |
|
if ((err = tbf_change(sch, opt)) != 0) { |
MOD_DEC_USE_COUNT; |
} |
return err; |
} |
|
static void tbf_destroy(struct Qdisc *sch) |
{ |
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; |
|
del_timer(&q->wd_timer); |
|
if (q->P_tab) |
qdisc_put_rtab(q->P_tab); |
if (q->R_tab) |
qdisc_put_rtab(q->R_tab); |
|
qdisc_destroy(q->qdisc); |
q->qdisc = &noop_qdisc; |
|
MOD_DEC_USE_COUNT; |
} |
|
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) |
{ |
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; |
unsigned char *b = skb->tail; |
struct rtattr *rta; |
struct tc_tbf_qopt opt; |
|
rta = (struct rtattr*)b; |
RTA_PUT(skb, TCA_OPTIONS, 0, NULL); |
|
opt.limit = q->limit; |
opt.rate = q->R_tab->rate; |
if (q->P_tab) |
opt.peakrate = q->P_tab->rate; |
else |
memset(&opt.peakrate, 0, sizeof(opt.peakrate)); |
opt.mtu = q->mtu; |
opt.buffer = q->buffer; |
RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt); |
rta->rta_len = skb->tail - b; |
|
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static int tbf_dump_class(struct Qdisc *sch, unsigned long cl, |
struct sk_buff *skb, struct tcmsg *tcm) |
{ |
struct tbf_sched_data *q = (struct tbf_sched_data*)sch->data; |
|
if (cl != 1) /* only one class */ |
return -ENOENT; |
|
tcm->tcm_handle |= TC_H_MIN(1); |
tcm->tcm_info = q->qdisc->handle; |
|
return 0; |
} |
|
static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, |
struct Qdisc **old) |
{ |
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; |
|
if (new == NULL) |
new = &noop_qdisc; |
|
sch_tree_lock(sch); |
*old = xchg(&q->qdisc, new); |
qdisc_reset(*old); |
sch->q.qlen = 0; |
sch_tree_unlock(sch); |
|
return 0; |
} |
|
static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg) |
{ |
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; |
return q->qdisc; |
} |
|
static unsigned long tbf_get(struct Qdisc *sch, u32 classid) |
{ |
return 1; |
} |
|
static void tbf_put(struct Qdisc *sch, unsigned long arg) |
{ |
} |
|
static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid, |
struct rtattr **tca, unsigned long *arg) |
{ |
return -ENOSYS; |
} |
|
static int tbf_delete(struct Qdisc *sch, unsigned long arg) |
{ |
return -ENOSYS; |
} |
|
static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker) |
{ |
if (!walker->stop) { |
if (walker->count >= walker->skip) |
if (walker->fn(sch, 1, walker) < 0) { |
walker->stop = 1; |
return; |
} |
walker->count++; |
} |
} |
|
static struct Qdisc_class_ops tbf_class_ops = |
{ |
.graft = tbf_graft, |
.leaf = tbf_leaf, |
.get = tbf_get, |
.put = tbf_put, |
.change = tbf_change_class, |
.delete = tbf_delete, |
.walk = tbf_walk, |
.dump = tbf_dump_class, |
}; |
|
struct Qdisc_ops tbf_qdisc_ops = |
{ |
NULL, |
&tbf_class_ops, |
"tbf", |
sizeof(struct tbf_sched_data), |
|
tbf_enqueue, |
tbf_dequeue, |
tbf_requeue, |
tbf_drop, |
|
tbf_init, |
tbf_reset, |
tbf_destroy, |
tbf_change, |
|
tbf_dump, |
}; |
|
|
#ifdef MODULE |
int init_module(void) |
{ |
return register_qdisc(&tbf_qdisc_ops); |
} |
|
void cleanup_module(void) |
{ |
unregister_qdisc(&tbf_qdisc_ops); |
} |
#endif |
MODULE_LICENSE("GPL"); |
/sch_generic.c
0,0 → 1,543
/* |
* net/sched/sch_generic.c Generic packet scheduler routines. |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
* Jamal Hadi Salim, <hadi@cyberus.ca> 990601 |
* - Ingress support |
*/ |
|
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/config.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/netdevice.h> |
#include <linux/skbuff.h> |
#include <linux/rtnetlink.h> |
#include <linux/init.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
/* Main transmission queue. */ |
|
/* Main qdisc structure lock. |
|
   However, modifications to data participating in scheduling |
   must additionally be protected with the dev->queue_lock |
   spinlock. |
|
The idea is the following: |
- enqueue, dequeue are serialized via top level device |
spinlock dev->queue_lock. |
- tree walking is protected by read_lock(qdisc_tree_lock) |
and this lock is used only in process context. |
   - updates to the tree are made only under the rtnl semaphore, |
     hence this lock may be taken without disabling local BH. |
|
qdisc_tree_lock must be grabbed BEFORE dev->queue_lock! |
*/ |
rwlock_t qdisc_tree_lock = RW_LOCK_UNLOCKED; |
|
/* |
dev->queue_lock serializes queue accesses for this device |
AND dev->qdisc pointer itself. |
|
dev->xmit_lock serializes accesses to device driver. |
|
dev->queue_lock and dev->xmit_lock are mutually exclusive, |
if one is grabbed, another must be free. |
*/ |
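/*
 * Editor's note: the ordering rule stated above is the pattern followed
 * elsewhere in this directory when the tree is relinked, e.g. in
 * tc_ctl_tfilter() in cls_api.c later in this comparison:
 *
 *	write_lock(&qdisc_tree_lock);
 *	spin_lock_bh(&dev->queue_lock);
 *	... relink the filter/qdisc pointers ...
 *	spin_unlock_bh(&dev->queue_lock);
 *	write_unlock(&qdisc_tree_lock);
 */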
|
|
/* Kick device. |
   Note that this procedure can be called by a watchdog timer, so |
   we do not check the dev->tbusy flag here. |
|
Returns: 0 - queue is empty. |
>0 - queue is not empty, but throttled. |
<0 - queue is not empty. Device is throttled, if dev->tbusy != 0. |
|
NOTE: Called under dev->queue_lock with locally disabled BH. |
*/ |
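/*
 * Editor's note: for orientation, the caller (qdisc_run() in
 * include/net/pkt_sched.h) spins on the negative return roughly like
 * this (a sketch, not part of this file):
 *
 *	while (!netif_queue_stopped(dev) && qdisc_restart(dev) < 0)
 *		;
 */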
|
int qdisc_restart(struct net_device *dev) |
{ |
struct Qdisc *q = dev->qdisc; |
struct sk_buff *skb; |
|
/* Dequeue packet */ |
if ((skb = q->dequeue(q)) != NULL) { |
if (spin_trylock(&dev->xmit_lock)) { |
/* Remember that the driver is grabbed by us. */ |
dev->xmit_lock_owner = smp_processor_id(); |
|
/* And release queue */ |
spin_unlock(&dev->queue_lock); |
|
if (!netif_queue_stopped(dev)) { |
if (netdev_nit) |
dev_queue_xmit_nit(skb, dev); |
|
if (dev->hard_start_xmit(skb, dev) == 0) { |
dev->xmit_lock_owner = -1; |
spin_unlock(&dev->xmit_lock); |
|
spin_lock(&dev->queue_lock); |
return -1; |
} |
} |
|
/* Release the driver */ |
dev->xmit_lock_owner = -1; |
spin_unlock(&dev->xmit_lock); |
spin_lock(&dev->queue_lock); |
q = dev->qdisc; |
} else { |
/* So, someone grabbed the driver. */ |
|
			/* It may be a transient configuration error, |
			   when hard_start_xmit() recurses. We detect |
			   it by checking the xmit owner and drop the |
			   packet when a deadloop is detected. |
			 */ |
if (dev->xmit_lock_owner == smp_processor_id()) { |
kfree_skb(skb); |
if (net_ratelimit()) |
printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name); |
return -1; |
} |
netdev_rx_stat[smp_processor_id()].cpu_collision++; |
} |
|
/* Device kicked us out :( |
		   This is possible in the following cases: |
|
0. driver is locked |
1. fastroute is enabled |
2. device cannot determine busy state |
before start of transmission (f.e. dialout) |
3. device is buggy (ppp) |
*/ |
|
q->ops->requeue(skb, q); |
netif_schedule(dev); |
return 1; |
} |
return q->q.qlen; |
} |
|
static void dev_watchdog(unsigned long arg) |
{ |
struct net_device *dev = (struct net_device *)arg; |
|
spin_lock(&dev->xmit_lock); |
if (dev->qdisc != &noop_qdisc) { |
if (netif_device_present(dev) && |
netif_running(dev) && |
netif_carrier_ok(dev)) { |
if (netif_queue_stopped(dev) && |
(jiffies - dev->trans_start) > dev->watchdog_timeo) { |
printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n", dev->name); |
dev->tx_timeout(dev); |
} |
if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo)) |
dev_hold(dev); |
} |
} |
spin_unlock(&dev->xmit_lock); |
|
dev_put(dev); |
} |
|
static void dev_watchdog_init(struct net_device *dev) |
{ |
init_timer(&dev->watchdog_timer); |
dev->watchdog_timer.data = (unsigned long)dev; |
dev->watchdog_timer.function = dev_watchdog; |
} |
|
void __netdev_watchdog_up(struct net_device *dev) |
{ |
if (dev->tx_timeout) { |
if (dev->watchdog_timeo <= 0) |
dev->watchdog_timeo = 5*HZ; |
if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo)) |
dev_hold(dev); |
} |
} |
|
static void dev_watchdog_up(struct net_device *dev) |
{ |
spin_lock_bh(&dev->xmit_lock); |
__netdev_watchdog_up(dev); |
spin_unlock_bh(&dev->xmit_lock); |
} |
|
static void dev_watchdog_down(struct net_device *dev) |
{ |
spin_lock_bh(&dev->xmit_lock); |
if (del_timer(&dev->watchdog_timer)) |
__dev_put(dev); |
spin_unlock_bh(&dev->xmit_lock); |
} |
|
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces |
under all circumstances. It is difficult to invent anything faster or |
cheaper. |
*/ |
|
static int |
noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc) |
{ |
kfree_skb(skb); |
return NET_XMIT_CN; |
} |
|
static struct sk_buff * |
noop_dequeue(struct Qdisc * qdisc) |
{ |
return NULL; |
} |
|
static int |
noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc) |
{ |
if (net_ratelimit()) |
printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name); |
kfree_skb(skb); |
return NET_XMIT_CN; |
} |
|
struct Qdisc_ops noop_qdisc_ops = |
{ |
NULL, |
NULL, |
"noop", |
0, |
|
noop_enqueue, |
noop_dequeue, |
noop_requeue, |
}; |
|
struct Qdisc noop_qdisc = |
{ |
noop_enqueue, |
noop_dequeue, |
TCQ_F_BUILTIN, |
&noop_qdisc_ops, |
}; |
|
|
struct Qdisc_ops noqueue_qdisc_ops = |
{ |
NULL, |
NULL, |
"noqueue", |
0, |
|
noop_enqueue, |
noop_dequeue, |
noop_requeue, |
|
}; |
|
struct Qdisc noqueue_qdisc = |
{ |
NULL, |
noop_dequeue, |
TCQ_F_BUILTIN, |
&noqueue_qdisc_ops, |
}; |
|
|
static const u8 prio2band[TC_PRIO_MAX+1] = |
{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }; |
|
/* 3-band FIFO queue: old style, but should be a bit faster than |
generic prio+fifo combination. |
*/ |
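/*
 * Editor's note: band 0 is dequeued first.  With the prio2band map
 * above, TC_PRIO_INTERACTIVE (6) lands in band 0, TC_PRIO_BESTEFFORT (0)
 * in band 1 and TC_PRIO_BULK (2) in band 2, so interactive traffic is
 * always served before bulk transfers.
 */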
|
static int |
pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) |
{ |
struct sk_buff_head *list; |
|
list = ((struct sk_buff_head*)qdisc->data) + |
prio2band[skb->priority&TC_PRIO_MAX]; |
|
if (list->qlen < qdisc->dev->tx_queue_len) { |
__skb_queue_tail(list, skb); |
qdisc->q.qlen++; |
qdisc->stats.bytes += skb->len; |
qdisc->stats.packets++; |
return 0; |
} |
qdisc->stats.drops++; |
kfree_skb(skb); |
return NET_XMIT_DROP; |
} |
|
static struct sk_buff * |
pfifo_fast_dequeue(struct Qdisc* qdisc) |
{ |
int prio; |
struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data); |
struct sk_buff *skb; |
|
for (prio = 0; prio < 3; prio++, list++) { |
skb = __skb_dequeue(list); |
if (skb) { |
qdisc->q.qlen--; |
return skb; |
} |
} |
return NULL; |
} |
|
static int |
pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc) |
{ |
struct sk_buff_head *list; |
|
list = ((struct sk_buff_head*)qdisc->data) + |
prio2band[skb->priority&TC_PRIO_MAX]; |
|
__skb_queue_head(list, skb); |
qdisc->q.qlen++; |
return 0; |
} |
|
static void |
pfifo_fast_reset(struct Qdisc* qdisc) |
{ |
int prio; |
struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data); |
|
for (prio=0; prio < 3; prio++) |
skb_queue_purge(list+prio); |
qdisc->q.qlen = 0; |
} |
|
static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) |
{ |
unsigned char *b = skb->tail; |
struct tc_prio_qopt opt; |
|
opt.bands = 3; |
memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1); |
RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt) |
{ |
int i; |
struct sk_buff_head *list; |
|
list = ((struct sk_buff_head*)qdisc->data); |
|
for (i=0; i<3; i++) |
skb_queue_head_init(list+i); |
|
return 0; |
} |
|
static struct Qdisc_ops pfifo_fast_ops = |
{ |
NULL, |
NULL, |
"pfifo_fast", |
3 * sizeof(struct sk_buff_head), |
|
pfifo_fast_enqueue, |
pfifo_fast_dequeue, |
pfifo_fast_requeue, |
NULL, |
|
pfifo_fast_init, |
pfifo_fast_reset, |
NULL, |
NULL, |
pfifo_fast_dump, |
|
}; |
|
struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) |
{ |
struct Qdisc *sch; |
int size = sizeof(*sch) + ops->priv_size; |
|
sch = kmalloc(size, GFP_KERNEL); |
if (!sch) |
return NULL; |
memset(sch, 0, size); |
|
skb_queue_head_init(&sch->q); |
sch->ops = ops; |
sch->enqueue = ops->enqueue; |
sch->dequeue = ops->dequeue; |
sch->dev = dev; |
sch->stats.lock = &dev->queue_lock; |
atomic_set(&sch->refcnt, 1); |
if (!ops->init || ops->init(sch, NULL) == 0) |
return sch; |
|
kfree(sch); |
return NULL; |
} |
|
/* Under dev->queue_lock and BH! */ |
|
void qdisc_reset(struct Qdisc *qdisc) |
{ |
struct Qdisc_ops *ops = qdisc->ops; |
|
if (ops->reset) |
ops->reset(qdisc); |
} |
|
/* Under dev->queue_lock and BH! */ |
|
void qdisc_destroy(struct Qdisc *qdisc) |
{ |
struct Qdisc_ops *ops = qdisc->ops; |
struct net_device *dev; |
|
if (!atomic_dec_and_test(&qdisc->refcnt)) |
return; |
|
dev = qdisc->dev; |
|
if (dev) { |
struct Qdisc *q, **qp; |
for (qp = &qdisc->dev->qdisc_list; (q=*qp) != NULL; qp = &q->next) { |
if (q == qdisc) { |
*qp = q->next; |
break; |
} |
} |
} |
#ifdef CONFIG_NET_ESTIMATOR |
qdisc_kill_estimator(&qdisc->stats); |
#endif |
if (ops->reset) |
ops->reset(qdisc); |
if (ops->destroy) |
ops->destroy(qdisc); |
if (!(qdisc->flags&TCQ_F_BUILTIN)) |
kfree(qdisc); |
} |
|
|
void dev_activate(struct net_device *dev) |
{ |
	/* No queueing discipline is attached to the device; |
	   create a default one, i.e. pfifo_fast for devices |
	   which need queueing and noqueue_qdisc for |
	   virtual interfaces. |
	 */ |
|
if (dev->qdisc_sleeping == &noop_qdisc) { |
struct Qdisc *qdisc; |
if (dev->tx_queue_len) { |
qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops); |
if (qdisc == NULL) { |
printk(KERN_INFO "%s: activation failed\n", dev->name); |
return; |
} |
|
write_lock(&qdisc_tree_lock); |
qdisc->next = dev->qdisc_list; |
dev->qdisc_list = qdisc; |
write_unlock(&qdisc_tree_lock); |
|
} else { |
qdisc = &noqueue_qdisc; |
} |
write_lock(&qdisc_tree_lock); |
dev->qdisc_sleeping = qdisc; |
write_unlock(&qdisc_tree_lock); |
} |
|
spin_lock_bh(&dev->queue_lock); |
if ((dev->qdisc = dev->qdisc_sleeping) != &noqueue_qdisc) { |
dev->trans_start = jiffies; |
dev_watchdog_up(dev); |
} |
spin_unlock_bh(&dev->queue_lock); |
} |
|
void dev_deactivate(struct net_device *dev) |
{ |
struct Qdisc *qdisc; |
|
spin_lock_bh(&dev->queue_lock); |
qdisc = dev->qdisc; |
dev->qdisc = &noop_qdisc; |
|
qdisc_reset(qdisc); |
|
spin_unlock_bh(&dev->queue_lock); |
|
dev_watchdog_down(dev); |
|
while (test_bit(__LINK_STATE_SCHED, &dev->state)) |
yield(); |
|
spin_unlock_wait(&dev->xmit_lock); |
} |
|
void dev_init_scheduler(struct net_device *dev) |
{ |
write_lock(&qdisc_tree_lock); |
spin_lock_bh(&dev->queue_lock); |
dev->qdisc = &noop_qdisc; |
spin_unlock_bh(&dev->queue_lock); |
dev->qdisc_sleeping = &noop_qdisc; |
dev->qdisc_list = NULL; |
write_unlock(&qdisc_tree_lock); |
|
dev_watchdog_init(dev); |
} |
|
void dev_shutdown(struct net_device *dev) |
{ |
struct Qdisc *qdisc; |
|
write_lock(&qdisc_tree_lock); |
spin_lock_bh(&dev->queue_lock); |
qdisc = dev->qdisc_sleeping; |
dev->qdisc = &noop_qdisc; |
dev->qdisc_sleeping = &noop_qdisc; |
qdisc_destroy(qdisc); |
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE) |
if ((qdisc = dev->qdisc_ingress) != NULL) { |
dev->qdisc_ingress = NULL; |
qdisc_destroy(qdisc); |
} |
#endif |
BUG_TRAP(dev->qdisc_list == NULL); |
BUG_TRAP(!timer_pending(&dev->watchdog_timer)); |
dev->qdisc_list = NULL; |
spin_unlock_bh(&dev->queue_lock); |
write_unlock(&qdisc_tree_lock); |
} |
/cls_u32.c
0,0 → 1,718
/* |
* net/sched/cls_u32.c Ugly (or Universal) 32bit key Packet Classifier. |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
* |
* The filters are packed to hash tables of key nodes |
* with a set of 32bit key/mask pairs at every node. |
* Nodes reference next level hash tables etc. |
* |
* This scheme is the best universal classifier I managed to |
* invent; it is not super-fast, but it is not slow (provided you |
* program it correctly), and general enough. And its relative |
* speed grows as the number of rules becomes larger. |
* |
* It seems that it represents the best middle point between |
* speed and manageability both by human and by machine. |
* |
* It is especially useful for link sharing combined with QoS; |
* pure RSVP doesn't need such a general approach and can use |
* much simpler (and faster) schemes, sort of cls_rsvp.c. |
*/ |
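/*
 * Editor's note: an illustrative (not authoritative) way to drive this
 * classifier from userspace with the tc tool; device name and classids
 * are made up for the example:
 *
 *	tc filter add dev eth0 parent 1:0 protocol ip prio 10 u32 \
 *		match ip dst 10.0.0.1/32 flowid 1:10
 *
 * tc compiles the "match" expressions into the 32bit value/mask keys
 * that u32_classify() below walks at every node.
 */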
|
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/config.h> |
#include <linux/module.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/if_ether.h> |
#include <linux/inet.h> |
#include <linux/netdevice.h> |
#include <linux/etherdevice.h> |
#include <linux/notifier.h> |
#include <linux/rtnetlink.h> |
#include <net/ip.h> |
#include <net/route.h> |
#include <linux/skbuff.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
|
struct tc_u_knode |
{ |
struct tc_u_knode *next; |
u32 handle; |
struct tc_u_hnode *ht_up; |
#ifdef CONFIG_NET_CLS_POLICE |
struct tcf_police *police; |
#endif |
struct tcf_result res; |
struct tc_u_hnode *ht_down; |
struct tc_u32_sel sel; |
}; |
|
struct tc_u_hnode |
{ |
struct tc_u_hnode *next; |
u32 handle; |
struct tc_u_common *tp_c; |
int refcnt; |
unsigned divisor; |
u32 hgenerator; |
struct tc_u_knode *ht[1]; |
}; |
|
struct tc_u_common |
{ |
struct tc_u_common *next; |
struct tc_u_hnode *hlist; |
struct Qdisc *q; |
int refcnt; |
u32 hgenerator; |
}; |
|
static struct tc_u_common *u32_list; |
|
static __inline__ unsigned u32_hash_fold(u32 key, struct tc_u32_sel *sel) |
{ |
unsigned h = key & sel->hmask; |
|
h ^= h>>16; |
h ^= h>>8; |
return h; |
} |
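/*
 * Editor's note: the fold above XORs the upper bytes of the masked key
 * down into the low byte; u32_classify() then ANDs the result with the
 * table's divisor (stored as size-1, see u32_change() below), so for an
 * even spread the configured table size is normally a power of two.
 */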
|
static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) |
{ |
struct { |
struct tc_u_knode *knode; |
u8 *ptr; |
} stack[TC_U32_MAXDEPTH]; |
|
struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root; |
u8 *ptr = skb->nh.raw; |
struct tc_u_knode *n; |
int sdepth = 0; |
int off2 = 0; |
int sel = 0; |
int i; |
|
next_ht: |
n = ht->ht[sel]; |
|
next_knode: |
if (n) { |
struct tc_u32_key *key = n->sel.keys; |
|
for (i = n->sel.nkeys; i>0; i--, key++) { |
if ((*(u32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) { |
n = n->next; |
goto next_knode; |
} |
} |
if (n->ht_down == NULL) { |
check_terminal: |
if (n->sel.flags&TC_U32_TERMINAL) { |
*res = n->res; |
#ifdef CONFIG_NET_CLS_POLICE |
if (n->police) { |
int pol_res = tcf_police(skb, n->police); |
if (pol_res >= 0) |
return pol_res; |
} else |
#endif |
return 0; |
} |
n = n->next; |
goto next_knode; |
} |
|
/* PUSH */ |
if (sdepth >= TC_U32_MAXDEPTH) |
goto deadloop; |
stack[sdepth].knode = n; |
stack[sdepth].ptr = ptr; |
sdepth++; |
|
ht = n->ht_down; |
sel = 0; |
if (ht->divisor) |
sel = ht->divisor&u32_hash_fold(*(u32*)(ptr+n->sel.hoff), &n->sel); |
|
if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT))) |
goto next_ht; |
|
if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) { |
off2 = n->sel.off + 3; |
if (n->sel.flags&TC_U32_VAROFFSET) |
off2 += ntohs(n->sel.offmask & *(u16*)(ptr+n->sel.offoff)) >>n->sel.offshift; |
off2 &= ~3; |
} |
if (n->sel.flags&TC_U32_EAT) { |
ptr += off2; |
off2 = 0; |
} |
|
if (ptr < skb->tail) |
goto next_ht; |
} |
|
/* POP */ |
if (sdepth--) { |
n = stack[sdepth].knode; |
ht = n->ht_up; |
ptr = stack[sdepth].ptr; |
goto check_terminal; |
} |
return -1; |
|
deadloop: |
if (net_ratelimit()) |
printk("cls_u32: dead loop\n"); |
return -1; |
} |
|
static __inline__ struct tc_u_hnode * |
u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) |
{ |
struct tc_u_hnode *ht; |
|
for (ht = tp_c->hlist; ht; ht = ht->next) |
if (ht->handle == handle) |
break; |
|
return ht; |
} |
|
static __inline__ struct tc_u_knode * |
u32_lookup_key(struct tc_u_hnode *ht, u32 handle) |
{ |
unsigned sel; |
struct tc_u_knode *n; |
|
sel = TC_U32_HASH(handle); |
if (sel > ht->divisor) |
return 0; |
|
for (n = ht->ht[sel]; n; n = n->next) |
if (n->handle == handle) |
return n; |
|
return NULL; |
} |
|
|
static unsigned long u32_get(struct tcf_proto *tp, u32 handle) |
{ |
struct tc_u_hnode *ht; |
struct tc_u_common *tp_c = tp->data; |
|
if (TC_U32_HTID(handle) == TC_U32_ROOT) |
ht = tp->root; |
else |
ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle)); |
|
if (!ht) |
return 0; |
|
if (TC_U32_KEY(handle) == 0) |
return (unsigned long)ht; |
|
return (unsigned long)u32_lookup_key(ht, handle); |
} |
|
static void u32_put(struct tcf_proto *tp, unsigned long f) |
{ |
} |
|
static u32 gen_new_htid(struct tc_u_common *tp_c) |
{ |
int i = 0x800; |
|
do { |
if (++tp_c->hgenerator == 0x7FF) |
tp_c->hgenerator = 1; |
} while (--i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); |
|
return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0; |
} |
|
static int u32_init(struct tcf_proto *tp) |
{ |
struct tc_u_hnode *root_ht; |
struct tc_u_common *tp_c; |
|
MOD_INC_USE_COUNT; |
|
for (tp_c = u32_list; tp_c; tp_c = tp_c->next) |
if (tp_c->q == tp->q) |
break; |
|
root_ht = kmalloc(sizeof(*root_ht), GFP_KERNEL); |
if (root_ht == NULL) { |
MOD_DEC_USE_COUNT; |
return -ENOBUFS; |
} |
memset(root_ht, 0, sizeof(*root_ht)); |
root_ht->divisor = 0; |
root_ht->refcnt++; |
root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000; |
|
if (tp_c == NULL) { |
tp_c = kmalloc(sizeof(*tp_c), GFP_KERNEL); |
if (tp_c == NULL) { |
kfree(root_ht); |
MOD_DEC_USE_COUNT; |
return -ENOBUFS; |
} |
memset(tp_c, 0, sizeof(*tp_c)); |
tp_c->q = tp->q; |
tp_c->next = u32_list; |
u32_list = tp_c; |
} |
|
tp_c->refcnt++; |
root_ht->next = tp_c->hlist; |
tp_c->hlist = root_ht; |
root_ht->tp_c = tp_c; |
|
tp->root = root_ht; |
tp->data = tp_c; |
return 0; |
} |
|
static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) |
{ |
unsigned long cl; |
|
if ((cl = __cls_set_class(&n->res.class, 0)) != 0) |
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); |
#ifdef CONFIG_NET_CLS_POLICE |
tcf_police_release(n->police); |
#endif |
if (n->ht_down) |
n->ht_down->refcnt--; |
kfree(n); |
return 0; |
} |
|
static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key) |
{ |
struct tc_u_knode **kp; |
struct tc_u_hnode *ht = key->ht_up; |
|
if (ht) { |
for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) { |
if (*kp == key) { |
tcf_tree_lock(tp); |
*kp = key->next; |
tcf_tree_unlock(tp); |
|
u32_destroy_key(tp, key); |
return 0; |
} |
} |
} |
BUG_TRAP(0); |
return 0; |
} |
|
static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) |
{ |
struct tc_u_knode *n; |
unsigned h; |
|
for (h=0; h<=ht->divisor; h++) { |
while ((n = ht->ht[h]) != NULL) { |
ht->ht[h] = n->next; |
|
u32_destroy_key(tp, n); |
} |
} |
} |
|
static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) |
{ |
struct tc_u_common *tp_c = tp->data; |
struct tc_u_hnode **hn; |
|
BUG_TRAP(!ht->refcnt); |
|
u32_clear_hnode(tp, ht); |
|
for (hn = &tp_c->hlist; *hn; hn = &(*hn)->next) { |
if (*hn == ht) { |
*hn = ht->next; |
kfree(ht); |
return 0; |
} |
} |
|
BUG_TRAP(0); |
return -ENOENT; |
} |
|
static void u32_destroy(struct tcf_proto *tp) |
{ |
struct tc_u_common *tp_c = tp->data; |
struct tc_u_hnode *root_ht = xchg(&tp->root, NULL); |
|
BUG_TRAP(root_ht != NULL); |
|
if (root_ht && --root_ht->refcnt == 0) |
u32_destroy_hnode(tp, root_ht); |
|
if (--tp_c->refcnt == 0) { |
struct tc_u_hnode *ht; |
struct tc_u_common **tp_cp; |
|
for (tp_cp = &u32_list; *tp_cp; tp_cp = &(*tp_cp)->next) { |
if (*tp_cp == tp_c) { |
*tp_cp = tp_c->next; |
break; |
} |
} |
|
for (ht=tp_c->hlist; ht; ht = ht->next) |
u32_clear_hnode(tp, ht); |
|
while ((ht = tp_c->hlist) != NULL) { |
tp_c->hlist = ht->next; |
|
BUG_TRAP(ht->refcnt == 0); |
|
kfree(ht); |
}; |
|
kfree(tp_c); |
} |
|
MOD_DEC_USE_COUNT; |
tp->data = NULL; |
} |
|
static int u32_delete(struct tcf_proto *tp, unsigned long arg) |
{ |
struct tc_u_hnode *ht = (struct tc_u_hnode*)arg; |
|
if (ht == NULL) |
return 0; |
|
if (TC_U32_KEY(ht->handle)) |
return u32_delete_key(tp, (struct tc_u_knode*)ht); |
|
if (tp->root == ht) |
return -EINVAL; |
|
if (--ht->refcnt == 0) |
u32_destroy_hnode(tp, ht); |
|
return 0; |
} |
|
static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) |
{ |
struct tc_u_knode *n; |
unsigned i = 0x7FF; |
|
for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next) |
if (i < TC_U32_NODE(n->handle)) |
i = TC_U32_NODE(n->handle); |
i++; |
|
return handle|(i>0xFFF ? 0xFFF : i); |
} |
|
static int u32_set_parms(struct Qdisc *q, unsigned long base, |
struct tc_u_hnode *ht, |
struct tc_u_knode *n, struct rtattr **tb, |
struct rtattr *est) |
{ |
if (tb[TCA_U32_LINK-1]) { |
u32 handle = *(u32*)RTA_DATA(tb[TCA_U32_LINK-1]); |
struct tc_u_hnode *ht_down = NULL; |
|
if (TC_U32_KEY(handle)) |
return -EINVAL; |
|
if (handle) { |
ht_down = u32_lookup_ht(ht->tp_c, handle); |
|
if (ht_down == NULL) |
return -EINVAL; |
ht_down->refcnt++; |
} |
|
sch_tree_lock(q); |
ht_down = xchg(&n->ht_down, ht_down); |
sch_tree_unlock(q); |
|
if (ht_down) |
ht_down->refcnt--; |
} |
if (tb[TCA_U32_CLASSID-1]) { |
unsigned long cl; |
|
n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]); |
sch_tree_lock(q); |
cl = __cls_set_class(&n->res.class, q->ops->cl_ops->bind_tcf(q, base, n->res.classid)); |
sch_tree_unlock(q); |
if (cl) |
q->ops->cl_ops->unbind_tcf(q, cl); |
} |
#ifdef CONFIG_NET_CLS_POLICE |
if (tb[TCA_U32_POLICE-1]) { |
struct tcf_police *police = tcf_police_locate(tb[TCA_U32_POLICE-1], est); |
|
sch_tree_lock(q); |
police = xchg(&n->police, police); |
sch_tree_unlock(q); |
|
tcf_police_release(police); |
} |
#endif |
return 0; |
} |
|
static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, |
struct rtattr **tca, |
unsigned long *arg) |
{ |
struct tc_u_common *tp_c = tp->data; |
struct tc_u_hnode *ht; |
struct tc_u_knode *n; |
struct tc_u32_sel *s; |
struct rtattr *opt = tca[TCA_OPTIONS-1]; |
struct rtattr *tb[TCA_U32_MAX]; |
u32 htid; |
int err; |
|
if (opt == NULL) |
return handle ? -EINVAL : 0; |
|
if (rtattr_parse(tb, TCA_U32_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0) |
return -EINVAL; |
|
if ((n = (struct tc_u_knode*)*arg) != NULL) { |
if (TC_U32_KEY(n->handle) == 0) |
return -EINVAL; |
|
return u32_set_parms(tp->q, base, n->ht_up, n, tb, tca[TCA_RATE-1]); |
} |
|
if (tb[TCA_U32_DIVISOR-1]) { |
unsigned divisor = *(unsigned*)RTA_DATA(tb[TCA_U32_DIVISOR-1]); |
|
if (--divisor > 0x100) |
return -EINVAL; |
if (TC_U32_KEY(handle)) |
return -EINVAL; |
if (handle == 0) { |
handle = gen_new_htid(tp->data); |
if (handle == 0) |
return -ENOMEM; |
} |
ht = kmalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL); |
if (ht == NULL) |
return -ENOBUFS; |
memset(ht, 0, sizeof(*ht) + divisor*sizeof(void*)); |
ht->tp_c = tp_c; |
ht->refcnt = 0; |
ht->divisor = divisor; |
ht->handle = handle; |
ht->next = tp_c->hlist; |
tp_c->hlist = ht; |
*arg = (unsigned long)ht; |
return 0; |
} |
|
if (tb[TCA_U32_HASH-1]) { |
htid = *(unsigned*)RTA_DATA(tb[TCA_U32_HASH-1]); |
if (TC_U32_HTID(htid) == TC_U32_ROOT) { |
ht = tp->root; |
htid = ht->handle; |
} else { |
ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid)); |
if (ht == NULL) |
return -EINVAL; |
} |
} else { |
ht = tp->root; |
htid = ht->handle; |
} |
|
if (ht->divisor < TC_U32_HASH(htid)) |
return -EINVAL; |
|
if (handle) { |
if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid)) |
return -EINVAL; |
handle = htid | TC_U32_NODE(handle); |
} else |
handle = gen_new_kid(ht, htid); |
|
if (tb[TCA_U32_SEL-1] == 0 || |
RTA_PAYLOAD(tb[TCA_U32_SEL-1]) < sizeof(struct tc_u32_sel)) |
return -EINVAL; |
|
s = RTA_DATA(tb[TCA_U32_SEL-1]); |
n = kmalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); |
if (n == NULL) |
return -ENOBUFS; |
memset(n, 0, sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key)); |
memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); |
n->ht_up = ht; |
n->handle = handle; |
err = u32_set_parms(tp->q, base, ht, n, tb, tca[TCA_RATE-1]); |
if (err == 0) { |
struct tc_u_knode **ins; |
for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) |
if (TC_U32_NODE(handle) < TC_U32_NODE((*ins)->handle)) |
break; |
|
n->next = *ins; |
wmb(); |
*ins = n; |
|
*arg = (unsigned long)n; |
return 0; |
} |
kfree(n); |
return err; |
} |
|
static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) |
{ |
struct tc_u_common *tp_c = tp->data; |
struct tc_u_hnode *ht; |
struct tc_u_knode *n; |
unsigned h; |
|
if (arg->stop) |
return; |
|
for (ht = tp_c->hlist; ht; ht = ht->next) { |
if (arg->count >= arg->skip) { |
if (arg->fn(tp, (unsigned long)ht, arg) < 0) { |
arg->stop = 1; |
return; |
} |
} |
arg->count++; |
for (h = 0; h <= ht->divisor; h++) { |
for (n = ht->ht[h]; n; n = n->next) { |
if (arg->count < arg->skip) { |
arg->count++; |
continue; |
} |
if (arg->fn(tp, (unsigned long)n, arg) < 0) { |
arg->stop = 1; |
return; |
} |
arg->count++; |
} |
} |
} |
} |
|
static int u32_dump(struct tcf_proto *tp, unsigned long fh, |
struct sk_buff *skb, struct tcmsg *t) |
{ |
struct tc_u_knode *n = (struct tc_u_knode*)fh; |
unsigned char *b = skb->tail; |
struct rtattr *rta; |
|
if (n == NULL) |
return skb->len; |
|
t->tcm_handle = n->handle; |
|
rta = (struct rtattr*)b; |
RTA_PUT(skb, TCA_OPTIONS, 0, NULL); |
|
if (TC_U32_KEY(n->handle) == 0) { |
struct tc_u_hnode *ht = (struct tc_u_hnode*)fh; |
u32 divisor = ht->divisor+1; |
RTA_PUT(skb, TCA_U32_DIVISOR, 4, &divisor); |
} else { |
RTA_PUT(skb, TCA_U32_SEL, |
sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), |
&n->sel); |
if (n->ht_up) { |
u32 htid = n->handle & 0xFFFFF000; |
RTA_PUT(skb, TCA_U32_HASH, 4, &htid); |
} |
if (n->res.classid) |
RTA_PUT(skb, TCA_U32_CLASSID, 4, &n->res.classid); |
if (n->ht_down) |
RTA_PUT(skb, TCA_U32_LINK, 4, &n->ht_down->handle); |
#ifdef CONFIG_NET_CLS_POLICE |
if (n->police) { |
struct rtattr * p_rta = (struct rtattr*)skb->tail; |
|
RTA_PUT(skb, TCA_U32_POLICE, 0, NULL); |
|
if (tcf_police_dump(skb, n->police) < 0) |
goto rtattr_failure; |
|
p_rta->rta_len = skb->tail - (u8*)p_rta; |
} |
#endif |
} |
|
rta->rta_len = skb->tail - b; |
#ifdef CONFIG_NET_CLS_POLICE |
if (TC_U32_KEY(n->handle) && n->police) { |
if (qdisc_copy_stats(skb, &n->police->stats)) |
goto rtattr_failure; |
} |
#endif |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
struct tcf_proto_ops cls_u32_ops = { |
NULL, |
"u32", |
u32_classify, |
u32_init, |
u32_destroy, |
|
u32_get, |
u32_put, |
u32_change, |
u32_delete, |
u32_walk, |
u32_dump |
}; |
|
#ifdef MODULE |
int init_module(void) |
{ |
return register_tcf_proto_ops(&cls_u32_ops); |
} |
|
void cleanup_module(void) |
{ |
unregister_tcf_proto_ops(&cls_u32_ops); |
} |
#endif |
MODULE_LICENSE("GPL"); |
/cls_api.c
0,0 → 1,468
/* |
* net/sched/cls_api.c Packet classifier API. |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
* |
* Changes: |
* |
* Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support |
*/ |
|
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/config.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/netdevice.h> |
#include <linux/skbuff.h> |
#include <linux/rtnetlink.h> |
#include <linux/init.h> |
#include <linux/kmod.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
/* The list of all installed classifier types */ |
|
static struct tcf_proto_ops *tcf_proto_base; |
|
/* Protects the list of registered TC modules. It is a pure SMP lock. */ |
static rwlock_t cls_mod_lock = RW_LOCK_UNLOCKED; |
|
/* Find classifier type by string name */ |
|
struct tcf_proto_ops * tcf_proto_lookup_ops(struct rtattr *kind) |
{ |
struct tcf_proto_ops *t = NULL; |
|
if (kind) { |
read_lock(&cls_mod_lock); |
for (t = tcf_proto_base; t; t = t->next) { |
if (rtattr_strcmp(kind, t->kind) == 0) |
break; |
} |
read_unlock(&cls_mod_lock); |
} |
return t; |
} |
|
/* Register(unregister) new classifier type */ |
|
int register_tcf_proto_ops(struct tcf_proto_ops *ops) |
{ |
struct tcf_proto_ops *t, **tp; |
|
write_lock(&cls_mod_lock); |
for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) { |
if (strcmp(ops->kind, t->kind) == 0) { |
write_unlock(&cls_mod_lock); |
return -EEXIST; |
} |
} |
|
ops->next = NULL; |
*tp = ops; |
write_unlock(&cls_mod_lock); |
return 0; |
} |
|
int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) |
{ |
struct tcf_proto_ops *t, **tp; |
|
write_lock(&cls_mod_lock); |
for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) |
if (t == ops) |
break; |
|
if (!t) { |
write_unlock(&cls_mod_lock); |
return -ENOENT; |
} |
*tp = t->next; |
write_unlock(&cls_mod_lock); |
return 0; |
} |
|
static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, |
struct tcf_proto *tp, unsigned long fh, int event); |
|
|
/* Select a new prio value from the range managed by the kernel. */ |
|
static __inline__ u32 tcf_auto_prio(struct tcf_proto *tp) |
{ |
u32 first = TC_H_MAKE(0xC0000000U,0U); |
|
if (tp) |
first = tp->prio-1; |
|
return first; |
} |
|
/* Add/change/delete/get a filter node */ |
|
static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) |
{ |
struct rtattr **tca = arg; |
struct tcmsg *t = NLMSG_DATA(n); |
u32 protocol = TC_H_MIN(t->tcm_info); |
u32 prio = TC_H_MAJ(t->tcm_info); |
u32 nprio = prio; |
u32 parent = t->tcm_parent; |
struct net_device *dev; |
struct Qdisc *q; |
struct tcf_proto **back, **chain; |
struct tcf_proto *tp = NULL; |
struct tcf_proto_ops *tp_ops; |
struct Qdisc_class_ops *cops; |
unsigned long cl = 0; |
unsigned long fh; |
int err; |
|
if (prio == 0) { |
		/* If no priority is given, the user wants us to allocate it. */ |
if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) |
return -ENOENT; |
prio = TC_H_MAKE(0x80000000U,0U); |
} |
|
/* Find head of filter chain. */ |
|
/* Find link */ |
if ((dev = __dev_get_by_index(t->tcm_ifindex)) == NULL) |
return -ENODEV; |
|
/* Find qdisc */ |
if (!parent) { |
q = dev->qdisc_sleeping; |
parent = q->handle; |
} else if ((q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent))) == NULL) |
return -EINVAL; |
|
/* Is it classful? */ |
if ((cops = q->ops->cl_ops) == NULL) |
return -EINVAL; |
|
	/* Do we search for a filter attached to a class? */ |
if (TC_H_MIN(parent)) { |
cl = cops->get(q, parent); |
if (cl == 0) |
return -ENOENT; |
} |
|
/* And the last stroke */ |
chain = cops->tcf_chain(q, cl); |
err = -EINVAL; |
if (chain == NULL) |
goto errout; |
|
/* Check the chain for existence of proto-tcf with this priority */ |
for (back = chain; (tp=*back) != NULL; back = &tp->next) { |
if (tp->prio >= prio) { |
if (tp->prio == prio) { |
if (!nprio || (tp->protocol != protocol && protocol)) |
goto errout; |
} else |
tp = NULL; |
break; |
} |
} |
|
if (tp == NULL) { |
/* Proto-tcf does not exist, create new one */ |
|
if (tca[TCA_KIND-1] == NULL || !protocol) |
goto errout; |
|
err = -ENOENT; |
if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) |
goto errout; |
|
|
/* Create new proto tcf */ |
|
err = -ENOBUFS; |
if ((tp = kmalloc(sizeof(*tp), GFP_KERNEL)) == NULL) |
goto errout; |
tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND-1]); |
#ifdef CONFIG_KMOD |
if (tp_ops==NULL && tca[TCA_KIND-1] != NULL) { |
struct rtattr *kind = tca[TCA_KIND-1]; |
char module_name[4 + IFNAMSIZ + 1]; |
|
if (RTA_PAYLOAD(kind) <= IFNAMSIZ) { |
sprintf(module_name, "cls_%s", (char*)RTA_DATA(kind)); |
request_module (module_name); |
tp_ops = tcf_proto_lookup_ops(kind); |
} |
} |
#endif |
if (tp_ops == NULL) { |
err = -EINVAL; |
kfree(tp); |
goto errout; |
} |
memset(tp, 0, sizeof(*tp)); |
tp->ops = tp_ops; |
tp->protocol = protocol; |
tp->prio = nprio ? : tcf_auto_prio(*back); |
tp->q = q; |
tp->classify = tp_ops->classify; |
tp->classid = parent; |
err = tp_ops->init(tp); |
if (err) { |
kfree(tp); |
goto errout; |
} |
write_lock(&qdisc_tree_lock); |
spin_lock_bh(&dev->queue_lock); |
tp->next = *back; |
*back = tp; |
spin_unlock_bh(&dev->queue_lock); |
write_unlock(&qdisc_tree_lock); |
} else if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], tp->ops->kind)) |
goto errout; |
|
fh = tp->ops->get(tp, t->tcm_handle); |
|
if (fh == 0) { |
if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { |
write_lock(&qdisc_tree_lock); |
spin_lock_bh(&dev->queue_lock); |
*back = tp->next; |
spin_unlock_bh(&dev->queue_lock); |
write_unlock(&qdisc_tree_lock); |
tcf_destroy(tp); |
err = 0; |
goto errout; |
} |
|
err = -ENOENT; |
if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) |
goto errout; |
} else { |
switch (n->nlmsg_type) { |
case RTM_NEWTFILTER: |
err = -EEXIST; |
if (n->nlmsg_flags&NLM_F_EXCL) |
goto errout; |
break; |
case RTM_DELTFILTER: |
err = tp->ops->delete(tp, fh); |
goto errout; |
case RTM_GETTFILTER: |
err = tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); |
goto errout; |
default: |
err = -EINVAL; |
goto errout; |
} |
} |
|
err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh); |
if (err == 0) |
tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); |
|
errout: |
if (cl) |
cops->put(q, cl); |
return err; |
} |
|
static int |
tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh, |
u32 pid, u32 seq, unsigned flags, int event) |
{ |
struct tcmsg *tcm; |
struct nlmsghdr *nlh; |
unsigned char *b = skb->tail; |
|
nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); |
nlh->nlmsg_flags = flags; |
tcm = NLMSG_DATA(nlh); |
tcm->tcm_family = AF_UNSPEC; |
tcm->tcm_ifindex = tp->q->dev->ifindex; |
tcm->tcm_parent = tp->classid; |
tcm->tcm_handle = 0; |
tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); |
RTA_PUT(skb, TCA_KIND, IFNAMSIZ, tp->ops->kind); |
if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0) |
goto rtattr_failure; |
nlh->nlmsg_len = skb->tail - b; |
return skb->len; |
|
nlmsg_failure: |
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, |
struct tcf_proto *tp, unsigned long fh, int event) |
{ |
struct sk_buff *skb; |
u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; |
|
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); |
if (!skb) |
return -ENOBUFS; |
|
if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) { |
kfree_skb(skb); |
return -EINVAL; |
} |
|
return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); |
} |
|
struct tcf_dump_args |
{ |
struct tcf_walker w; |
struct sk_buff *skb; |
struct netlink_callback *cb; |
}; |
|
static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct tcf_walker *arg) |
{ |
struct tcf_dump_args *a = (void*)arg; |
|
return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid, |
a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); |
} |
|
static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) |
{ |
int t; |
int s_t; |
struct net_device *dev; |
struct Qdisc *q; |
struct tcf_proto *tp, **chain; |
struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); |
unsigned long cl = 0; |
struct Qdisc_class_ops *cops; |
struct tcf_dump_args arg; |
|
if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) |
return skb->len; |
if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) |
return skb->len; |
|
read_lock(&qdisc_tree_lock); |
if (!tcm->tcm_parent) |
q = dev->qdisc_sleeping; |
else |
q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); |
if (q == NULL) { |
read_unlock(&qdisc_tree_lock); |
dev_put(dev); |
return skb->len; |
} |
if ((cops = q->ops->cl_ops) == NULL) |
goto errout; |
if (TC_H_MIN(tcm->tcm_parent)) { |
cl = cops->get(q, tcm->tcm_parent); |
if (cl == 0) |
goto errout; |
} |
chain = cops->tcf_chain(q, cl); |
if (chain == NULL) |
goto errout; |
|
s_t = cb->args[0]; |
|
for (tp=*chain, t=0; tp; tp = tp->next, t++) { |
if (t < s_t) continue; |
if (TC_H_MAJ(tcm->tcm_info) && |
TC_H_MAJ(tcm->tcm_info) != tp->prio) |
continue; |
if (TC_H_MIN(tcm->tcm_info) && |
TC_H_MIN(tcm->tcm_info) != tp->protocol) |
continue; |
if (t > s_t) |
memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); |
if (cb->args[1] == 0) { |
if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid, |
cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) { |
break; |
} |
cb->args[1] = 1; |
} |
if (tp->ops->walk == NULL) |
continue; |
arg.w.fn = tcf_node_dump; |
arg.skb = skb; |
arg.cb = cb; |
arg.w.stop = 0; |
arg.w.skip = cb->args[1]-1; |
arg.w.count = 0; |
tp->ops->walk(tp, &arg.w); |
cb->args[1] = arg.w.count+1; |
if (arg.w.stop) |
break; |
} |
|
cb->args[0] = t; |
|
errout: |
if (cl) |
cops->put(q, cl); |
|
read_unlock(&qdisc_tree_lock); |
dev_put(dev); |
return skb->len; |
} |
|
|
int __init tc_filter_init(void) |
{ |
struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC]; |
|
	/* Set up rtnetlink links. This is done here to avoid |
	   exporting a large number of public symbols. |
	 */ |
|
if (link_p) { |
link_p[RTM_NEWTFILTER-RTM_BASE].doit = tc_ctl_tfilter; |
link_p[RTM_DELTFILTER-RTM_BASE].doit = tc_ctl_tfilter; |
link_p[RTM_GETTFILTER-RTM_BASE].doit = tc_ctl_tfilter; |
link_p[RTM_GETTFILTER-RTM_BASE].dumpit = tc_dump_tfilter; |
} |
#define INIT_TC_FILTER(name) { \ |
extern struct tcf_proto_ops cls_##name##_ops; \ |
register_tcf_proto_ops(&cls_##name##_ops); \ |
} |
|
#ifdef CONFIG_NET_CLS_U32 |
INIT_TC_FILTER(u32); |
#endif |
#ifdef CONFIG_NET_CLS_ROUTE4 |
INIT_TC_FILTER(route4); |
#endif |
#ifdef CONFIG_NET_CLS_FW |
INIT_TC_FILTER(fw); |
#endif |
#ifdef CONFIG_NET_CLS_RSVP |
INIT_TC_FILTER(rsvp); |
#endif |
#ifdef CONFIG_NET_CLS_TCINDEX |
INIT_TC_FILTER(tcindex); |
#endif |
#ifdef CONFIG_NET_CLS_RSVP6 |
INIT_TC_FILTER(rsvp6); |
#endif |
return 0; |
} |
/sch_htb.c
0,0 → 1,1698
/* vim: ts=8 sw=8 |
* net/sched/sch_htb.c Hierarchical token bucket, feed tree version |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Martin Devera, <devik@cdi.cz> |
* |
* Credits (in time order) for older HTB versions: |
* Stef Coene <stef.coene@docum.org> |
* HTB support at LARTC mailing list |
* Ondrej Kraus, <krauso@barr.cz> |
* found missing INIT_QDISC(htb) |
* Vladimir Smelhaus, Aamer Akhter, Bert Hubert |
* helped a lot to locate nasty class stall bug |
* Andi Kleen, Jamal Hadi, Bert Hubert |
* code review and helpful comments on shaping |
* Tomasz Wrona, <tw@eter.tym.pl> |
* created test case so that I was able to fix nasty bug |
* Wilfried Weissmann |
* spotted bug in dequeue code and helped with fix |
* Jiri Fojtasek |
* fixed requeue routine |
* and many others. thanks. |
* |
* $Id: sch_htb.c,v 1.1.1.1 2004-04-15 01:16:18 phoenix Exp $ |
*/ |
#include <linux/config.h> |
#include <linux/module.h> |
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/version.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/if_ether.h> |
#include <linux/inet.h> |
#include <linux/netdevice.h> |
#include <linux/etherdevice.h> |
#include <linux/notifier.h> |
#include <net/ip.h> |
#include <net/route.h> |
#include <linux/skbuff.h> |
#include <linux/list.h> |
#include <linux/compiler.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
#include <linux/rbtree.h> |
|
/* HTB algorithm. |
Author: devik@cdi.cz |
======================================================================== |
HTB is like TBF with multiple classes. It is also similar to CBQ because |
    it allows assigning a priority to each class in the hierarchy. |
In fact it is another implementation of Floyd's formal sharing. |
|
Levels: |
    Each class is assigned a level. A leaf ALWAYS has level 0 and root |
    classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level |
    one less than their parent. |
*/ |
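/*
 * Editor's note: an illustrative two-level hierarchy built with the tc
 * tool (device name and rates are examples only):
 *
 *	tc qdisc add dev eth0 root handle 1: htb default 20
 *	tc class add dev eth0 parent 1:  classid 1:1  htb rate 1mbit
 *	tc class add dev eth0 parent 1:1 classid 1:10 htb rate 600kbit ceil 1mbit
 *	tc class add dev eth0 parent 1:1 classid 1:20 htb rate 400kbit ceil 1mbit
 *
 * Here 1:10 and 1:20 are leaves (level 0), 1:1 is an interior node one
 * level up, and unclassified traffic falls into the default class 1:20.
 */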
|
#define HTB_HSIZE 16 /* classid hash size */ |
#define HTB_EWMAC 2 /* rate average over HTB_EWMAC*HTB_HSIZE sec */ |
#define HTB_DEBUG 1 /* compile debugging support (activated by tc tool) */ |
#define HTB_RATECM 1 /* whether to use rate computer */ |
#define HTB_HYSTERESIS 1 /* whether to use mode hysteresis for speedup */ |
#define HTB_QLOCK(S) spin_lock_bh(&(S)->dev->queue_lock) |
#define HTB_QUNLOCK(S) spin_unlock_bh(&(S)->dev->queue_lock) |
#define HTB_VER 0x30010	/* major must match the number supplied by TC as version */ |
|
#if HTB_VER >> 16 != TC_HTB_PROTOVER |
#error "Mismatched sch_htb.c and pkt_sch.h" |
#endif |
|
/* debugging support; S is subsystem, these are defined: |
0 - netlink messages |
1 - enqueue |
2 - drop & requeue |
3 - dequeue main |
4 - dequeue one prio DRR part |
5 - dequeue class accounting |
6 - class overlimit status computation |
7 - hint tree |
8 - event queue |
10 - rate estimator |
11 - classifier |
12 - fast dequeue cache |
|
L is level; 0 = none, 1 = basic info, 2 = detailed, 3 = full |
    the q->debug uint32 contains 16 2-bit fields, one per subsystem, |
    starting from the LSB |
*/ |
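/*
 * Editor's note: a worked example of the encoding above - to get
 * "detailed" (level 2) output from subsystem 3 (dequeue main), set the
 * 2-bit field at bits 6..7, i.e. q->debug = 2 << (2*3) = 0x80.
 */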
#ifdef HTB_DEBUG |
#define HTB_DBG_COND(S,L) (((q->debug>>(2*S))&3) >= L) |
#define HTB_DBG(S,L,FMT,ARG...) if (HTB_DBG_COND(S,L)) \ |
printk(KERN_DEBUG FMT,##ARG) |
#define HTB_CHCL(cl) BUG_TRAP((cl)->magic == HTB_CMAGIC) |
#define HTB_PASSQ q, |
#define HTB_ARGQ struct htb_sched *q, |
#define static |
#undef __inline__ |
#define __inline__ |
#undef inline |
#define inline |
#define HTB_CMAGIC 0xFEFAFEF1 |
#define htb_safe_rb_erase(N,R) do { BUG_TRAP((N)->rb_color != -1); \ |
if ((N)->rb_color == -1) break; \ |
rb_erase(N,R); \ |
(N)->rb_color = -1; } while (0) |
#else |
#define HTB_DBG_COND(S,L) (0) |
#define HTB_DBG(S,L,FMT,ARG...) |
#define HTB_PASSQ |
#define HTB_ARGQ |
#define HTB_CHCL(cl) |
#define htb_safe_rb_erase(N,R) rb_erase(N,R) |
#endif |
|
|
/* used internally to keep the status of a single class */ |
enum htb_cmode { |
HTB_CANT_SEND, /* class can't send and can't borrow */ |
HTB_MAY_BORROW, /* class can't send but may borrow */ |
HTB_CAN_SEND /* class can send */ |
}; |
|
/* interior & leaf nodes; props specific to leaves are marked L: */ |
struct htb_class |
{ |
#ifdef HTB_DEBUG |
unsigned magic; |
#endif |
/* general class parameters */ |
u32 classid; |
struct tc_stats stats; /* generic stats */ |
struct tc_htb_xstats xstats;/* our special stats */ |
int refcnt; /* usage count of this class */ |
|
#ifdef HTB_RATECM |
/* rate measurement counters */ |
unsigned long rate_bytes,sum_bytes; |
unsigned long rate_packets,sum_packets; |
#endif |
|
/* topology */ |
int level; /* our level (see above) */ |
struct htb_class *parent; /* parent class */ |
struct list_head hlist; /* classid hash list item */ |
struct list_head sibling; /* sibling list item */ |
struct list_head children; /* children list */ |
|
union { |
struct htb_class_leaf { |
struct Qdisc *q; |
int prio; |
int aprio; |
int quantum; |
int deficit[TC_HTB_MAXDEPTH]; |
struct list_head drop_list; |
} leaf; |
struct htb_class_inner { |
rb_root_t feed[TC_HTB_NUMPRIO]; /* feed trees */ |
rb_node_t *ptr[TC_HTB_NUMPRIO]; /* current class ptr */ |
} inner; |
} un; |
rb_node_t node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ |
rb_node_t pq_node; /* node for event queue */ |
unsigned long pq_key; /* the same type as jiffies global */ |
|
int prio_activity; /* for which prios are we active */ |
enum htb_cmode cmode; /* current mode of the class */ |
|
/* class attached filters */ |
struct tcf_proto *filter_list; |
int filter_cnt; |
|
int warned; /* only one warning about non work conserving .. */ |
|
/* token bucket parameters */ |
struct qdisc_rate_table *rate; /* rate table of the class itself */ |
struct qdisc_rate_table *ceil; /* ceiling rate (limits borrows too) */ |
long buffer,cbuffer; /* token bucket depth/rate */ |
long mbuffer; /* max wait time */ |
long tokens,ctokens; /* current number of tokens */ |
psched_time_t t_c; /* checkpoint time */ |
}; |
|
/* TODO: maybe compute rate when size is too large .. or drop ? */ |
static __inline__ long L2T(struct htb_class *cl,struct qdisc_rate_table *rate, |
int size) |
{ |
int slot = size >> rate->rate.cell_log; |
if (slot > 255) { |
cl->xstats.giants++; |
slot = 255; |
} |
return rate->data[slot]; |
} |
|
struct htb_sched |
{ |
struct list_head root; /* root classes list */ |
struct list_head hash[HTB_HSIZE]; /* hashed by classid */ |
struct list_head drops[TC_HTB_NUMPRIO]; /* active leaves (for drops) */ |
|
/* self list - roots of self generating tree */ |
rb_root_t row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; |
int row_mask[TC_HTB_MAXDEPTH]; |
rb_node_t *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; |
|
/* self wait list - roots of wait PQs per row */ |
rb_root_t wait_pq[TC_HTB_MAXDEPTH]; |
|
/* time of nearest event per level (row) */ |
unsigned long near_ev_cache[TC_HTB_MAXDEPTH]; |
|
/* cached value of jiffies in dequeue */ |
unsigned long jiffies; |
|
/* whether we hit non-work conserving class during this dequeue; we use */ |
int nwc_hit; /* this to disable mindelay complaint in dequeue */ |
|
int defcls; /* class where unclassified flows go to */ |
u32 debug; /* subsystem debug levels */ |
|
/* filters for qdisc itself */ |
struct tcf_proto *filter_list; |
int filter_cnt; |
|
int rate2quantum; /* quant = rate / rate2quantum */ |
psched_time_t now; /* cached dequeue time */ |
struct timer_list timer; /* send delay timer */ |
#ifdef HTB_RATECM |
struct timer_list rttim; /* rate computer timer */ |
int recmp_bucket; /* which hash bucket to recompute next */ |
#endif |
|
/* non shaped skbs; let them go directly thru */ |
struct sk_buff_head direct_queue; |
int direct_qlen; /* max qlen of above */ |
|
long direct_pkts; |
}; |
|
/* compute hash of size HTB_HSIZE for given handle */ |
static __inline__ int htb_hash(u32 h) |
{ |
#if HTB_HSIZE != 16 |
#error "Declare new hash for your HTB_HSIZE" |
#endif |
h ^= h>>8; /* stolen from cbq_hash */ |
h ^= h>>4; |
return h & 0xf; |
} |
|
/* find class in global hash table using given handle */ |
static __inline__ struct htb_class *htb_find(u32 handle, struct Qdisc *sch) |
{ |
struct htb_sched *q = (struct htb_sched *)sch->data; |
struct list_head *p; |
if (TC_H_MAJ(handle) != sch->handle) |
return NULL; |
|
list_for_each (p,q->hash+htb_hash(handle)) { |
struct htb_class *cl = list_entry(p,struct htb_class,hlist); |
if (cl->classid == handle) |
return cl; |
} |
return NULL; |
} |
|
/** |
* htb_classify - classify a packet into class |
* |
* It returns NULL if the packet should be dropped or HTB_DIRECT (-1) if it |
* should be passed through directly. In all other cases a leaf class is |
* returned. We allow direct class selection by classid in skb->priority. |
* Then we examine the filters attached to the qdisc and to inner nodes (if |
* a higher filter points to an inner node). If we end up with classid |
* MAJOR:0 we enqueue the skb into the special internal fifo (direct); such |
* packets then go directly through. If we still have no valid leaf we try |
* the MAJOR:default leaf; if that is also unsuccessful we fall back to the |
* direct queue. |
*/ |
#define HTB_DIRECT (struct htb_class*)-1 |
static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch) |
{ |
struct htb_sched *q = (struct htb_sched *)sch->data; |
struct htb_class *cl; |
struct tcf_result res; |
struct tcf_proto *tcf; |
int result; |
|
/* allow selecting a class by setting skb->priority to a valid classid; |
note that nfmark can be used too, by attaching an fw filter with no |
rules in it */ |
if (skb->priority == sch->handle) |
return HTB_DIRECT; /* X:0 (direct flow) selected */ |
if ((cl = htb_find(skb->priority,sch)) != NULL && cl->level == 0) |
return cl; |
|
tcf = q->filter_list; |
while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { |
#ifdef CONFIG_NET_CLS_POLICE |
if (result == TC_POLICE_SHOT) |
return NULL; |
#endif |
if ((cl = (void*)res.class) == NULL) { |
if (res.classid == sch->handle) |
return HTB_DIRECT; /* X:0 (direct flow) */ |
if ((cl = htb_find(res.classid,sch)) == NULL) |
break; /* filter selected invalid classid */ |
} |
if (!cl->level) |
return cl; /* we hit leaf; return it */ |
|
/* we have got inner class; apply inner filter chain */ |
tcf = cl->filter_list; |
} |
/* classification failed; try to use default class */ |
cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle),q->defcls),sch); |
if (!cl || cl->level) |
return HTB_DIRECT; /* bad default .. this is safe bet */ |
return cl; |
} |
|
#ifdef HTB_DEBUG |
static void htb_next_rb_node(rb_node_t **n); |
#define HTB_DUMTREE(root,memb) if(root) { \ |
rb_node_t *n = (root)->rb_node; \ |
while (n->rb_left) n = n->rb_left; \ |
while (n) { \ |
struct htb_class *cl = rb_entry(n, struct htb_class, memb); \ |
printk(" %x",cl->classid); htb_next_rb_node (&n); \ |
} } |
|
static void htb_debug_dump (struct htb_sched *q) |
{ |
int i,p; |
printk(KERN_DEBUG "htb*g j=%lu lj=%lu\n",jiffies,q->jiffies); |
/* rows */ |
for (i=TC_HTB_MAXDEPTH-1;i>=0;i--) { |
printk(KERN_DEBUG "htb*r%d m=%x",i,q->row_mask[i]); |
for (p=0;p<TC_HTB_NUMPRIO;p++) { |
if (!q->row[i][p].rb_node) continue; |
printk(" p%d:",p); |
HTB_DUMTREE(q->row[i]+p,node[p]); |
} |
printk("\n"); |
} |
/* classes */ |
for (i = 0; i < HTB_HSIZE; i++) { |
struct list_head *l; |
list_for_each (l,q->hash+i) { |
struct htb_class *cl = list_entry(l,struct htb_class,hlist); |
long diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer, 0); |
printk(KERN_DEBUG "htb*c%x m=%d t=%ld c=%ld pq=%lu df=%ld ql=%d " |
"pa=%x f:", |
cl->classid,cl->cmode,cl->tokens,cl->ctokens, |
cl->pq_node.rb_color==-1?0:cl->pq_key,diff, |
cl->level?0:cl->un.leaf.q->q.qlen,cl->prio_activity); |
if (cl->level) |
for (p=0;p<TC_HTB_NUMPRIO;p++) { |
if (!cl->un.inner.feed[p].rb_node) continue; |
printk(" p%d a=%x:",p,cl->un.inner.ptr[p]?rb_entry(cl->un.inner.ptr[p], struct htb_class,node[p])->classid:0); |
HTB_DUMTREE(cl->un.inner.feed+p,node[p]); |
} |
printk("\n"); |
} |
} |
} |
#endif |
/** |
* htb_add_to_id_tree - adds class to the round robin list |
* |
* Routine adds class to the list (actually tree) sorted by classid. |
* Make sure that class is not already on such list for given prio. |
*/ |
static void htb_add_to_id_tree (HTB_ARGQ rb_root_t *root, |
struct htb_class *cl,int prio) |
{ |
rb_node_t **p = &root->rb_node, *parent = NULL; |
HTB_DBG(7,3,"htb_add_id_tree cl=%X prio=%d\n",cl->classid,prio); |
#ifdef HTB_DEBUG |
if (cl->node[prio].rb_color != -1) { BUG_TRAP(0); return; } |
HTB_CHCL(cl); |
if (*p) { |
struct htb_class *x = rb_entry(*p,struct htb_class,node[prio]); |
HTB_CHCL(x); |
} |
#endif |
while (*p) { |
struct htb_class *c; parent = *p; |
c = rb_entry(parent, struct htb_class, node[prio]); |
HTB_CHCL(c); |
if (cl->classid > c->classid) |
p = &parent->rb_right; |
else |
p = &parent->rb_left; |
} |
rb_link_node(&cl->node[prio], parent, p); |
rb_insert_color(&cl->node[prio], root); |
} |
|
/** |
* htb_add_to_wait_tree - adds class to the event queue with delay |
* |
* The class is added to the priority event queue to indicate that it will |
* change its mode at time cl->pq_key (jiffies; the "delay" argument is in |
* PSCHED microsecond units). Make sure the class is not already queued. |
*/ |
static void htb_add_to_wait_tree (struct htb_sched *q, |
struct htb_class *cl,long delay,int debug_hint) |
{ |
rb_node_t **p = &q->wait_pq[cl->level].rb_node, *parent = NULL; |
HTB_DBG(7,3,"htb_add_wt cl=%X key=%lu\n",cl->classid,cl->pq_key); |
#ifdef HTB_DEBUG |
if (cl->pq_node.rb_color != -1) { BUG_TRAP(0); return; } |
HTB_CHCL(cl); |
if ((delay <= 0 || delay > cl->mbuffer) && net_ratelimit()) |
printk(KERN_ERR "HTB: suspicious delay in wait_tree d=%ld cl=%X h=%d\n",delay,cl->classid,debug_hint); |
#endif |
cl->pq_key = q->jiffies + PSCHED_US2JIFFIE(delay); |
if (cl->pq_key == q->jiffies) |
cl->pq_key++; |
|
/* update the nearest event cache */ |
if (time_after(q->near_ev_cache[cl->level], cl->pq_key)) |
q->near_ev_cache[cl->level] = cl->pq_key; |
|
while (*p) { |
struct htb_class *c; parent = *p; |
c = rb_entry(parent, struct htb_class, pq_node); |
if (time_after_eq(cl->pq_key, c->pq_key)) |
p = &parent->rb_right; |
else |
p = &parent->rb_left; |
} |
rb_link_node(&cl->pq_node, parent, p); |
rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]); |
} |
|
/** |
* htb_next_rb_node - finds next node in binary tree |
* |
* When we are past the last key, *n is set to NULL. |
* Average complexity is 2 steps per call. |
*/ |
static void htb_next_rb_node(rb_node_t **n) |
{ |
rb_node_t *p; |
if ((*n)->rb_right) { |
/* right child exists; successor is the leftmost node of that subtree */ |
*n = (*n)->rb_right; |
while ((*n)->rb_left) |
*n = (*n)->rb_left; |
return; |
} |
while ((p = (*n)->rb_parent) != NULL) { |
/* if we've arrived from left child then we have next node */ |
if (p->rb_left == *n) break; |
*n = p; |
} |
*n = p; |
} |
|
/** |
* htb_add_class_to_row - add class to its row |
* |
* The class is added to row at priorities marked in mask. |
* It does nothing if mask == 0. |
*/ |
static inline void htb_add_class_to_row(struct htb_sched *q, |
struct htb_class *cl,int mask) |
{ |
HTB_DBG(7,2,"htb_addrow cl=%X mask=%X rmask=%X\n", |
cl->classid,mask,q->row_mask[cl->level]); |
HTB_CHCL(cl); |
q->row_mask[cl->level] |= mask; |
while (mask) { |
int prio = ffz(~mask); |
mask &= ~(1 << prio); |
htb_add_to_id_tree(HTB_PASSQ q->row[cl->level]+prio,cl,prio); |
} |
} |
|
/** |
* htb_remove_class_from_row - removes class from its row |
* |
* The class is removed from row at priorities marked in mask. |
* It does nothing if mask == 0. |
*/ |
static __inline__ void htb_remove_class_from_row(struct htb_sched *q, |
struct htb_class *cl,int mask) |
{ |
int m = 0; |
HTB_CHCL(cl); |
while (mask) { |
int prio = ffz(~mask); |
mask &= ~(1 << prio); |
if (q->ptr[cl->level][prio] == cl->node+prio) |
htb_next_rb_node(q->ptr[cl->level]+prio); |
htb_safe_rb_erase(cl->node + prio,q->row[cl->level]+prio); |
if (!q->row[cl->level][prio].rb_node) |
m |= 1 << prio; |
} |
HTB_DBG(7,2,"htb_delrow cl=%X mask=%X rmask=%X maskdel=%X\n", |
cl->classid,mask,q->row_mask[cl->level],m); |
q->row_mask[cl->level] &= ~m; |
} |
|
/** |
* htb_activate_prios - creates the active class's feed chain |
* |
* The class is connected to its ancestors and/or to the appropriate rows |
* for the priorities it participates in. cl->cmode must already be the |
* new (activated) mode. It does nothing if cl->prio_activity == 0. |
*/ |
static void htb_activate_prios(struct htb_sched *q,struct htb_class *cl) |
{ |
struct htb_class *p = cl->parent; |
long m,mask = cl->prio_activity; |
HTB_DBG(7,2,"htb_act_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode); |
HTB_CHCL(cl); |
|
while (cl->cmode == HTB_MAY_BORROW && p && mask) { |
HTB_CHCL(p); |
m = mask; while (m) { |
int prio = ffz(~m); |
m &= ~(1 << prio); |
|
if (p->un.inner.feed[prio].rb_node) |
/* the parent already has this prio's feed in use, so clear |
the bit in mask; the parent is already taken care of */ |
mask &= ~(1 << prio); |
|
htb_add_to_id_tree(HTB_PASSQ p->un.inner.feed+prio,cl,prio); |
} |
HTB_DBG(7,3,"htb_act_pr_aft p=%X pact=%X mask=%lX pmode=%d\n", |
p->classid,p->prio_activity,mask,p->cmode); |
p->prio_activity |= mask; |
cl = p; p = cl->parent; |
HTB_CHCL(cl); |
} |
if (cl->cmode == HTB_CAN_SEND && mask) |
htb_add_class_to_row(q,cl,mask); |
} |
|
/** |
* htb_deactivate_prios - remove class from feed chain |
* |
* cl->cmode must represent old mode (before deactivation). It does |
* nothing if cl->prio_activity == 0. Class is removed from all feed |
* chains and rows. |
*/ |
static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) |
{ |
struct htb_class *p = cl->parent; |
long m,mask = cl->prio_activity; |
HTB_DBG(7,2,"htb_deact_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode); |
HTB_CHCL(cl); |
|
while (cl->cmode == HTB_MAY_BORROW && p && mask) { |
m = mask; mask = 0; |
while (m) { |
int prio = ffz(~m); |
m &= ~(1 << prio); |
|
if (p->un.inner.ptr[prio] == cl->node+prio) |
htb_next_rb_node(p->un.inner.ptr + prio); |
|
htb_safe_rb_erase(cl->node + prio,p->un.inner.feed + prio); |
|
if (!p->un.inner.feed[prio].rb_node) |
mask |= 1 << prio; |
} |
HTB_DBG(7,3,"htb_deact_pr_aft p=%X pact=%X mask=%lX pmode=%d\n", |
p->classid,p->prio_activity,mask,p->cmode); |
p->prio_activity &= ~mask; |
cl = p; p = cl->parent; |
HTB_CHCL(cl); |
} |
if (cl->cmode == HTB_CAN_SEND && mask) |
htb_remove_class_from_row(q,cl,mask); |
} |
|
/** |
* htb_class_mode - computes and returns current class mode |
* |
* It computes cl's mode at time cl->t_c+diff and returns it. If the mode |
* is not HTB_CAN_SEND then *diff is set to the amount of time remaining |
* until cl will change its state. |
* It is also worth noting that the class mode does not change simply at |
* cl->{c,}tokens == 0; instead there is a hysteresis over the |
* 0 .. -cl->{c,}buffer range, meant to limit the number of mode |
* transitions per time unit. The speed gain is about 1/6. |
*/ |
static __inline__ enum htb_cmode |
htb_class_mode(struct htb_class *cl,long *diff) |
{ |
long toks; |
|
if ((toks = (cl->ctokens + *diff)) < ( |
#if HTB_HYSTERESIS |
cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : |
#endif |
0)) { |
*diff = -toks; |
return HTB_CANT_SEND; |
} |
if ((toks = (cl->tokens + *diff)) >= ( |
#if HTB_HYSTERESIS |
cl->cmode == HTB_CAN_SEND ? -cl->buffer : |
#endif |
0)) |
return HTB_CAN_SEND; |
|
*diff = -toks; |
return HTB_MAY_BORROW; |
} |
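/* Worked example for htb_class_mode (with HTB_HYSTERESIS enabled): say |
* cbuffer=2000, ctokens=-1500 and diff=1000; then toks=-500, which is |
* still above -cbuffer, so the ceil bucket does not force HTB_CANT_SEND. |
* If the rate bucket test then fails as well, the class may only borrow |
* and *diff is left holding -toks, i.e. how much idle time is still |
* missing before the mode can change again. */ |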
|
/** |
* htb_change_class_mode - changes the class's mode |
* |
* This should be the only way to change a class's mode under normal |
* circumstances. The routine updates the feed list linkage, changes the |
* mode and adds the class to the wait event queue if appropriate. The new |
* mode should differ from the old one and cl->pq_key has to be valid if |
* changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree). |
*/ |
static void |
htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff) |
{ |
enum htb_cmode new_mode = htb_class_mode(cl,diff); |
|
HTB_CHCL(cl); |
HTB_DBG(7,1,"htb_chging_clmode %d->%d cl=%X\n",cl->cmode,new_mode,cl->classid); |
|
if (new_mode == cl->cmode) |
return; |
|
if (cl->prio_activity) { /* not necessary: speed optimization */ |
if (cl->cmode != HTB_CANT_SEND) |
htb_deactivate_prios(q,cl); |
cl->cmode = new_mode; |
if (new_mode != HTB_CANT_SEND) |
htb_activate_prios(q,cl); |
} else |
cl->cmode = new_mode; |
} |
|
/** |
* htb_activate - inserts leaf cl into appropriate active feeds |
* |
* The routine learns the (new) priority of the leaf and activates the |
* feed chain for that prio. It can safely be called on an already active |
* leaf. It also adds the leaf to the drop list. |
*/ |
static __inline__ void htb_activate(struct htb_sched *q,struct htb_class *cl) |
{ |
BUG_TRAP(!cl->level && cl->un.leaf.q && cl->un.leaf.q->q.qlen); |
HTB_CHCL(cl); |
if (!cl->prio_activity) { |
cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio); |
htb_activate_prios(q,cl); |
list_add_tail(&cl->un.leaf.drop_list,q->drops+cl->un.leaf.aprio); |
} |
} |
|
/** |
* htb_deactivate - remove leaf cl from active feeds |
* |
* Make sure that the leaf is active. In other words, it can't be called |
* with a non-active leaf. It also removes the class from the drop list. |
*/ |
static __inline__ void |
htb_deactivate(struct htb_sched *q,struct htb_class *cl) |
{ |
BUG_TRAP(cl->prio_activity); |
HTB_CHCL(cl); |
htb_deactivate_prios(q,cl); |
cl->prio_activity = 0; |
list_del_init(&cl->un.leaf.drop_list); |
} |
|
static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) |
{ |
struct htb_sched *q = (struct htb_sched *)sch->data; |
struct htb_class *cl = htb_classify(skb,sch); |
|
if (cl == HTB_DIRECT || !cl) { |
/* enqueue to helper queue */ |
if (q->direct_queue.qlen < q->direct_qlen && cl) { |
__skb_queue_tail(&q->direct_queue, skb); |
q->direct_pkts++; |
} else { |
kfree_skb (skb); |
sch->stats.drops++; |
return NET_XMIT_DROP; |
} |
} else if (cl->un.leaf.q->enqueue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) { |
sch->stats.drops++; |
cl->stats.drops++; |
return NET_XMIT_DROP; |
} else { |
cl->stats.packets++; cl->stats.bytes += skb->len; |
htb_activate (q,cl); |
} |
|
sch->q.qlen++; |
sch->stats.packets++; sch->stats.bytes += skb->len; |
HTB_DBG(1,1,"htb_enq_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb); |
return NET_XMIT_SUCCESS; |
} |
|
/* TODO: requeuing packet charges it to policers again !! */ |
static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch) |
{ |
struct htb_sched *q = (struct htb_sched *)sch->data; |
struct htb_class *cl = htb_classify(skb,sch); |
struct sk_buff *tskb; |
|
if (cl == HTB_DIRECT || !cl) { |
/* enqueue to helper queue */ |
if (q->direct_queue.qlen < q->direct_qlen && cl) { |
__skb_queue_head(&q->direct_queue, skb); |
} else { |
__skb_queue_head(&q->direct_queue, skb); |
tskb = __skb_dequeue_tail(&q->direct_queue); |
kfree_skb (tskb); |
sch->stats.drops++; |
return NET_XMIT_CN; |
} |
} else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) { |
sch->stats.drops++; |
cl->stats.drops++; |
return NET_XMIT_DROP; |
} else |
htb_activate (q,cl); |
|
sch->q.qlen++; |
HTB_DBG(1,1,"htb_req_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb); |
return NET_XMIT_SUCCESS; |
} |
|
static void htb_timer(unsigned long arg) |
{ |
struct Qdisc *sch = (struct Qdisc*)arg; |
sch->flags &= ~TCQ_F_THROTTLED; |
wmb(); |
netif_schedule(sch->dev); |
} |
|
#ifdef HTB_RATECM |
#define RT_GEN(D,R) R+=D-(R/HTB_EWMAC);D=0 |
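/* RT_GEN keeps an exponentially weighted moving average: each time a |
* class's bucket is visited, R is decayed by 1/HTB_EWMAC and the bytes or |
* packets accumulated in D since the last visit are added, then D is |
* cleared. htb_dump_class later divides by HTB_EWMAC*HTB_HSIZE to report |
* the average bps/pps. */ |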
static void htb_rate_timer(unsigned long arg) |
{ |
struct Qdisc *sch = (struct Qdisc*)arg; |
struct htb_sched *q = (struct htb_sched *)sch->data; |
struct list_head *p; |
|
/* lock queue so that we can muck with it */ |
HTB_QLOCK(sch); |
HTB_DBG(10,1,"htb_rttmr j=%ld\n",jiffies); |
|
q->rttim.expires = jiffies + HZ; |
add_timer(&q->rttim); |
|
/* scan and recompute one bucket at a time */ |
if (++q->recmp_bucket >= HTB_HSIZE) |
q->recmp_bucket = 0; |
list_for_each (p,q->hash+q->recmp_bucket) { |
struct htb_class *cl = list_entry(p,struct htb_class,hlist); |
HTB_DBG(10,2,"htb_rttmr_cl cl=%X sbyte=%lu spkt=%lu\n", |
cl->classid,cl->sum_bytes,cl->sum_packets); |
RT_GEN (cl->sum_bytes,cl->rate_bytes); |
RT_GEN (cl->sum_packets,cl->rate_packets); |
} |
HTB_QUNLOCK(sch); |
} |
#endif |
|
/** |
* htb_charge_class - charges amount "bytes" to leaf and ancestors |
* |
* The routine assumes that a packet "bytes" long was dequeued from leaf cl |
* borrowing from "level". It accounts the bytes to the ceil leaky bucket |
* for the leaf and all ancestors, and to the rate bucket for ancestors at |
* levels "level" and higher. It also handles a possible change of mode |
* resulting from the update. Note that the mode can also improve here |
* (MAY_BORROW to CAN_SEND) because we can use a more precise clock than |
* the event queue; in that case the class is removed from the event queue |
* first. |
*/ |
static void htb_charge_class(struct htb_sched *q,struct htb_class *cl, |
int level,int bytes) |
{ |
long toks,diff; |
enum htb_cmode old_mode; |
HTB_DBG(5,1,"htb_chrg_cl cl=%X lev=%d len=%d\n",cl->classid,level,bytes); |
|
#define HTB_ACCNT(T,B,R) toks = diff + cl->T; \ |
if (toks > cl->B) toks = cl->B; \ |
toks -= L2T(cl, cl->R, bytes); \ |
if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \ |
cl->T = toks |
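/* HTB_ACCNT(T,B,R) is the leaky bucket update used twice below: add the |
* elapsed time "diff" to bucket T, cap it at depth B, subtract the cost |
* of "bytes" taken from rate table R, and never let the bucket fall below |
* -mbuffer so a single huge packet cannot stall the class for longer than |
* the configured maximum wait. */ |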
|
while (cl) { |
HTB_CHCL(cl); |
diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer, 0); |
#ifdef HTB_DEBUG |
if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) { |
if (net_ratelimit()) |
printk(KERN_ERR "HTB: bad diff in charge, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n", |
cl->classid, diff, |
(unsigned long long) q->now, |
(unsigned long long) cl->t_c, |
q->jiffies); |
diff = 1000; |
} |
#endif |
if (cl->level >= level) { |
if (cl->level == level) cl->xstats.lends++; |
HTB_ACCNT (tokens,buffer,rate); |
} else { |
cl->xstats.borrows++; |
cl->tokens += diff; /* we moved t_c; update tokens */ |
} |
HTB_ACCNT (ctokens,cbuffer,ceil); |
cl->t_c = q->now; |
HTB_DBG(5,2,"htb_chrg_clp cl=%X diff=%ld tok=%ld ctok=%ld\n",cl->classid,diff,cl->tokens,cl->ctokens); |
|
old_mode = cl->cmode; diff = 0; |
htb_change_class_mode(q,cl,&diff); |
if (old_mode != cl->cmode) { |
if (old_mode != HTB_CAN_SEND) |
htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level); |
if (cl->cmode != HTB_CAN_SEND) |
htb_add_to_wait_tree (q,cl,diff,1); |
} |
|
#ifdef HTB_RATECM |
/* update rate counters */ |
cl->sum_bytes += bytes; cl->sum_packets++; |
#endif |
|
/* update byte stats except for leaves which are already updated */ |
if (cl->level) { |
cl->stats.bytes += bytes; |
cl->stats.packets++; |
} |
cl = cl->parent; |
} |
} |
|
/** |
* htb_do_events - make mode changes to classes at the level |
* |
* Scans the event queue for pending events and applies them. Returns the |
* number of jiffies to the next pending event (0 if the pq is empty). |
* Note: applied are events with cl->pq_key <= q->jiffies. |
*/ |
static long htb_do_events(struct htb_sched *q,int level) |
{ |
int i; |
HTB_DBG(8,1,"htb_do_events l=%d root=%p rmask=%X\n", |
level,q->wait_pq[level].rb_node,q->row_mask[level]); |
for (i = 0; i < 500; i++) { |
struct htb_class *cl; |
long diff; |
rb_node_t *p = q->wait_pq[level].rb_node; |
if (!p) return 0; |
while (p->rb_left) p = p->rb_left; |
|
cl = rb_entry(p, struct htb_class, pq_node); |
if (time_after(cl->pq_key, q->jiffies)) { |
HTB_DBG(8,3,"htb_do_ev_ret delay=%ld\n",cl->pq_key - q->jiffies); |
return cl->pq_key - q->jiffies; |
} |
htb_safe_rb_erase(p,q->wait_pq+level); |
diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer, 0); |
#ifdef HTB_DEBUG |
if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) { |
if (net_ratelimit()) |
printk(KERN_ERR "HTB: bad diff in events, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n", |
cl->classid, diff, |
(unsigned long long) q->now, |
(unsigned long long) cl->t_c, |
q->jiffies); |
diff = 1000; |
} |
#endif |
htb_change_class_mode(q,cl,&diff); |
if (cl->cmode != HTB_CAN_SEND) |
htb_add_to_wait_tree (q,cl,diff,2); |
} |
if (net_ratelimit()) |
printk(KERN_WARNING "htb: too many events !\n"); |
return HZ/10; |
} |
|
/** |
* htb_lookup_leaf - returns next leaf class in DRR order |
* |
* Finds the leaf that the current feed pointer points to. |
*/ |
static struct htb_class * |
htb_lookup_leaf(rb_root_t *tree,int prio,rb_node_t **pptr) |
{ |
int i; |
struct { |
rb_node_t *root; |
rb_node_t **pptr; |
} stk[TC_HTB_MAXDEPTH],*sp = stk; |
|
BUG_TRAP(tree->rb_node); |
sp->root = tree->rb_node; |
sp->pptr = pptr; |
|
for (i = 0; i < 65535; i++) { |
if (!*sp->pptr) { /* we are at right end; rewind & go up */ |
*sp->pptr = sp->root; |
while ((*sp->pptr)->rb_left) |
*sp->pptr = (*sp->pptr)->rb_left; |
if (sp > stk) { |
sp--; |
BUG_TRAP(*sp->pptr); if(!*sp->pptr) return NULL; |
htb_next_rb_node (sp->pptr); |
} |
} else { |
struct htb_class *cl; |
cl = rb_entry(*sp->pptr,struct htb_class,node[prio]); |
HTB_CHCL(cl); |
if (!cl->level) |
return cl; |
(++sp)->root = cl->un.inner.feed[prio].rb_node; |
sp->pptr = cl->un.inner.ptr+prio; |
} |
} |
BUG_TRAP(0); |
return NULL; |
} |
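/* Note on htb_lookup_leaf: the walk over the nested feed trees is |
* iterative; stk[] keeps one (subtree root, saved position) pair per |
* level, and because pptr points at the persistent per-class DRR |
* pointers the traversal resumes where the previous dequeue left off |
* rather than restarting at the leftmost leaf. */ |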
|
/* dequeues a packet at the given priority and level; call only if |
you are sure that there is an active class at prio/level */ |
static struct sk_buff * |
htb_dequeue_tree(struct htb_sched *q,int prio,int level) |
{ |
struct sk_buff *skb = NULL; |
struct htb_class *cl,*start; |
/* look initial class up in the row */ |
start = cl = htb_lookup_leaf (q->row[level]+prio,prio,q->ptr[level]+prio); |
|
do { |
next: |
BUG_TRAP(cl); |
if (!cl) return NULL; |
HTB_DBG(4,1,"htb_deq_tr prio=%d lev=%d cl=%X defic=%d\n", |
prio,level,cl->classid,cl->un.leaf.deficit[level]); |
|
/* the class can be empty - it is unlikely but can happen if the leaf |
qdisc drops packets in its enqueue routine or if someone used a |
graft operation on the leaf since the last dequeue; |
simply deactivate and skip such a class */ |
if (unlikely(cl->un.leaf.q->q.qlen == 0)) { |
struct htb_class *next; |
htb_deactivate(q,cl); |
|
/* row/level might become empty */ |
if ((q->row_mask[level] & (1 << prio)) == 0) |
return NULL; |
|
next = htb_lookup_leaf (q->row[level]+prio, |
prio,q->ptr[level]+prio); |
if (cl == start) /* fix start if we just deleted it */ |
start = next; |
cl = next; |
goto next; |
} |
|
if (likely((skb = cl->un.leaf.q->dequeue(cl->un.leaf.q)) != NULL)) |
break; |
if (!cl->warned) { |
printk(KERN_WARNING "htb: class %X isn't work conserving ?!\n",cl->classid); |
cl->warned = 1; |
} |
q->nwc_hit++; |
htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio); |
cl = htb_lookup_leaf (q->row[level]+prio,prio,q->ptr[level]+prio); |
} while (cl != start); |
|
if (likely(skb != NULL)) { |
if ((cl->un.leaf.deficit[level] -= skb->len) < 0) { |
HTB_DBG(4,2,"htb_next_cl oldptr=%p quant_add=%d\n", |
level?cl->parent->un.inner.ptr[prio]:q->ptr[0][prio],cl->un.leaf.quantum); |
cl->un.leaf.deficit[level] += cl->un.leaf.quantum; |
htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio); |
} |
/* this used to be after charge_class but this ordering |
gives us slightly better performance */ |
if (!cl->un.leaf.q->q.qlen) |
htb_deactivate (q,cl); |
htb_charge_class (q,cl,level,skb->len); |
} |
return skb; |
} |
|
static void htb_delay_by(struct Qdisc *sch,long delay) |
{ |
struct htb_sched *q = (struct htb_sched *)sch->data; |
if (netif_queue_stopped(sch->dev)) return; |
if (delay <= 0) delay = 1; |
if (unlikely(delay > 5*HZ)) { |
if (net_ratelimit()) |
printk(KERN_INFO "HTB delay %ld > 5sec\n", delay); |
delay = 5*HZ; |
} |
/* why not use jiffies here? because expires can be in the past */ |
mod_timer(&q->timer, q->jiffies + delay); |
sch->flags |= TCQ_F_THROTTLED; |
sch->stats.overlimits++; |
HTB_DBG(3,1,"htb_deq t_delay=%ld\n",delay); |
} |
|
static struct sk_buff *htb_dequeue(struct Qdisc *sch) |
{ |
struct sk_buff *skb = NULL; |
struct htb_sched *q = (struct htb_sched *)sch->data; |
int level; |
long min_delay; |
#ifdef HTB_DEBUG |
int evs_used = 0; |
#endif |
|
q->jiffies = jiffies; |
HTB_DBG(3,1,"htb_deq dircnt=%d qlen=%d\n",skb_queue_len(&q->direct_queue), |
sch->q.qlen); |
|
/* try to dequeue direct packets as high prio (!) to minimize cpu work */ |
if ((skb = __skb_dequeue(&q->direct_queue)) != NULL) { |
sch->flags &= ~TCQ_F_THROTTLED; |
sch->q.qlen--; |
return skb; |
} |
|
if (!sch->q.qlen) goto fin; |
PSCHED_GET_TIME(q->now); |
|
min_delay = LONG_MAX; |
q->nwc_hit = 0; |
for (level = 0; level < TC_HTB_MAXDEPTH; level++) { |
/* common case optimization - skip event handler quickly */ |
int m; |
long delay; |
if (time_after_eq(q->jiffies, q->near_ev_cache[level])) { |
delay = htb_do_events(q,level); |
q->near_ev_cache[level] = q->jiffies + (delay ? delay : HZ); |
#ifdef HTB_DEBUG |
evs_used++; |
#endif |
} else |
delay = q->near_ev_cache[level] - q->jiffies; |
|
if (delay && min_delay > delay) |
min_delay = delay; |
m = ~q->row_mask[level]; |
while (m != (int)(-1)) { |
int prio = ffz (m); |
m |= 1 << prio; |
skb = htb_dequeue_tree(q,prio,level); |
if (likely(skb != NULL)) { |
sch->q.qlen--; |
sch->flags &= ~TCQ_F_THROTTLED; |
goto fin; |
} |
} |
} |
#ifdef HTB_DEBUG |
if (!q->nwc_hit && min_delay >= 10*HZ && net_ratelimit()) { |
if (min_delay == LONG_MAX) { |
printk(KERN_ERR "HTB: dequeue bug (%d,%lu,%lu), report it please !\n", |
evs_used,q->jiffies,jiffies); |
htb_debug_dump(q); |
} else |
printk(KERN_WARNING "HTB: mindelay=%ld, some class has " |
"too small rate\n",min_delay); |
} |
#endif |
htb_delay_by (sch,min_delay > 5*HZ ? 5*HZ : min_delay); |
fin: |
HTB_DBG(3,1,"htb_deq_end %s j=%lu skb=%p\n",sch->dev->name,q->jiffies,skb); |
return skb; |
} |
|
/* try to drop from each class (by prio) until one succeeds */ |
static unsigned int htb_drop(struct Qdisc* sch) |
{ |
struct htb_sched *q = (struct htb_sched *)sch->data; |
int prio; |
|
for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) { |
struct list_head *p; |
list_for_each (p,q->drops+prio) { |
struct htb_class *cl = list_entry(p, struct htb_class, |
un.leaf.drop_list); |
unsigned int len; |
if (cl->un.leaf.q->ops->drop && |
(len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) { |
sch->q.qlen--; |
if (!cl->un.leaf.q->q.qlen) |
htb_deactivate (q,cl); |
return len; |
} |
} |
} |
return 0; |
} |
|
/* reset all classes */ |
/* always called under BH & queue lock */ |
static void htb_reset(struct Qdisc* sch) |
{ |
struct htb_sched *q = (struct htb_sched *)sch->data; |
int i; |
HTB_DBG(0,1,"htb_reset sch=%p, handle=%X\n",sch,sch->handle); |
|
for (i = 0; i < HTB_HSIZE; i++) { |
struct list_head *p; |
list_for_each (p,q->hash+i) { |
struct htb_class *cl = list_entry(p,struct htb_class,hlist); |
if (cl->level) |
memset(&cl->un.inner,0,sizeof(cl->un.inner)); |
else { |
if (cl->un.leaf.q) |
qdisc_reset(cl->un.leaf.q); |
INIT_LIST_HEAD(&cl->un.leaf.drop_list); |
} |
cl->prio_activity = 0; |
cl->cmode = HTB_CAN_SEND; |
#ifdef HTB_DEBUG |
cl->pq_node.rb_color = -1; |
memset(cl->node,255,sizeof(cl->node)); |
#endif |
|
} |
} |
sch->flags &= ~TCQ_F_THROTTLED; |
del_timer(&q->timer); |
__skb_queue_purge(&q->direct_queue); |
sch->q.qlen = 0; |
memset(q->row,0,sizeof(q->row)); |
memset(q->row_mask,0,sizeof(q->row_mask)); |
memset(q->wait_pq,0,sizeof(q->wait_pq)); |
memset(q->ptr,0,sizeof(q->ptr)); |
for (i = 0; i < TC_HTB_NUMPRIO; i++) |
INIT_LIST_HEAD(q->drops+i); |
} |
|
static int htb_init(struct Qdisc *sch, struct rtattr *opt) |
{ |
struct htb_sched *q = (struct htb_sched*)sch->data; |
struct rtattr *tb[TCA_HTB_INIT]; |
struct tc_htb_glob *gopt; |
int i; |
#ifdef HTB_DEBUG |
printk(KERN_INFO "HTB init, kernel part version %d.%d\n", |
HTB_VER >> 16,HTB_VER & 0xffff); |
#endif |
if (!opt || rtattr_parse(tb, TCA_HTB_INIT, RTA_DATA(opt), RTA_PAYLOAD(opt)) || |
tb[TCA_HTB_INIT-1] == NULL || |
RTA_PAYLOAD(tb[TCA_HTB_INIT-1]) < sizeof(*gopt)) { |
printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n"); |
return -EINVAL; |
} |
gopt = RTA_DATA(tb[TCA_HTB_INIT-1]); |
if (gopt->version != HTB_VER >> 16) { |
printk(KERN_ERR "HTB: need tc/htb version %d (minor is %d), you have %d\n", |
HTB_VER >> 16,HTB_VER & 0xffff,gopt->version); |
return -EINVAL; |
} |
memset(q,0,sizeof(*q)); |
q->debug = gopt->debug; |
HTB_DBG(0,1,"htb_init sch=%p handle=%X r2q=%d\n",sch,sch->handle,gopt->rate2quantum); |
|
INIT_LIST_HEAD(&q->root); |
for (i = 0; i < HTB_HSIZE; i++) |
INIT_LIST_HEAD(q->hash+i); |
for (i = 0; i < TC_HTB_NUMPRIO; i++) |
INIT_LIST_HEAD(q->drops+i); |
|
init_timer(&q->timer); |
skb_queue_head_init(&q->direct_queue); |
|
q->direct_qlen = sch->dev->tx_queue_len; |
if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */ |
q->direct_qlen = 2; |
q->timer.function = htb_timer; |
q->timer.data = (unsigned long)sch; |
|
#ifdef HTB_RATECM |
init_timer(&q->rttim); |
q->rttim.function = htb_rate_timer; |
q->rttim.data = (unsigned long)sch; |
q->rttim.expires = jiffies + HZ; |
add_timer(&q->rttim); |
#endif |
if ((q->rate2quantum = gopt->rate2quantum) < 1) |
q->rate2quantum = 1; |
q->defcls = gopt->defcls; |
|
MOD_INC_USE_COUNT; |
return 0; |
} |
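/* For reference, a minimal userspace setup that exercises the fields read |
* above (illustrative only; exact option names depend on the tc version): |
* |
* tc qdisc add dev eth0 root handle 1: htb default 20 r2q 10 |
* tc class add dev eth0 parent 1: classid 1:1 htb rate 1mbit ceil 1mbit |
* tc class add dev eth0 parent 1:1 classid 1:20 htb rate 500kbit ceil 1mbit |
* |
* "default 20" becomes gopt->defcls and "r2q 10" becomes rate2quantum. */ |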
|
static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) |
{ |
struct htb_sched *q = (struct htb_sched*)sch->data; |
unsigned char *b = skb->tail; |
struct rtattr *rta; |
struct tc_htb_glob gopt; |
HTB_DBG(0,1,"htb_dump sch=%p, handle=%X\n",sch,sch->handle); |
/* stats */ |
HTB_QLOCK(sch); |
gopt.direct_pkts = q->direct_pkts; |
|
#ifdef HTB_DEBUG |
if (HTB_DBG_COND(0,2)) |
htb_debug_dump(q); |
#endif |
gopt.version = HTB_VER; |
gopt.rate2quantum = q->rate2quantum; |
gopt.defcls = q->defcls; |
gopt.debug = q->debug; |
rta = (struct rtattr*)b; |
RTA_PUT(skb, TCA_OPTIONS, 0, NULL); |
RTA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt); |
rta->rta_len = skb->tail - b; |
sch->stats.qlen = sch->q.qlen; |
RTA_PUT(skb, TCA_STATS, sizeof(sch->stats), &sch->stats); |
HTB_QUNLOCK(sch); |
return skb->len; |
rtattr_failure: |
HTB_QUNLOCK(sch); |
skb_trim(skb, skb->tail - skb->data); |
return -1; |
} |
|
static int htb_dump_class(struct Qdisc *sch, unsigned long arg, |
struct sk_buff *skb, struct tcmsg *tcm) |
{ |
#ifdef HTB_DEBUG |
struct htb_sched *q = (struct htb_sched*)sch->data; |
#endif |
struct htb_class *cl = (struct htb_class*)arg; |
unsigned char *b = skb->tail; |
struct rtattr *rta; |
struct tc_htb_opt opt; |
|
HTB_DBG(0,1,"htb_dump_class handle=%X clid=%X\n",sch->handle,cl->classid); |
|
HTB_QLOCK(sch); |
tcm->tcm_parent = cl->parent ? cl->parent->classid : TC_H_ROOT; |
tcm->tcm_handle = cl->classid; |
if (!cl->level && cl->un.leaf.q) { |
tcm->tcm_info = cl->un.leaf.q->handle; |
cl->stats.qlen = cl->un.leaf.q->q.qlen; |
} |
|
rta = (struct rtattr*)b; |
RTA_PUT(skb, TCA_OPTIONS, 0, NULL); |
|
memset (&opt,0,sizeof(opt)); |
|
opt.rate = cl->rate->rate; opt.buffer = cl->buffer; |
opt.ceil = cl->ceil->rate; opt.cbuffer = cl->cbuffer; |
opt.quantum = cl->un.leaf.quantum; opt.prio = cl->un.leaf.prio; |
opt.level = cl->level; |
RTA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt); |
rta->rta_len = skb->tail - b; |
|
#ifdef HTB_RATECM |
cl->stats.bps = cl->rate_bytes/(HTB_EWMAC*HTB_HSIZE); |
cl->stats.pps = cl->rate_packets/(HTB_EWMAC*HTB_HSIZE); |
#endif |
|
cl->xstats.tokens = cl->tokens; |
cl->xstats.ctokens = cl->ctokens; |
RTA_PUT(skb, TCA_STATS, sizeof(cl->stats), &cl->stats); |
RTA_PUT(skb, TCA_XSTATS, sizeof(cl->xstats), &cl->xstats); |
HTB_QUNLOCK(sch); |
return skb->len; |
rtattr_failure: |
HTB_QUNLOCK(sch); |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, |
struct Qdisc **old) |
{ |
struct htb_class *cl = (struct htb_class*)arg; |
|
if (cl && !cl->level) { |
if (new == NULL && (new = qdisc_create_dflt(sch->dev, |
&pfifo_qdisc_ops)) == NULL) |
return -ENOBUFS; |
sch_tree_lock(sch); |
if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) { |
if (cl->prio_activity) |
htb_deactivate ((struct htb_sched*)sch->data,cl); |
|
/* TODO: is this correct? Why doesn't CBQ do it? */ |
sch->q.qlen -= (*old)->q.qlen; |
qdisc_reset(*old); |
} |
sch_tree_unlock(sch); |
return 0; |
} |
return -ENOENT; |
} |
|
static struct Qdisc * htb_leaf(struct Qdisc *sch, unsigned long arg) |
{ |
struct htb_class *cl = (struct htb_class*)arg; |
return (cl && !cl->level) ? cl->un.leaf.q : NULL; |
} |
|
static unsigned long htb_get(struct Qdisc *sch, u32 classid) |
{ |
#ifdef HTB_DEBUG |
struct htb_sched *q = (struct htb_sched *)sch->data; |
#endif |
struct htb_class *cl = htb_find(classid,sch); |
HTB_DBG(0,1,"htb_get clid=%X q=%p cl=%p ref=%d\n",classid,q,cl,cl?cl->refcnt:0); |
if (cl) |
cl->refcnt++; |
return (unsigned long)cl; |
} |
|
static void htb_destroy_filters(struct tcf_proto **fl) |
{ |
struct tcf_proto *tp; |
|
while ((tp = *fl) != NULL) { |
*fl = tp->next; |
tcf_destroy(tp); |
} |
} |
|
static void htb_destroy_class(struct Qdisc* sch,struct htb_class *cl) |
{ |
struct htb_sched *q = (struct htb_sched *)sch->data; |
HTB_DBG(0,1,"htb_destrycls clid=%X ref=%d\n", cl?cl->classid:0,cl?cl->refcnt:0); |
if (!cl->level) { |
BUG_TRAP(cl->un.leaf.q); |
sch->q.qlen -= cl->un.leaf.q->q.qlen; |
qdisc_destroy(cl->un.leaf.q); |
} |
qdisc_put_rtab(cl->rate); |
qdisc_put_rtab(cl->ceil); |
|
#ifdef CONFIG_NET_ESTIMATOR |
qdisc_kill_estimator(&cl->stats); |
#endif |
htb_destroy_filters (&cl->filter_list); |
|
while (!list_empty(&cl->children)) |
htb_destroy_class (sch,list_entry(cl->children.next, |
struct htb_class,sibling)); |
|
/* note: this delete may happen twice (see htb_delete) */ |
list_del(&cl->hlist); |
list_del(&cl->sibling); |
|
if (cl->prio_activity) |
htb_deactivate (q,cl); |
|
if (cl->cmode != HTB_CAN_SEND) |
htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level); |
|
kfree(cl); |
} |
|
/* always called under BH & queue lock */ |
static void htb_destroy(struct Qdisc* sch) |
{ |
struct htb_sched *q = (struct htb_sched *)sch->data; |
HTB_DBG(0,1,"htb_destroy q=%p\n",q); |
|
del_timer_sync (&q->timer); |
#ifdef HTB_RATECM |
del_timer_sync (&q->rttim); |
#endif |
/* This line used to be after the htb_destroy_class call below |
and surprisingly it worked in 2.4. But it must precede it |
because filters need their target class alive to be able to call |
unbind_filter on it (without an Oops). */ |
htb_destroy_filters(&q->filter_list); |
|
while (!list_empty(&q->root)) |
htb_destroy_class (sch,list_entry(q->root.next, |
struct htb_class,sibling)); |
|
__skb_queue_purge(&q->direct_queue); |
MOD_DEC_USE_COUNT; |
} |
|
static int htb_delete(struct Qdisc *sch, unsigned long arg) |
{ |
struct htb_sched *q = (struct htb_sched *)sch->data; |
struct htb_class *cl = (struct htb_class*)arg; |
HTB_DBG(0,1,"htb_delete q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0); |
|
// TODO: why not allow deleting a subtree? references? does the |
// tc subsystem guarantee us that in htb_destroy it holds no class |
// refs, so that we can remove children safely there? |
if (!list_empty(&cl->children) || cl->filter_cnt) |
return -EBUSY; |
|
sch_tree_lock(sch); |
|
/* delete from hash and active; remainder in destroy_class */ |
list_del_init(&cl->hlist); |
if (cl->prio_activity) |
htb_deactivate (q,cl); |
|
if (--cl->refcnt == 0) |
htb_destroy_class(sch,cl); |
|
sch_tree_unlock(sch); |
return 0; |
} |
|
static void htb_put(struct Qdisc *sch, unsigned long arg) |
{ |
#ifdef HTB_DEBUG |
struct htb_sched *q = (struct htb_sched *)sch->data; |
#endif |
struct htb_class *cl = (struct htb_class*)arg; |
HTB_DBG(0,1,"htb_put q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0); |
|
if (--cl->refcnt == 0) |
htb_destroy_class(sch,cl); |
} |
|
static int htb_change_class(struct Qdisc *sch, u32 classid, |
u32 parentid, struct rtattr **tca, unsigned long *arg) |
{ |
int err = -EINVAL; |
struct htb_sched *q = (struct htb_sched *)sch->data; |
struct htb_class *cl = (struct htb_class*)*arg,*parent; |
struct rtattr *opt = tca[TCA_OPTIONS-1]; |
struct qdisc_rate_table *rtab = NULL, *ctab = NULL; |
struct rtattr *tb[TCA_HTB_RTAB]; |
struct tc_htb_opt *hopt; |
|
/* extract all subattrs from opt attr */ |
if (!opt || rtattr_parse(tb, TCA_HTB_RTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) || |
tb[TCA_HTB_PARMS-1] == NULL || |
RTA_PAYLOAD(tb[TCA_HTB_PARMS-1]) < sizeof(*hopt)) |
goto failure; |
|
parent = parentid == TC_H_ROOT ? NULL : htb_find (parentid,sch); |
|
hopt = RTA_DATA(tb[TCA_HTB_PARMS-1]); |
HTB_DBG(0,1,"htb_chg cl=%p(%X), clid=%X, parid=%X, opt/prio=%d, rate=%u, buff=%d, quant=%d\n", cl,cl?cl->classid:0,classid,parentid,(int)hopt->prio,hopt->rate.rate,hopt->buffer,hopt->quantum); |
rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB-1]); |
ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB-1]); |
if (!rtab || !ctab) goto failure; |
|
if (!cl) { /* new class */ |
struct Qdisc *new_q; |
/* check for valid classid */ |
if (!classid || TC_H_MAJ(classid^sch->handle) || htb_find(classid,sch)) |
goto failure; |
|
/* check maximal depth */ |
if (parent && parent->parent && parent->parent->level < 2) { |
printk(KERN_ERR "htb: tree is too deep\n"); |
goto failure; |
} |
err = -ENOBUFS; |
if ((cl = kmalloc(sizeof(*cl), GFP_KERNEL)) == NULL) |
goto failure; |
|
memset(cl, 0, sizeof(*cl)); |
cl->refcnt = 1; |
INIT_LIST_HEAD(&cl->sibling); |
INIT_LIST_HEAD(&cl->hlist); |
INIT_LIST_HEAD(&cl->children); |
INIT_LIST_HEAD(&cl->un.leaf.drop_list); |
#ifdef HTB_DEBUG |
cl->magic = HTB_CMAGIC; |
#endif |
|
/* create the leaf qdisc early because it uses kmalloc(GFP_KERNEL), |
which can't be used inside sch_tree_lock |
-- thanks to Karlis Peisenieks */ |
new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); |
sch_tree_lock(sch); |
if (parent && !parent->level) { |
/* turn parent into inner node */ |
sch->q.qlen -= parent->un.leaf.q->q.qlen; |
qdisc_destroy (parent->un.leaf.q); |
if (parent->prio_activity) |
htb_deactivate (q,parent); |
|
/* remove from evt list because of level change */ |
if (parent->cmode != HTB_CAN_SEND) { |
htb_safe_rb_erase(&parent->pq_node,q->wait_pq /*+0*/); |
parent->cmode = HTB_CAN_SEND; |
} |
parent->level = (parent->parent ? parent->parent->level |
: TC_HTB_MAXDEPTH) - 1; |
memset (&parent->un.inner,0,sizeof(parent->un.inner)); |
} |
/* leaf (we) needs elementary qdisc */ |
cl->un.leaf.q = new_q ? new_q : &noop_qdisc; |
|
cl->classid = classid; cl->parent = parent; |
|
/* set class to be in HTB_CAN_SEND state */ |
cl->tokens = hopt->buffer; |
cl->ctokens = hopt->cbuffer; |
cl->mbuffer = 60000000; /* 1min */ |
PSCHED_GET_TIME(cl->t_c); |
cl->cmode = HTB_CAN_SEND; |
|
/* attach to the hash list and parent's family */ |
list_add_tail(&cl->hlist, q->hash+htb_hash(classid)); |
list_add_tail(&cl->sibling, parent ? &parent->children : &q->root); |
#ifdef HTB_DEBUG |
{ |
int i; |
for (i = 0; i < TC_HTB_NUMPRIO; i++) cl->node[i].rb_color = -1; |
cl->pq_node.rb_color = -1; |
} |
#endif |
} else sch_tree_lock(sch); |
|
/* there used to be a nasty bug here: we have to check that the node |
is really a leaf before changing cl->un.leaf ! */ |
if (!cl->level) { |
cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum; |
if (!hopt->quantum && cl->un.leaf.quantum < 1000) { |
printk(KERN_WARNING "HTB: quantum of class %X is small. Consider r2q change.\n", cl->classid); |
cl->un.leaf.quantum = 1000; |
} |
if (!hopt->quantum && cl->un.leaf.quantum > 200000) { |
printk(KERN_WARNING "HTB: quantum of class %X is big. Consider r2q change.\n", cl->classid); |
cl->un.leaf.quantum = 200000; |
} |
if (hopt->quantum) |
cl->un.leaf.quantum = hopt->quantum; |
if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO) |
cl->un.leaf.prio = TC_HTB_NUMPRIO - 1; |
} |
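/* Example of the quantum computation above: rate.rate is expressed in |
* bytes per second, so a 1 Mbit class (125000 B/s) with r2q set to 10 |
* gets a quantum of 12500 bytes, well inside the 1000..200000 window |
* warned about above. */ |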
|
cl->buffer = hopt->buffer; |
cl->cbuffer = hopt->cbuffer; |
if (cl->rate) qdisc_put_rtab(cl->rate); cl->rate = rtab; |
if (cl->ceil) qdisc_put_rtab(cl->ceil); cl->ceil = ctab; |
sch_tree_unlock(sch); |
|
*arg = (unsigned long)cl; |
return 0; |
|
failure: |
if (rtab) qdisc_put_rtab(rtab); |
if (ctab) qdisc_put_rtab(ctab); |
return err; |
} |
|
static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg) |
{ |
struct htb_sched *q = (struct htb_sched *)sch->data; |
struct htb_class *cl = (struct htb_class *)arg; |
struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list; |
HTB_DBG(0,2,"htb_tcf q=%p clid=%X fref=%d fl=%p\n",q,cl?cl->classid:0,cl?cl->filter_cnt:q->filter_cnt,*fl); |
return fl; |
} |
|
static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent, |
u32 classid) |
{ |
struct htb_sched *q = (struct htb_sched *)sch->data; |
struct htb_class *cl = htb_find (classid,sch); |
HTB_DBG(0,2,"htb_bind q=%p clid=%X cl=%p fref=%d\n",q,classid,cl,cl?cl->filter_cnt:q->filter_cnt); |
/*if (cl && !cl->level) return 0; |
The line above used to be there to prevent attaching filters to |
leaves. But at least the tc_index filter uses this just to get the |
class for other reasons, so we have to allow it. |
---- |
19.6.2002 As Werner explained, this is OK - bind_filter is just |
another way to "lock" the class - unlike "get", this lock can |
be broken by the class during destroy, IIUC. |
*/ |
if (cl) |
cl->filter_cnt++; |
else |
q->filter_cnt++; |
return (unsigned long)cl; |
} |
|
static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg) |
{ |
struct htb_sched *q = (struct htb_sched *)sch->data; |
struct htb_class *cl = (struct htb_class *)arg; |
HTB_DBG(0,2,"htb_unbind q=%p cl=%p fref=%d\n",q,cl,cl?cl->filter_cnt:q->filter_cnt); |
if (cl) |
cl->filter_cnt--; |
else |
q->filter_cnt--; |
} |
|
static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) |
{ |
struct htb_sched *q = (struct htb_sched *)sch->data; |
int i; |
|
if (arg->stop) |
return; |
|
for (i = 0; i < HTB_HSIZE; i++) { |
struct list_head *p; |
list_for_each (p,q->hash+i) { |
struct htb_class *cl = list_entry(p,struct htb_class,hlist); |
if (arg->count < arg->skip) { |
arg->count++; |
continue; |
} |
if (arg->fn(sch, (unsigned long)cl, arg) < 0) { |
arg->stop = 1; |
return; |
} |
arg->count++; |
} |
} |
} |
|
static struct Qdisc_class_ops htb_class_ops = |
{ |
htb_graft, |
htb_leaf, |
htb_get, |
htb_put, |
htb_change_class, |
htb_delete, |
htb_walk, |
|
htb_find_tcf, |
htb_bind_filter, |
htb_unbind_filter, |
|
htb_dump_class, |
}; |
|
struct Qdisc_ops htb_qdisc_ops = |
{ |
NULL, |
&htb_class_ops, |
"htb", |
sizeof(struct htb_sched), |
|
htb_enqueue, |
htb_dequeue, |
htb_requeue, |
htb_drop, |
|
htb_init, |
htb_reset, |
htb_destroy, |
NULL /* htb_change */, |
|
htb_dump, |
}; |
|
#ifdef MODULE |
int init_module(void) |
{ |
return register_qdisc(&htb_qdisc_ops); |
} |
|
void cleanup_module(void) |
{ |
unregister_qdisc(&htb_qdisc_ops); |
} |
MODULE_LICENSE("GPL"); |
#endif |
/police.c
0,0 → 1,251
/* |
* net/sched/police.c Input police filter. |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
*/ |
|
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/config.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/netdevice.h> |
#include <linux/skbuff.h> |
#include <linux/rtnetlink.h> |
#include <linux/init.h> |
#include <linux/proc_fs.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
#define L2T(p,L) ((p)->R_tab->data[(L)>>(p)->R_tab->rate.cell_log]) |
#define L2T_P(p,L) ((p)->P_tab->data[(L)>>(p)->P_tab->rate.cell_log]) |
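/* L2T/L2T_P map a packet length to its cost in scheduler ticks via the |
* rate and peakrate tables supplied by userspace - the same table layout |
* as the qdisc_rate_table used by the schedulers above. */ |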
|
static u32 idx_gen; |
static struct tcf_police *tcf_police_ht[16]; |
/* Policer hash table lock */ |
static rwlock_t police_lock = RW_LOCK_UNLOCKED; |
|
/* Each policer is serialized by its individual spinlock */ |
|
static __inline__ unsigned tcf_police_hash(u32 index) |
{ |
return index&0xF; |
} |
|
static __inline__ struct tcf_police * tcf_police_lookup(u32 index) |
{ |
struct tcf_police *p; |
|
read_lock(&police_lock); |
for (p = tcf_police_ht[tcf_police_hash(index)]; p; p = p->next) { |
if (p->index == index) |
break; |
} |
read_unlock(&police_lock); |
return p; |
} |
|
static __inline__ u32 tcf_police_new_index(void) |
{ |
do { |
if (++idx_gen == 0) |
idx_gen = 1; |
} while (tcf_police_lookup(idx_gen)); |
|
return idx_gen; |
} |
|
|
void tcf_police_destroy(struct tcf_police *p) |
{ |
unsigned h = tcf_police_hash(p->index); |
struct tcf_police **p1p; |
|
for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->next) { |
if (*p1p == p) { |
write_lock_bh(&police_lock); |
*p1p = p->next; |
write_unlock_bh(&police_lock); |
#ifdef CONFIG_NET_ESTIMATOR |
qdisc_kill_estimator(&p->stats); |
#endif |
if (p->R_tab) |
qdisc_put_rtab(p->R_tab); |
if (p->P_tab) |
qdisc_put_rtab(p->P_tab); |
kfree(p); |
return; |
} |
} |
BUG_TRAP(0); |
} |
|
struct tcf_police * tcf_police_locate(struct rtattr *rta, struct rtattr *est) |
{ |
unsigned h; |
struct tcf_police *p; |
struct rtattr *tb[TCA_POLICE_MAX]; |
struct tc_police *parm; |
|
if (rtattr_parse(tb, TCA_POLICE_MAX, RTA_DATA(rta), RTA_PAYLOAD(rta)) < 0) |
return NULL; |
|
if (tb[TCA_POLICE_TBF-1] == NULL) |
return NULL; |
|
parm = RTA_DATA(tb[TCA_POLICE_TBF-1]); |
|
if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) { |
p->refcnt++; |
return p; |
} |
|
p = kmalloc(sizeof(*p), GFP_KERNEL); |
if (p == NULL) |
return NULL; |
|
memset(p, 0, sizeof(*p)); |
p->refcnt = 1; |
spin_lock_init(&p->lock); |
p->stats.lock = &p->lock; |
if (parm->rate.rate) { |
if ((p->R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1])) == NULL) |
goto failure; |
if (parm->peakrate.rate && |
(p->P_tab = qdisc_get_rtab(&parm->peakrate, tb[TCA_POLICE_PEAKRATE-1])) == NULL) |
goto failure; |
} |
if (tb[TCA_POLICE_RESULT-1]) |
p->result = *(int*)RTA_DATA(tb[TCA_POLICE_RESULT-1]); |
#ifdef CONFIG_NET_ESTIMATOR |
if (tb[TCA_POLICE_AVRATE-1]) |
p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]); |
#endif |
p->toks = p->burst = parm->burst; |
p->mtu = parm->mtu; |
if (p->mtu == 0) { |
p->mtu = ~0; |
if (p->R_tab) |
p->mtu = 255<<p->R_tab->rate.cell_log; |
} |
if (p->P_tab) |
p->ptoks = L2T_P(p, p->mtu); |
PSCHED_GET_TIME(p->t_c); |
p->index = parm->index ? : tcf_police_new_index(); |
p->action = parm->action; |
#ifdef CONFIG_NET_ESTIMATOR |
if (est) |
qdisc_new_estimator(&p->stats, est); |
#endif |
h = tcf_police_hash(p->index); |
write_lock_bh(&police_lock); |
p->next = tcf_police_ht[h]; |
tcf_police_ht[h] = p; |
write_unlock_bh(&police_lock); |
return p; |
|
failure: |
if (p->R_tab) |
qdisc_put_rtab(p->R_tab); |
kfree(p); |
return NULL; |
} |
|
int tcf_police(struct sk_buff *skb, struct tcf_police *p) |
{ |
psched_time_t now; |
long toks; |
long ptoks = 0; |
|
spin_lock(&p->lock); |
|
p->stats.bytes += skb->len; |
p->stats.packets++; |
|
#ifdef CONFIG_NET_ESTIMATOR |
if (p->ewma_rate && p->stats.bps >= p->ewma_rate) { |
p->stats.overlimits++; |
spin_unlock(&p->lock); |
return p->action; |
} |
#endif |
|
if (skb->len <= p->mtu) { |
if (p->R_tab == NULL) { |
spin_unlock(&p->lock); |
return p->result; |
} |
|
PSCHED_GET_TIME(now); |
|
toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst, 0); |
|
if (p->P_tab) { |
ptoks = toks + p->ptoks; |
if (ptoks > (long)L2T_P(p, p->mtu)) |
ptoks = (long)L2T_P(p, p->mtu); |
ptoks -= L2T_P(p, skb->len); |
} |
toks += p->toks; |
if (toks > (long)p->burst) |
toks = p->burst; |
toks -= L2T(p, skb->len); |
|
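/* both buckets were clamped from above, so OR-ing the two signed longs |
is a cheap way to test that neither went negative: the result has the |
sign bit set iff at least one bucket is overdrawn */ |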
if ((toks|ptoks) >= 0) { |
p->t_c = now; |
p->toks = toks; |
p->ptoks = ptoks; |
spin_unlock(&p->lock); |
return p->result; |
} |
} |
|
p->stats.overlimits++; |
spin_unlock(&p->lock); |
return p->action; |
} |
|
int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p) |
{ |
unsigned char *b = skb->tail; |
struct tc_police opt; |
|
opt.index = p->index; |
opt.action = p->action; |
opt.mtu = p->mtu; |
opt.burst = p->burst; |
if (p->R_tab) |
opt.rate = p->R_tab->rate; |
else |
memset(&opt.rate, 0, sizeof(opt.rate)); |
if (p->P_tab) |
opt.peakrate = p->P_tab->rate; |
else |
memset(&opt.peakrate, 0, sizeof(opt.peakrate)); |
RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); |
if (p->result) |
RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result); |
#ifdef CONFIG_NET_ESTIMATOR |
if (p->ewma_rate) |
RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate); |
#endif |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
/cls_fw.c
0,0 → 1,379
/* |
* net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class. |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
* |
* Changes: |
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one |
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel). |
*/ |
|
#include <linux/config.h> |
#include <linux/module.h> |
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/if_ether.h> |
#include <linux/inet.h> |
#include <linux/netdevice.h> |
#include <linux/etherdevice.h> |
#include <linux/notifier.h> |
#include <linux/netfilter.h> |
#include <net/ip.h> |
#include <net/route.h> |
#include <linux/skbuff.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
struct fw_head |
{ |
struct fw_filter *ht[256]; |
}; |
|
struct fw_filter |
{ |
struct fw_filter *next; |
u32 id; |
struct tcf_result res; |
#ifdef CONFIG_NET_CLS_POLICE |
struct tcf_police *police; |
#endif |
}; |
|
static __inline__ int fw_hash(u32 handle) |
{ |
return handle&0xFF; |
} |
|
static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, |
struct tcf_result *res) |
{ |
struct fw_head *head = (struct fw_head*)tp->root; |
struct fw_filter *f; |
#ifdef CONFIG_NETFILTER |
u32 id = skb->nfmark; |
#else |
u32 id = 0; |
#endif |
|
if (head == NULL) |
goto old_method; |
|
for (f=head->ht[fw_hash(id)]; f; f=f->next) { |
if (f->id == id) { |
*res = f->res; |
#ifdef CONFIG_NET_CLS_POLICE |
if (f->police) |
return tcf_police(skb, f->police); |
#endif |
return 0; |
} |
} |
return -1; |
|
old_method: |
if (id && (TC_H_MAJ(id) == 0 || |
!(TC_H_MAJ(id^tp->q->handle)))) { |
res->classid = id; |
res->class = 0; |
return 0; |
} |
return -1; |
} |
|
static unsigned long fw_get(struct tcf_proto *tp, u32 handle) |
{ |
struct fw_head *head = (struct fw_head*)tp->root; |
struct fw_filter *f; |
|
if (head == NULL) |
return 0; |
|
for (f=head->ht[fw_hash(handle)]; f; f=f->next) { |
if (f->id == handle) |
return (unsigned long)f; |
} |
return 0; |
} |
|
static void fw_put(struct tcf_proto *tp, unsigned long f) |
{ |
} |
|
static int fw_init(struct tcf_proto *tp) |
{ |
MOD_INC_USE_COUNT; |
return 0; |
} |
|
static void fw_destroy(struct tcf_proto *tp) |
{ |
struct fw_head *head = (struct fw_head*)xchg(&tp->root, NULL); |
struct fw_filter *f; |
int h; |
|
if (head == NULL) { |
MOD_DEC_USE_COUNT; |
return; |
} |
|
for (h=0; h<256; h++) { |
while ((f=head->ht[h]) != NULL) { |
unsigned long cl; |
head->ht[h] = f->next; |
|
if ((cl = __cls_set_class(&f->res.class, 0)) != 0) |
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); |
#ifdef CONFIG_NET_CLS_POLICE |
tcf_police_release(f->police); |
#endif |
kfree(f); |
} |
} |
kfree(head); |
MOD_DEC_USE_COUNT; |
} |
|
static int fw_delete(struct tcf_proto *tp, unsigned long arg) |
{ |
struct fw_head *head = (struct fw_head*)tp->root; |
struct fw_filter *f = (struct fw_filter*)arg; |
struct fw_filter **fp; |
|
if (head == NULL || f == NULL) |
return -EINVAL; |
|
for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) { |
if (*fp == f) { |
unsigned long cl; |
|
tcf_tree_lock(tp); |
*fp = f->next; |
tcf_tree_unlock(tp); |
|
if ((cl = cls_set_class(tp, &f->res.class, 0)) != 0) |
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); |
#ifdef CONFIG_NET_CLS_POLICE |
tcf_police_release(f->police); |
#endif |
kfree(f); |
return 0; |
} |
} |
return -EINVAL; |
} |
|
static int fw_change(struct tcf_proto *tp, unsigned long base, |
u32 handle, |
struct rtattr **tca, |
unsigned long *arg) |
{ |
struct fw_head *head = (struct fw_head*)tp->root; |
struct fw_filter *f; |
struct rtattr *opt = tca[TCA_OPTIONS-1]; |
struct rtattr *tb[TCA_FW_MAX]; |
int err; |
|
if (!opt) |
return handle ? -EINVAL : 0; |
|
if (rtattr_parse(tb, TCA_FW_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0) |
return -EINVAL; |
|
if ((f = (struct fw_filter*)*arg) != NULL) { |
/* Node exists: adjust only classid */ |
|
if (f->id != handle && handle) |
return -EINVAL; |
if (tb[TCA_FW_CLASSID-1]) { |
unsigned long cl; |
|
f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]); |
cl = tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid); |
cl = cls_set_class(tp, &f->res.class, cl); |
if (cl) |
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); |
} |
#ifdef CONFIG_NET_CLS_POLICE |
if (tb[TCA_FW_POLICE-1]) { |
struct tcf_police *police = tcf_police_locate(tb[TCA_FW_POLICE-1], tca[TCA_RATE-1]); |
|
tcf_tree_lock(tp); |
police = xchg(&f->police, police); |
tcf_tree_unlock(tp); |
|
tcf_police_release(police); |
} |
#endif |
return 0; |
} |
|
if (!handle) |
return -EINVAL; |
|
if (head == NULL) { |
head = kmalloc(sizeof(struct fw_head), GFP_KERNEL); |
if (head == NULL) |
return -ENOBUFS; |
memset(head, 0, sizeof(*head)); |
|
tcf_tree_lock(tp); |
tp->root = head; |
tcf_tree_unlock(tp); |
} |
|
f = kmalloc(sizeof(struct fw_filter), GFP_KERNEL); |
if (f == NULL) |
return -ENOBUFS; |
memset(f, 0, sizeof(*f)); |
|
f->id = handle; |
|
if (tb[TCA_FW_CLASSID-1]) { |
err = -EINVAL; |
if (RTA_PAYLOAD(tb[TCA_FW_CLASSID-1]) != 4) |
goto errout; |
f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]); |
cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); |
} |
|
#ifdef CONFIG_NET_CLS_POLICE |
if (tb[TCA_FW_POLICE-1]) |
f->police = tcf_police_locate(tb[TCA_FW_POLICE-1], tca[TCA_RATE-1]); |
#endif |
|
f->next = head->ht[fw_hash(handle)]; |
tcf_tree_lock(tp); |
head->ht[fw_hash(handle)] = f; |
tcf_tree_unlock(tp); |
|
*arg = (unsigned long)f; |
return 0; |
|
errout: |
if (f) |
kfree(f); |
return err; |
} |
|
static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) |
{ |
struct fw_head *head = (struct fw_head*)tp->root; |
int h; |
|
if (head == NULL) |
arg->stop = 1; |
|
if (arg->stop) |
return; |
|
for (h = 0; h < 256; h++) { |
struct fw_filter *f; |
|
for (f = head->ht[h]; f; f = f->next) { |
if (arg->count < arg->skip) { |
arg->count++; |
continue; |
} |
if (arg->fn(tp, (unsigned long)f, arg) < 0) { |
arg->stop = 1; |
break; |
} |
arg->count++; |
} |
} |
} |
|
static int fw_dump(struct tcf_proto *tp, unsigned long fh, |
struct sk_buff *skb, struct tcmsg *t) |
{ |
struct fw_filter *f = (struct fw_filter*)fh; |
unsigned char *b = skb->tail; |
struct rtattr *rta; |
|
if (f == NULL) |
return skb->len; |
|
t->tcm_handle = f->id; |
|
if (!f->res.classid |
#ifdef CONFIG_NET_CLS_POLICE |
&& !f->police |
#endif |
) |
return skb->len; |
|
rta = (struct rtattr*)b; |
RTA_PUT(skb, TCA_OPTIONS, 0, NULL); |
|
if (f->res.classid) |
RTA_PUT(skb, TCA_FW_CLASSID, 4, &f->res.classid); |
#ifdef CONFIG_NET_CLS_POLICE |
if (f->police) { |
struct rtattr * p_rta = (struct rtattr*)skb->tail; |
|
RTA_PUT(skb, TCA_FW_POLICE, 0, NULL); |
|
if (tcf_police_dump(skb, f->police) < 0) |
goto rtattr_failure; |
|
p_rta->rta_len = skb->tail - (u8*)p_rta; |
} |
#endif |
|
rta->rta_len = skb->tail - b; |
#ifdef CONFIG_NET_CLS_POLICE |
if (f->police) { |
if (qdisc_copy_stats(skb, &f->police->stats)) |
goto rtattr_failure; |
} |
#endif |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
struct tcf_proto_ops cls_fw_ops = { |
NULL, |
"fw", |
fw_classify, |
fw_init, |
fw_destroy, |
|
fw_get, |
fw_put, |
fw_change, |
fw_delete, |
fw_walk, |
fw_dump |
}; |
|
#ifdef MODULE |
int init_module(void) |
{ |
return register_tcf_proto_ops(&cls_fw_ops); |
} |
|
void cleanup_module(void) |
{ |
unregister_tcf_proto_ops(&cls_fw_ops); |
} |
#endif |
MODULE_LICENSE("GPL"); |
/sch_atm.c
0,0 → 1,718
/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */ |
|
/* Written 1998-2000 by Werner Almesberger, EPFL ICA */ |
|
|
#include <linux/config.h> |
#include <linux/module.h> |
#include <linux/string.h> |
#include <linux/errno.h> |
#include <linux/skbuff.h> |
#include <linux/interrupt.h> |
#include <linux/atmdev.h> |
#include <linux/atmclip.h> |
#include <linux/netdevice.h> |
#include <linux/rtnetlink.h> |
#include <linux/file.h> /* for fput */ |
#include <net/pkt_sched.h> |
#include <net/sock.h> |
|
|
extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */ |
#define sockfd_put(sock) fput((sock)->file) /* @@@ copied because it's |
__inline__ in socket.c */ |
|
|
#if 0 /* control */ |
#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) |
#else |
#define DPRINTK(format,args...) |
#endif |
|
#if 0 /* data */ |
#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) |
#else |
#define D2PRINTK(format,args...) |
#endif |
|
|
/* |
* The ATM queuing discipline provides a framework for invoking classifiers |
* (aka "filters"), which in turn select classes of this queuing discipline. |
* Each class maps the flow(s) it is handling to a given VC. Multiple classes |
* may share the same VC. |
* |
* When creating a class, VCs are specified by passing the number of the open |
* socket descriptor by which the calling process references the VC. The kernel |
* keeps the VC open at least until all classes using it are removed. |
* |
* In this file, most functions are named atm_tc_* to avoid confusion with all |
* the atm_* in net/atm. This naming convention differs from what's used in the |
* rest of net/sched. |
* |
* Known bugs: |
* - sometimes messes up the IP stack |
* - any manipulations besides the few operations described in the README are |
* untested and likely to crash the system |
* - should lock the flow while there is data in the queue (?) |
*/ |
|
|
#define PRIV(sch) ((struct atm_qdisc_data *) (sch)->data) |
#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back)) |
|
|
struct atm_flow_data { |
struct Qdisc *q; /* FIFO, TBF, etc. */ |
struct tcf_proto *filter_list; |
struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ |
void (*old_pop)(struct atm_vcc *vcc,struct sk_buff *skb); /* chaining */ |
struct atm_qdisc_data *parent; /* parent qdisc */ |
struct socket *sock; /* for closing */ |
u32 classid; /* x:y type ID */ |
int ref; /* reference count */ |
struct tc_stats stats; |
struct atm_flow_data *next; |
struct atm_flow_data *excess; /* flow for excess traffic; |
NULL to set CLP instead */ |
int hdr_len; |
unsigned char hdr[0]; /* header data; MUST BE LAST */ |
}; |
|
struct atm_qdisc_data { |
struct atm_flow_data link; /* unclassified skbs go here */ |
struct atm_flow_data *flows; /* NB: "link" is also on this |
list */ |
struct tasklet_struct task; /* requeue tasklet */ |
}; |
|
|
/* ------------------------- Class/flow operations ------------------------- */ |
|
|
static int find_flow(struct atm_qdisc_data *qdisc,struct atm_flow_data *flow) |
{ |
struct atm_flow_data *walk; |
|
DPRINTK("find_flow(qdisc %p,flow %p)\n",qdisc,flow); |
for (walk = qdisc->flows; walk; walk = walk->next) |
if (walk == flow) return 1; |
DPRINTK("find_flow: not found\n"); |
return 0; |
} |
|
|
static __inline__ struct atm_flow_data *lookup_flow(struct Qdisc *sch, |
u32 classid) |
{ |
struct atm_flow_data *flow; |
|
for (flow = PRIV(sch)->flows; flow; flow = flow->next) |
if (flow->classid == classid) break; |
return flow; |
} |
|
|
static int atm_tc_graft(struct Qdisc *sch,unsigned long arg, |
struct Qdisc *new,struct Qdisc **old) |
{ |
struct atm_qdisc_data *p = PRIV(sch); |
struct atm_flow_data *flow = (struct atm_flow_data *) arg; |
|
DPRINTK("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",sch, |
p,flow,new,old); |
if (!find_flow(p,flow)) return -EINVAL; |
if (!new) new = &noop_qdisc; |
*old = xchg(&flow->q,new); |
if (*old) qdisc_reset(*old); |
return 0; |
} |
|
|
static struct Qdisc *atm_tc_leaf(struct Qdisc *sch,unsigned long cl) |
{ |
struct atm_flow_data *flow = (struct atm_flow_data *) cl; |
|
DPRINTK("atm_tc_leaf(sch %p,flow %p)\n",sch,flow); |
return flow ? flow->q : NULL; |
} |
|
|
static unsigned long atm_tc_get(struct Qdisc *sch,u32 classid) |
{ |
struct atm_qdisc_data *p __attribute__((unused)) = PRIV(sch); |
struct atm_flow_data *flow; |
|
DPRINTK("atm_tc_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid); |
flow = lookup_flow(sch,classid); |
if (flow) flow->ref++; |
DPRINTK("atm_tc_get: flow %p\n",flow); |
return (unsigned long) flow; |
} |
|
|
static unsigned long atm_tc_bind_filter(struct Qdisc *sch, |
unsigned long parent, u32 classid) |
{ |
return atm_tc_get(sch,classid); |
} |
|
|
static void destroy_filters(struct atm_flow_data *flow) |
{ |
struct tcf_proto *filter; |
|
while ((filter = flow->filter_list)) { |
DPRINTK("destroy_filters: destroying filter %p\n",filter); |
flow->filter_list = filter->next; |
tcf_destroy(filter); |
} |
} |
|
|
/* |
* atm_tc_put handles all destructions, including the ones that are explicitly |
* requested (atm_tc_destroy, etc.). The assumption here is that we never drop |
* anything that still seems to be in use. |
*/ |
|
static void atm_tc_put(struct Qdisc *sch, unsigned long cl) |
{ |
struct atm_qdisc_data *p = PRIV(sch); |
struct atm_flow_data *flow = (struct atm_flow_data *) cl; |
struct atm_flow_data **prev; |
|
DPRINTK("atm_tc_put(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); |
if (--flow->ref) return; |
DPRINTK("atm_tc_put: destroying\n"); |
for (prev = &p->flows; *prev; prev = &(*prev)->next) |
if (*prev == flow) break; |
if (!*prev) { |
printk(KERN_CRIT "atm_tc_put: class %p not found\n",flow); |
return; |
} |
*prev = flow->next; |
DPRINTK("atm_tc_put: qdisc %p\n",flow->q); |
qdisc_destroy(flow->q); |
destroy_filters(flow); |
if (flow->sock) { |
DPRINTK("atm_tc_put: f_count %d\n", |
file_count(flow->sock->file)); |
flow->vcc->pop = flow->old_pop; |
sockfd_put(flow->sock); |
} |
if (flow->excess) atm_tc_put(sch,(unsigned long) flow->excess); |
if (flow != &p->link) kfree(flow); |
/* |
* If flow == &p->link, the qdisc no longer works at this point and |
* needs to be removed. (By the caller of atm_tc_put.) |
*/ |
} |
|
|
static void sch_atm_pop(struct atm_vcc *vcc,struct sk_buff *skb) |
{ |
struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent; |
|
D2PRINTK("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n",vcc,skb,p); |
VCC2FLOW(vcc)->old_pop(vcc,skb); |
tasklet_schedule(&p->task); |
} |
|
|
static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, |
struct rtattr **tca, unsigned long *arg) |
{ |
struct atm_qdisc_data *p = PRIV(sch); |
struct atm_flow_data *flow = (struct atm_flow_data *) *arg; |
struct atm_flow_data *excess = NULL; |
struct rtattr *opt = tca[TCA_OPTIONS-1]; |
struct rtattr *tb[TCA_ATM_MAX]; |
struct socket *sock; |
int fd,error,hdr_len; |
void *hdr; |
|
DPRINTK("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x," |
"flow %p,opt %p)\n",sch,p,classid,parent,flow,opt); |
/* |
* The concept of parents doesn't apply for this qdisc. |
*/ |
if (parent && parent != TC_H_ROOT && parent != sch->handle) |
return -EINVAL; |
/* |
* ATM classes cannot be changed. In order to change properties of the |
* ATM connection, that socket needs to be modified directly (via the |
* native ATM API). In order to send a flow to a different VC, the old |
* class needs to be removed and a new one added. (This may be changed |
* later.) |
*/ |
if (flow) return -EBUSY; |
if (opt == NULL || rtattr_parse(tb,TCA_ATM_MAX,RTA_DATA(opt), |
RTA_PAYLOAD(opt))) return -EINVAL; |
if (!tb[TCA_ATM_FD-1] || RTA_PAYLOAD(tb[TCA_ATM_FD-1]) < sizeof(fd)) |
return -EINVAL; |
fd = *(int *) RTA_DATA(tb[TCA_ATM_FD-1]); |
DPRINTK("atm_tc_change: fd %d\n",fd); |
if (tb[TCA_ATM_HDR-1]) { |
hdr_len = RTA_PAYLOAD(tb[TCA_ATM_HDR-1]); |
hdr = RTA_DATA(tb[TCA_ATM_HDR-1]); |
} |
else { |
hdr_len = RFC1483LLC_LEN; |
hdr = NULL; /* default LLC/SNAP for IP */ |
} |
if (!tb[TCA_ATM_EXCESS-1]) excess = NULL; |
else { |
if (RTA_PAYLOAD(tb[TCA_ATM_EXCESS-1]) != sizeof(u32)) |
return -EINVAL; |
excess = (struct atm_flow_data *) atm_tc_get(sch, |
*(u32 *) RTA_DATA(tb[TCA_ATM_EXCESS-1])); |
if (!excess) return -ENOENT; |
} |
DPRINTK("atm_tc_change: type %d, payload %d, hdr_len %d\n", |
opt->rta_type,RTA_PAYLOAD(opt),hdr_len); |
if (!(sock = sockfd_lookup(fd,&error))) return error; /* f_count++ */ |
DPRINTK("atm_tc_change: f_count %d\n",file_count(sock->file)); |
if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) { |
error = -EPROTOTYPE; |
goto err_out; |
} |
/* @@@ should check if the socket is really operational or we'll crash |
on vcc->send */ |
if (classid) { |
if (TC_H_MAJ(classid ^ sch->handle)) { |
DPRINTK("atm_tc_change: classid mismatch\n"); |
error = -EINVAL; |
goto err_out; |
} |
if (find_flow(p,flow)) { |
error = -EEXIST; |
goto err_out; |
} |
} |
else { |
int i; |
unsigned long cl; |
|
for (i = 1; i < 0x8000; i++) { |
classid = TC_H_MAKE(sch->handle,0x8000 | i); |
if (!(cl = atm_tc_get(sch,classid))) break; |
atm_tc_put(sch,cl); |
} |
} |
DPRINTK("atm_tc_change: new id %x\n",classid); |
flow = kmalloc(sizeof(struct atm_flow_data)+hdr_len,GFP_KERNEL); |
DPRINTK("atm_tc_change: flow %p\n",flow); |
if (!flow) { |
error = -ENOBUFS; |
goto err_out; |
} |
memset(flow,0,sizeof(*flow)); |
flow->filter_list = NULL; |
if (!(flow->q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops))) |
flow->q = &noop_qdisc; |
DPRINTK("atm_tc_change: qdisc %p\n",flow->q); |
flow->sock = sock; |
flow->vcc = ATM_SD(sock); /* speedup */ |
flow->vcc->user_back = flow; |
DPRINTK("atm_tc_change: vcc %p\n",flow->vcc); |
flow->old_pop = flow->vcc->pop; |
flow->parent = p; |
flow->vcc->pop = sch_atm_pop; |
flow->classid = classid; |
flow->ref = 1; |
flow->excess = excess; |
flow->next = p->link.next; |
p->link.next = flow; |
flow->hdr_len = hdr_len; |
if (hdr) memcpy(flow->hdr,hdr,hdr_len); |
else { |
memcpy(flow->hdr,llc_oui,sizeof(llc_oui)); |
((u16 *) flow->hdr)[3] = htons(ETH_P_IP); |
} |
*arg = (unsigned long) flow; |
return 0; |
err_out: |
if (excess) atm_tc_put(sch,(unsigned long) excess); |
sockfd_put(sock); |
return error; |
} |
|
|
static int atm_tc_delete(struct Qdisc *sch,unsigned long arg) |
{ |
struct atm_qdisc_data *p = PRIV(sch); |
struct atm_flow_data *flow = (struct atm_flow_data *) arg; |
|
DPRINTK("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); |
if (!find_flow(PRIV(sch),flow)) return -EINVAL; |
if (flow->filter_list || flow == &p->link) return -EBUSY; |
/* |
* Reference count must be 2: one for "keepalive" (set at class |
* creation), and one for the reference held when calling delete. |
*/ |
if (flow->ref < 2) { |
printk(KERN_ERR "atm_tc_delete: flow->ref == %d\n",flow->ref); |
return -EINVAL; |
} |
if (flow->ref > 2) return -EBUSY; /* catch references via excess, etc.*/ |
atm_tc_put(sch,arg); |
return 0; |
} |
|
|
static void atm_tc_walk(struct Qdisc *sch,struct qdisc_walker *walker) |
{ |
struct atm_qdisc_data *p = PRIV(sch); |
struct atm_flow_data *flow; |
|
DPRINTK("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker); |
if (walker->stop) return; |
for (flow = p->flows; flow; flow = flow->next) { |
if (walker->count >= walker->skip) |
if (walker->fn(sch,(unsigned long) flow,walker) < 0) { |
walker->stop = 1; |
break; |
} |
walker->count++; |
} |
} |
|
|
static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch,unsigned long cl) |
{ |
struct atm_qdisc_data *p = PRIV(sch); |
struct atm_flow_data *flow = (struct atm_flow_data *) cl; |
|
DPRINTK("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); |
return flow ? &flow->filter_list : &p->link.filter_list; |
} |
|
|
/* --------------------------- Qdisc operations ---------------------------- */ |
|
|
static int atm_tc_enqueue(struct sk_buff *skb,struct Qdisc *sch) |
{ |
struct atm_qdisc_data *p = PRIV(sch); |
struct atm_flow_data *flow = NULL ; /* @@@ */ |
struct tcf_result res; |
int result; |
int ret = NET_XMIT_POLICED; |
|
D2PRINTK("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); |
result = TC_POLICE_OK; /* be nice to gcc */ |
if (TC_H_MAJ(skb->priority) != sch->handle || |
!(flow = (struct atm_flow_data *) atm_tc_get(sch,skb->priority))) |
for (flow = p->flows; flow; flow = flow->next) |
if (flow->filter_list) { |
result = tc_classify(skb,flow->filter_list, |
&res); |
if (result < 0) continue; |
flow = (struct atm_flow_data *) res.class; |
if (!flow) flow = lookup_flow(sch,res.classid); |
break; |
} |
if (!flow) flow = &p->link; |
else { |
if (flow->vcc) |
ATM_SKB(skb)->atm_options = flow->vcc->atm_options; |
/*@@@ looks good ... but it's not supposed to work :-)*/ |
#ifdef CONFIG_NET_CLS_POLICE |
switch (result) { |
case TC_POLICE_SHOT: |
kfree_skb(skb); |
break; |
case TC_POLICE_RECLASSIFY: |
if (flow->excess) flow = flow->excess; |
else { |
ATM_SKB(skb)->atm_options |= |
ATM_ATMOPT_CLP; |
break; |
} |
/* fall through */ |
case TC_POLICE_OK: |
/* fall through */ |
default: |
break; |
} |
#endif |
} |
if ( |
#ifdef CONFIG_NET_CLS_POLICE |
result == TC_POLICE_SHOT || |
#endif |
(ret = flow->q->enqueue(skb,flow->q)) != 0) { |
sch->stats.drops++; |
if (flow) flow->stats.drops++; |
return ret; |
} |
sch->stats.bytes += skb->len; |
sch->stats.packets++; |
flow->stats.bytes += skb->len; |
flow->stats.packets++; |
/* |
* Okay, this may seem weird. We pretend we've dropped the packet if |
* it goes via ATM. The reason for this is that the outer qdisc |
* expects to be able to q->dequeue the packet later on if we return |
* success at this place. Also, sch->q.qlen needs to reflect whether |
* there is a packet eligible for dequeuing or not. Note that the |
* statistics of the outer qdisc are necessarily wrong because of all |
* this. There's currently no correct solution for this. |
*/ |
if (flow == &p->link) { |
sch->q.qlen++; |
return 0; |
} |
tasklet_schedule(&p->task); |
return NET_XMIT_BYPASS; |
} |
|
|
/* |
* Dequeue packets and send them over ATM. Note that we quite deliberately |
* avoid checking net_device's flow control here, simply because sch_atm |
* uses its own channels, which have nothing to do with any CLIP/LANE/or |
* non-ATM interfaces. |
*/ |
|
|
static void sch_atm_dequeue(unsigned long data) |
{ |
struct Qdisc *sch = (struct Qdisc *) data; |
struct atm_qdisc_data *p = PRIV(sch); |
struct atm_flow_data *flow; |
struct sk_buff *skb; |
|
D2PRINTK("sch_atm_dequeue(sch %p,[qdisc %p])\n",sch,p); |
for (flow = p->link.next; flow; flow = flow->next) |
/* |
* If traffic is properly shaped, this won't generate nasty |
* little bursts. Otherwise, it may ... (but that's okay) |
*/ |
while ((skb = flow->q->dequeue(flow->q))) { |
if (!atm_may_send(flow->vcc,skb->truesize)) { |
(void) flow->q->ops->requeue(skb,flow->q); |
break; |
} |
D2PRINTK("sch_atm_dequeue: sending on class %p\n",flow); |
/* remove any LL header somebody else has attached */ |
skb_pull(skb,(char *) skb->nh.iph-(char *) skb->data); |
if (skb_headroom(skb) < flow->hdr_len) { |
struct sk_buff *new; |
|
new = skb_realloc_headroom(skb,flow->hdr_len); |
dev_kfree_skb(skb); |
if (!new) continue; |
skb = new; |
} |
D2PRINTK("sch_atm_dequeue: ip %p, data %p\n", |
skb->nh.iph,skb->data); |
ATM_SKB(skb)->vcc = flow->vcc; |
memcpy(skb_push(skb,flow->hdr_len),flow->hdr, |
flow->hdr_len); |
atomic_add(skb->truesize,&flow->vcc->sk->wmem_alloc); |
/* atm.atm_options are already set by atm_tc_enqueue */ |
(void) flow->vcc->send(flow->vcc,skb); |
} |
} |
|
|
static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch) |
{ |
struct atm_qdisc_data *p = PRIV(sch); |
struct sk_buff *skb; |
|
D2PRINTK("atm_tc_dequeue(sch %p,[qdisc %p])\n",sch,p); |
tasklet_schedule(&p->task); |
skb = p->link.q->dequeue(p->link.q); |
if (skb) sch->q.qlen--; |
return skb; |
} |
|
|
static int atm_tc_requeue(struct sk_buff *skb,struct Qdisc *sch) |
{ |
struct atm_qdisc_data *p = PRIV(sch); |
int ret; |
|
D2PRINTK("atm_tc_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); |
ret = p->link.q->ops->requeue(skb,p->link.q); |
if (!ret) sch->q.qlen++; |
else { |
sch->stats.drops++; |
p->link.stats.drops++; |
} |
return ret; |
} |
|
|
static unsigned int atm_tc_drop(struct Qdisc *sch) |
{ |
struct atm_qdisc_data *p = PRIV(sch); |
struct atm_flow_data *flow; |
unsigned int len; |
|
DPRINTK("atm_tc_drop(sch %p,[qdisc %p])\n",sch,p); |
for (flow = p->flows; flow; flow = flow->next) |
if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q))) |
return len; |
return 0; |
} |
|
|
static int atm_tc_init(struct Qdisc *sch,struct rtattr *opt) |
{ |
struct atm_qdisc_data *p = PRIV(sch); |
|
DPRINTK("atm_tc_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); |
memset(p,0,sizeof(*p)); |
p->flows = &p->link; |
if(!(p->link.q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops))) |
p->link.q = &noop_qdisc; |
DPRINTK("atm_tc_init: link (%p) qdisc %p\n",&p->link,p->link.q); |
p->link.filter_list = NULL; |
p->link.vcc = NULL; |
p->link.sock = NULL; |
p->link.classid = sch->handle; |
p->link.ref = 1; |
p->link.next = NULL; |
tasklet_init(&p->task,sch_atm_dequeue,(unsigned long) sch); |
MOD_INC_USE_COUNT; |
return 0; |
} |
|
|
static void atm_tc_reset(struct Qdisc *sch) |
{ |
struct atm_qdisc_data *p = PRIV(sch); |
struct atm_flow_data *flow; |
|
DPRINTK("atm_tc_reset(sch %p,[qdisc %p])\n",sch,p); |
for (flow = p->flows; flow; flow = flow->next) qdisc_reset(flow->q); |
sch->q.qlen = 0; |
} |
|
|
static void atm_tc_destroy(struct Qdisc *sch) |
{ |
struct atm_qdisc_data *p = PRIV(sch); |
struct atm_flow_data *flow; |
|
DPRINTK("atm_tc_destroy(sch %p,[qdisc %p])\n",sch,p); |
/* races ? */ |
while ((flow = p->flows)) { |
destroy_filters(flow); |
if (flow->ref > 1) |
printk(KERN_ERR "atm_destroy: %p->ref = %d\n",flow, |
flow->ref); |
atm_tc_put(sch,(unsigned long) flow); |
if (p->flows == flow) { |
printk(KERN_ERR "atm_destroy: putting flow %p didn't " |
"kill it\n",flow); |
p->flows = flow->next; /* brute force */ |
break; |
} |
} |
tasklet_kill(&p->task); |
MOD_DEC_USE_COUNT; |
} |
|
|
static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, |
struct sk_buff *skb, struct tcmsg *tcm) |
{ |
struct atm_qdisc_data *p = PRIV(sch); |
struct atm_flow_data *flow = (struct atm_flow_data *) cl; |
unsigned char *b = skb->tail; |
struct rtattr *rta; |
|
DPRINTK("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n", |
sch,p,flow,skb,tcm); |
if (!find_flow(p,flow)) return -EINVAL; |
tcm->tcm_handle = flow->classid; |
rta = (struct rtattr *) b; |
RTA_PUT(skb,TCA_OPTIONS,0,NULL); |
RTA_PUT(skb,TCA_ATM_HDR,flow->hdr_len,flow->hdr); |
if (flow->vcc) { |
struct sockaddr_atmpvc pvc; |
int state; |
|
pvc.sap_family = AF_ATMPVC; |
pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1; |
pvc.sap_addr.vpi = flow->vcc->vpi; |
pvc.sap_addr.vci = flow->vcc->vci; |
RTA_PUT(skb,TCA_ATM_ADDR,sizeof(pvc),&pvc); |
state = ATM_VF2VS(flow->vcc->flags); |
RTA_PUT(skb,TCA_ATM_STATE,sizeof(state),&state); |
} |
if (flow->excess) |
RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(u32),&flow->classid); |
else { |
static u32 zero = 0; |
|
RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(zero),&zero); |
} |
rta->rta_len = skb->tail-b; |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb,b-skb->data); |
return -1; |
} |
|
static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb) |
{ |
return 0; |
} |
|
static struct Qdisc_class_ops atm_class_ops = |
{ |
atm_tc_graft, /* graft */ |
atm_tc_leaf, /* leaf */ |
atm_tc_get, /* get */ |
atm_tc_put, /* put */ |
atm_tc_change, /* change */ |
atm_tc_delete, /* delete */ |
atm_tc_walk, /* walk */ |
|
atm_tc_find_tcf, /* tcf_chain */ |
atm_tc_bind_filter, /* bind_tcf */ |
atm_tc_put, /* unbind_tcf */ |
|
atm_tc_dump_class, /* dump */ |
}; |
|
struct Qdisc_ops atm_qdisc_ops = |
{ |
NULL, /* next */ |
&atm_class_ops, /* cl_ops */ |
"atm", |
sizeof(struct atm_qdisc_data), |
|
atm_tc_enqueue, /* enqueue */ |
atm_tc_dequeue, /* dequeue */ |
atm_tc_requeue, /* requeue */ |
atm_tc_drop, /* drop */ |
|
atm_tc_init, /* init */ |
atm_tc_reset, /* reset */ |
atm_tc_destroy, /* destroy */ |
NULL, /* change */ |
|
atm_tc_dump /* dump */ |
}; |
|
|
#ifdef MODULE |
int init_module(void) |
{ |
return register_qdisc(&atm_qdisc_ops); |
} |
|
|
void cleanup_module(void) |
{ |
unregister_qdisc(&atm_qdisc_ops); |
} |
#endif |
/sch_gred.c
0,0 → 1,637
/* |
* net/sched/sch_gred.c Generic Random Early Detection queue. |
* |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002 |
* |
* 991129: - Bug fix with grio mode |
* - a better single AvgQ mode with Grio(WRED) |
* - A finer grained VQ dequeue based on suggestion |
* from Ren Liu |
* - More error checks |
* |
* |
* |
* For all the glorious comments look at Alexey's sch_red.c |
*/ |
|
#include <linux/config.h> |
#include <linux/module.h> |
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/if_ether.h> |
#include <linux/inet.h> |
#include <linux/netdevice.h> |
#include <linux/etherdevice.h> |
#include <linux/notifier.h> |
#include <net/ip.h> |
#include <net/route.h> |
#include <linux/skbuff.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
#if 1 /* control */ |
#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) |
#else |
#define DPRINTK(format,args...) |
#endif |
|
#if 0 /* data */ |
#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) |
#else |
#define D2PRINTK(format,args...) |
#endif |
|
struct gred_sched_data; |
struct gred_sched; |
|
struct gred_sched_data |
{ |
/* Parameters */ |
u32 limit; /* HARD maximal queue length */ |
u32 qth_min; /* Min average length threshold: A scaled */ |
u32 qth_max; /* Max average length threshold: A scaled */ |
u32 DP; /* the drop parameters */ |
char Wlog; /* log(W) */ |
char Plog; /* random number bits */ |
u32 Scell_max; |
u32 Rmask; |
u32 bytesin; /* bytes seen on virtualQ so far*/ |
u32 packetsin; /* packets seen on virtualQ so far*/ |
u32 backlog; /* bytes on the virtualQ */ |
u32 forced; /* packets dropped for exceeding limits */ |
u32 early; /* packets dropped as a warning */ |
u32 other; /* packets dropped by invoking drop() */ |
u32 pdrop; /* packets dropped because we exceeded physical queue limits */ |
char Scell_log; |
u8 Stab[256]; |
u8 prio; /* the prio of this vq */ |
|
/* Variables */ |
unsigned long qave; /* Average queue length: A scaled */ |
int qcount; /* Packets since last random number generation */ |
u32 qR; /* Cached random number */ |
|
psched_time_t qidlestart; /* Start of idle period */ |
}; |
|
struct gred_sched |
{ |
struct gred_sched_data *tab[MAX_DPs]; |
u32 DPs; |
u32 def; |
u8 initd; |
u8 grio; |
u8 eqp; |
}; |
|
static int |
gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) |
{ |
psched_time_t now; |
struct gred_sched_data *q=NULL; |
struct gred_sched *t= (struct gred_sched *)sch->data; |
unsigned long qave=0; |
int i=0; |
|
if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) { |
D2PRINTK("NO GRED Queues setup yet! Enqueued anyway\n"); |
goto do_enqueue; |
} |
|
|
if ( ((skb->tc_index&0xf) > (t->DPs -1)) || !(q=t->tab[skb->tc_index&0xf])) { |
printk("GRED: setting to default (%d)\n ",t->def); |
if (!(q=t->tab[t->def])) { |
DPRINTK("GRED: setting to default FAILED! dropping!! " |
"(%d)\n ", t->def); |
goto drop; |
} |
/* fix tc_index? --could be controversial but needed for |
requeueing */ |
skb->tc_index=(skb->tc_index&0xfffffff0) | t->def; |
} |
|
D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d " |
"general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog, |
sch->stats.backlog); |
/* sum up all the qaves of prios <= to ours to get the new qave*/ |
if (!t->eqp && t->grio) { |
for (i=0;i<t->DPs;i++) { |
if ((!t->tab[i]) || (i==q->DP)) |
continue; |
|
if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart))) |
qave +=t->tab[i]->qave; |
} |
|
} |
|
q->packetsin++; |
q->bytesin+=skb->len; |
|
if (t->eqp && t->grio) { |
qave=0; |
q->qave=t->tab[t->def]->qave; |
q->qidlestart=t->tab[t->def]->qidlestart; |
} |
|
if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { |
long us_idle; |
PSCHED_GET_TIME(now); |
us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max, 0); |
PSCHED_SET_PASTPERFECT(q->qidlestart); |
|
q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF]; |
} else { |
if (t->eqp) { |
q->qave += sch->stats.backlog - (q->qave >> q->Wlog); |
} else { |
q->qave += q->backlog - (q->qave >> q->Wlog); |
} |
|
} |
|
|
if (t->eqp && t->grio) |
t->tab[t->def]->qave=q->qave; |
|
if ((q->qave+qave) < q->qth_min) { |
q->qcount = -1; |
enqueue: |
if (q->backlog + skb->len <= q->limit) { |
q->backlog += skb->len; |
do_enqueue: |
__skb_queue_tail(&sch->q, skb); |
sch->stats.backlog += skb->len; |
sch->stats.bytes += skb->len; |
sch->stats.packets++; |
return 0; |
} else { |
q->pdrop++; |
} |
|
drop: |
kfree_skb(skb); |
sch->stats.drops++; |
return NET_XMIT_DROP; |
} |
if ((q->qave+qave) >= q->qth_max) { |
q->qcount = -1; |
sch->stats.overlimits++; |
q->forced++; |
goto drop; |
} |
if (++q->qcount) { |
if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR) |
goto enqueue; |
q->qcount = 0; |
q->qR = net_random()&q->Rmask; |
sch->stats.overlimits++; |
q->early++; |
goto drop; |
} |
q->qR = net_random()&q->Rmask; |
goto enqueue; |
} |
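/*
 * Editor's illustrative sketch, not part of the original file: gred_enqueue
 * above keeps the average queue length pre-scaled by 2^Wlog, so the classic
 * RED moving average avg = (1 - W)*avg + W*backlog, with weight W = 2^-Wlog,
 * reduces to the integer update qave += backlog - (qave >> Wlog). The
 * stand-alone program below (hypothetical values) checks that equivalence.
 */
#include <stdio.h>

int main(void)
{
    unsigned long qave = 800;   /* scaled average; true average = qave >> Wlog */
    unsigned int backlog = 200; /* current backlog in bytes */
    int Wlog = 3;               /* weight W = 2^-Wlog = 1/8 */
    double W, avg;

    /* integer update as used by gred_enqueue */
    qave += backlog - (qave >> Wlog);

    /* floating-point reference: avg = (1 - W)*avg + W*backlog */
    W = 1.0 / (1 << Wlog);
    avg = (1.0 - W) * (800.0 / (1 << Wlog)) + W * backlog;

    printf("scaled qave = %lu, true average = %.2f (reference %.2f)\n",
           qave, (double) qave / (1 << Wlog), avg);
    return 0;
}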
|
static int |
gred_requeue(struct sk_buff *skb, struct Qdisc* sch) |
{ |
struct gred_sched_data *q; |
struct gred_sched *t= (struct gred_sched *)sch->data; |
q= t->tab[(skb->tc_index&0xf)]; |
/* error checking here -- probably unnecessary */ |
PSCHED_SET_PASTPERFECT(q->qidlestart); |
|
__skb_queue_head(&sch->q, skb); |
sch->stats.backlog += skb->len; |
q->backlog += skb->len; |
return 0; |
} |
|
static struct sk_buff * |
gred_dequeue(struct Qdisc* sch) |
{ |
struct sk_buff *skb; |
struct gred_sched_data *q; |
struct gred_sched *t= (struct gred_sched *)sch->data; |
|
skb = __skb_dequeue(&sch->q); |
if (skb) { |
sch->stats.backlog -= skb->len; |
q= t->tab[(skb->tc_index&0xf)]; |
if (q) { |
q->backlog -= skb->len; |
if (!q->backlog && !t->eqp) |
PSCHED_GET_TIME(q->qidlestart); |
} else { |
D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); |
} |
return skb; |
} |
|
if (t->eqp) { |
q= t->tab[t->def]; |
if (!q) |
D2PRINTK("no default VQ set: Results will be " |
"screwed up\n"); |
else |
PSCHED_GET_TIME(q->qidlestart); |
} |
|
return NULL; |
} |
|
static unsigned int gred_drop(struct Qdisc* sch) |
{ |
struct sk_buff *skb; |
|
struct gred_sched_data *q; |
struct gred_sched *t= (struct gred_sched *)sch->data; |
|
skb = __skb_dequeue_tail(&sch->q); |
if (skb) { |
unsigned int len = skb->len; |
sch->stats.backlog -= len; |
sch->stats.drops++; |
q= t->tab[(skb->tc_index&0xf)]; |
if (q) { |
q->backlog -= len; |
q->other++; |
if (!q->backlog && !t->eqp) |
PSCHED_GET_TIME(q->qidlestart); |
} else { |
D2PRINTK("gred_drop: skb has bad tcindex %x\n",skb->tc_index&0xf); |
} |
|
kfree_skb(skb); |
return len; |
} |
|
q=t->tab[t->def]; |
if (!q) { |
D2PRINTK("no default VQ set: Results might be screwed up\n"); |
return 0; |
} |
|
PSCHED_GET_TIME(q->qidlestart); |
return 0; |
|
} |
|
static void gred_reset(struct Qdisc* sch) |
{ |
int i; |
struct gred_sched_data *q; |
struct gred_sched *t= (struct gred_sched *)sch->data; |
|
__skb_queue_purge(&sch->q); |
|
sch->stats.backlog = 0; |
|
for (i=0;i<t->DPs;i++) { |
q= t->tab[i]; |
if (!q) |
continue; |
PSCHED_SET_PASTPERFECT(q->qidlestart); |
q->qave = 0; |
q->qcount = -1; |
q->backlog = 0; |
q->other=0; |
q->forced=0; |
q->pdrop=0; |
q->early=0; |
} |
} |
|
static int gred_change(struct Qdisc *sch, struct rtattr *opt) |
{ |
struct gred_sched *table = (struct gred_sched *)sch->data; |
struct gred_sched_data *q; |
struct tc_gred_qopt *ctl; |
struct tc_gred_sopt *sopt; |
struct rtattr *tb[TCA_GRED_STAB]; |
struct rtattr *tb2[TCA_GRED_DPS]; |
int i; |
|
if (opt == NULL || |
rtattr_parse(tb, TCA_GRED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ) |
return -EINVAL; |
|
if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) { |
rtattr_parse(tb2, TCA_GRED_DPS, RTA_DATA(opt), |
RTA_PAYLOAD(opt)); |
|
if (tb2[TCA_GRED_DPS-1] == 0) |
return -EINVAL; |
|
sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); |
table->DPs=sopt->DPs; |
table->def=sopt->def_DP; |
table->grio=sopt->grio; |
table->initd=0; |
/* probably need to clear all the table DP entries as well */ |
MOD_INC_USE_COUNT; |
return 0; |
} |
|
|
if (!table->DPs || tb[TCA_GRED_PARMS-1] == 0 || tb[TCA_GRED_STAB-1] == 0 || |
RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) || |
RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256) |
return -EINVAL; |
|
ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]); |
if (ctl->DP > MAX_DPs-1 ) { |
/* misbehaving is punished! Put in the default drop probability */ |
DPRINTK("\nGRED: DP %u not in the proper range, fixed. New DP " |
"set to default at %d\n",ctl->DP,table->def); |
ctl->DP=table->def; |
} |
|
if (table->tab[ctl->DP] == NULL) { |
table->tab[ctl->DP]=kmalloc(sizeof(struct gred_sched_data), |
GFP_KERNEL); |
if (NULL == table->tab[ctl->DP]) |
return -ENOMEM; |
memset(table->tab[ctl->DP], 0, (sizeof(struct gred_sched_data))); |
} |
q= table->tab[ctl->DP]; |
|
if (table->grio) { |
if (ctl->prio <=0) { |
if (table->def && table->tab[table->def]) { |
DPRINTK("\nGRED: DP %u does not have a prio" |
" setting default to %d\n",ctl->DP, |
table->tab[table->def]->prio); |
q->prio=table->tab[table->def]->prio; |
} else { |
DPRINTK("\nGRED: DP %u does not have a prio" |
" setting default to 8\n",ctl->DP); |
q->prio=8; |
} |
} else { |
q->prio=ctl->prio; |
} |
} else { |
q->prio=8; |
} |
|
|
q->DP=ctl->DP; |
q->Wlog = ctl->Wlog; |
q->Plog = ctl->Plog; |
q->limit = ctl->limit; |
q->Scell_log = ctl->Scell_log; |
q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; |
q->Scell_max = (255<<q->Scell_log); |
q->qth_min = ctl->qth_min<<ctl->Wlog; |
q->qth_max = ctl->qth_max<<ctl->Wlog; |
q->qave=0; |
q->backlog=0; |
q->qcount = -1; |
q->other=0; |
q->forced=0; |
q->pdrop=0; |
q->early=0; |
|
PSCHED_SET_PASTPERFECT(q->qidlestart); |
memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256); |
|
if ( table->initd && table->grio) { |
/* this looks ugly but it's not in the fast path */ |
for (i=0;i<table->DPs;i++) { |
if ((!table->tab[i]) || (i==q->DP) ) |
continue; |
if (table->tab[i]->prio == q->prio ){ |
/* WRED mode detected */ |
table->eqp=1; |
break; |
} |
} |
} |
|
if (!table->initd) { |
table->initd=1; |
/* |
the first entry also goes into the default until |
over-written |
*/ |
|
if (table->tab[table->def] == NULL) { |
table->tab[table->def]= |
kmalloc(sizeof(struct gred_sched_data), GFP_KERNEL); |
if (NULL == table->tab[table->def]) |
return -ENOMEM; |
|
memset(table->tab[table->def], 0, |
(sizeof(struct gred_sched_data))); |
} |
q= table->tab[table->def]; |
q->DP=table->def; |
q->Wlog = ctl->Wlog; |
q->Plog = ctl->Plog; |
q->limit = ctl->limit; |
q->Scell_log = ctl->Scell_log; |
q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; |
q->Scell_max = (255<<q->Scell_log); |
q->qth_min = ctl->qth_min<<ctl->Wlog; |
q->qth_max = ctl->qth_max<<ctl->Wlog; |
|
if (table->grio) |
q->prio=table->tab[ctl->DP]->prio; |
else |
q->prio=8; |
|
q->qcount = -1; |
PSCHED_SET_PASTPERFECT(q->qidlestart); |
memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256); |
} |
return 0; |
|
} |
|
static int gred_init(struct Qdisc *sch, struct rtattr *opt) |
{ |
struct gred_sched *table = (struct gred_sched *)sch->data; |
struct tc_gred_sopt *sopt; |
struct rtattr *tb[TCA_GRED_STAB]; |
struct rtattr *tb2[TCA_GRED_DPS]; |
|
if (opt == NULL || |
rtattr_parse(tb, TCA_GRED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ) |
return -EINVAL; |
|
if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0 ) { |
rtattr_parse(tb2, TCA_GRED_DPS, RTA_DATA(opt),RTA_PAYLOAD(opt)); |
|
if (tb2[TCA_GRED_DPS-1] == 0) |
return -EINVAL; |
|
sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); |
table->DPs=sopt->DPs; |
table->def=sopt->def_DP; |
table->grio=sopt->grio; |
table->initd=0; |
MOD_INC_USE_COUNT; |
return 0; |
} |
|
DPRINTK("\n GRED_INIT error!\n"); |
return -EINVAL; |
} |
|
static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) |
{ |
unsigned long qave; |
struct rtattr *rta; |
struct tc_gred_qopt *opt = NULL ; |
struct tc_gred_qopt *dst; |
struct gred_sched *table = (struct gred_sched *)sch->data; |
struct gred_sched_data *q; |
int i; |
unsigned char *b = skb->tail; |
|
rta = (struct rtattr*)b; |
RTA_PUT(skb, TCA_OPTIONS, 0, NULL); |
|
opt=kmalloc(sizeof(struct tc_gred_qopt)*MAX_DPs, GFP_KERNEL); |
|
if (opt == NULL) { |
DPRINTK("gred_dump:failed to malloc for %Zd\n", |
sizeof(struct tc_gred_qopt)*MAX_DPs); |
goto rtattr_failure; |
} |
|
memset(opt, 0, (sizeof(struct tc_gred_qopt))*table->DPs); |
|
if (!table->initd) { |
DPRINTK("NO GRED Queues setup!\n"); |
} |
|
for (i=0;i<MAX_DPs;i++) { |
dst= &opt[i]; |
q= table->tab[i]; |
|
if (!q) { |
/* hack -- fix at some point with proper message |
This is how we indicate to tc that there is no VQ |
at this DP */ |
|
dst->DP=MAX_DPs+i; |
continue; |
} |
|
dst->limit=q->limit; |
dst->qth_min=q->qth_min>>q->Wlog; |
dst->qth_max=q->qth_max>>q->Wlog; |
dst->DP=q->DP; |
dst->backlog=q->backlog; |
if (q->qave) { |
if (table->eqp && table->grio) { |
q->qidlestart=table->tab[table->def]->qidlestart; |
q->qave=table->tab[table->def]->qave; |
} |
if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { |
long idle; |
psched_time_t now; |
PSCHED_GET_TIME(now); |
idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max, 0); |
qave = q->qave >> q->Stab[(idle>>q->Scell_log)&0xFF]; |
dst->qave = qave >> q->Wlog; |
|
} else { |
dst->qave = q->qave >> q->Wlog; |
} |
} else { |
dst->qave = 0; |
} |
|
|
dst->Wlog = q->Wlog; |
dst->Plog = q->Plog; |
dst->Scell_log = q->Scell_log; |
dst->other = q->other; |
dst->forced = q->forced; |
dst->early = q->early; |
dst->pdrop = q->pdrop; |
dst->prio = q->prio; |
dst->packets=q->packetsin; |
dst->bytesin=q->bytesin; |
} |
|
RTA_PUT(skb, TCA_GRED_PARMS, sizeof(struct tc_gred_qopt)*MAX_DPs, opt); |
rta->rta_len = skb->tail - b; |
|
kfree(opt); |
return skb->len; |
|
rtattr_failure: |
if (opt) |
kfree(opt); |
DPRINTK("gred_dump: FAILURE!!!!\n"); |
|
/* also free the opt struct here */ |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static void gred_destroy(struct Qdisc *sch) |
{ |
struct gred_sched *table = (struct gred_sched *)sch->data; |
int i; |
|
for (i = 0;i < table->DPs; i++) { |
if (table->tab[i]) |
kfree(table->tab[i]); |
} |
MOD_DEC_USE_COUNT; |
} |
|
struct Qdisc_ops gred_qdisc_ops = |
{ |
NULL, |
NULL, |
"gred", |
sizeof(struct gred_sched), |
gred_enqueue, |
gred_dequeue, |
gred_requeue, |
gred_drop, |
gred_init, |
gred_reset, |
gred_destroy, |
gred_change, /* change */ |
gred_dump, |
}; |
|
|
#ifdef MODULE |
int init_module(void) |
{ |
return register_qdisc(&gred_qdisc_ops); |
} |
|
void cleanup_module(void) |
{ |
unregister_qdisc(&gred_qdisc_ops); |
} |
#endif |
MODULE_LICENSE("GPL"); |
/sch_dsmark.c
0,0 → 1,486
/* net/sched/sch_dsmark.c - Differentiated Services field marker */ |
|
/* Written 1998-2000 by Werner Almesberger, EPFL ICA */ |
|
|
#include <linux/config.h> |
#include <linux/module.h> |
#include <linux/types.h> |
#include <linux/string.h> |
#include <linux/errno.h> |
#include <linux/skbuff.h> |
#include <linux/netdevice.h> /* for pkt_sched */ |
#include <linux/rtnetlink.h> |
#include <net/pkt_sched.h> |
#include <net/dsfield.h> |
#include <asm/byteorder.h> |
|
|
#if 1 /* control */ |
#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) |
#else |
#define DPRINTK(format,args...) |
#endif |
|
#if 0 /* data */ |
#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) |
#else |
#define D2PRINTK(format,args...) |
#endif |
|
|
#define PRIV(sch) ((struct dsmark_qdisc_data *) (sch)->data) |
|
|
/* |
* classid class marking |
* ------- ----- ------- |
* n/a 0 n/a |
* x:0 1 use entry [0] |
* ... ... ... |
* x:y y>0 y+1 use entry [y] |
* ... ... ... |
* x:indices-1 indices use entry [indices-1] |
* ... ... ... |
* x:y y+1 use entry [y & (indices-1)] |
* ... ... ... |
* 0xffff 0x10000 use entry [indices-1] |
*/ |
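/*
 * Editor's note, not part of the original file: per the table above, class
 * number y+1 owns mask/value entry [y], and on dequeue the entry is picked
 * as skb->tc_index & (indices - 1) (see dsmark_dequeue below). A minimal
 * stand-alone check with hypothetical values:
 */
#include <stdio.h>

int main(void)
{
    unsigned int indices = 8;     /* must be a power of two (see dsmark_init) */
    unsigned int tc_index = 0x2a; /* hypothetical skb->tc_index */
    unsigned int entry, class;

    entry = tc_index & (indices - 1); /* marking entry, as in dsmark_dequeue */
    class = entry + 1;                /* class number owning that entry */

    printf("tc_index 0x%x -> entry [%u] -> class %u\n", tc_index, entry, class);
    return 0;
}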
|
|
#define NO_DEFAULT_INDEX (1 << 16) |
|
struct dsmark_qdisc_data { |
struct Qdisc *q; |
struct tcf_proto *filter_list; |
__u8 *mask; /* "owns" the array */ |
__u8 *value; |
__u16 indices; |
__u32 default_index; /* index range is 0...0xffff */ |
int set_tc_index; |
}; |
|
|
/* ------------------------- Class/flow operations ------------------------- */ |
|
|
static int dsmark_graft(struct Qdisc *sch,unsigned long arg, |
struct Qdisc *new,struct Qdisc **old) |
{ |
struct dsmark_qdisc_data *p = PRIV(sch); |
|
DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",sch,p,new, |
old); |
if (!new) |
new = &noop_qdisc; |
sch_tree_lock(sch); |
*old = xchg(&p->q,new); |
if (*old) |
qdisc_reset(*old); |
sch->q.qlen = 0; |
sch_tree_unlock(sch); /* @@@ move up ? */ |
return 0; |
} |
|
|
static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg) |
{ |
struct dsmark_qdisc_data *p = PRIV(sch); |
|
return p->q; |
} |
|
|
static unsigned long dsmark_get(struct Qdisc *sch,u32 classid) |
{ |
struct dsmark_qdisc_data *p __attribute__((unused)) = PRIV(sch); |
|
DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid); |
return TC_H_MIN(classid)+1; |
} |
|
|
static unsigned long dsmark_bind_filter(struct Qdisc *sch, |
unsigned long parent, u32 classid) |
{ |
return dsmark_get(sch,classid); |
} |
|
|
static void dsmark_put(struct Qdisc *sch, unsigned long cl) |
{ |
} |
|
|
static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, |
struct rtattr **tca, unsigned long *arg) |
{ |
struct dsmark_qdisc_data *p = PRIV(sch); |
struct rtattr *opt = tca[TCA_OPTIONS-1]; |
struct rtattr *tb[TCA_DSMARK_MAX]; |
|
DPRINTK("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x)," |
"arg 0x%lx\n",sch,p,classid,parent,*arg); |
if (*arg > p->indices) |
return -ENOENT; |
if (!opt || rtattr_parse(tb, TCA_DSMARK_MAX, RTA_DATA(opt), |
RTA_PAYLOAD(opt))) |
return -EINVAL; |
if (tb[TCA_DSMARK_MASK-1]) { |
if (!RTA_PAYLOAD(tb[TCA_DSMARK_MASK-1])) |
return -EINVAL; |
p->mask[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_MASK-1]); |
} |
if (tb[TCA_DSMARK_VALUE-1]) { |
if (!RTA_PAYLOAD(tb[TCA_DSMARK_VALUE-1])) |
return -EINVAL; |
p->value[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_VALUE-1]); |
} |
return 0; |
} |
|
|
static int dsmark_delete(struct Qdisc *sch,unsigned long arg) |
{ |
struct dsmark_qdisc_data *p = PRIV(sch); |
|
if (!arg || arg > p->indices) |
return -EINVAL; |
p->mask[arg-1] = 0xff; |
p->value[arg-1] = 0; |
return 0; |
} |
|
|
static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker) |
{ |
struct dsmark_qdisc_data *p = PRIV(sch); |
int i; |
|
DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker); |
if (walker->stop) |
return; |
for (i = 0; i < p->indices; i++) { |
if (p->mask[i] == 0xff && !p->value[i]) |
continue; |
if (walker->count >= walker->skip) { |
if (walker->fn(sch, i+1, walker) < 0) { |
walker->stop = 1; |
break; |
} |
} |
walker->count++; |
} |
} |
|
|
static struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,unsigned long cl) |
{ |
struct dsmark_qdisc_data *p = PRIV(sch); |
|
return &p->filter_list; |
} |
|
|
/* --------------------------- Qdisc operations ---------------------------- */ |
|
|
static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch) |
{ |
struct dsmark_qdisc_data *p = PRIV(sch); |
struct tcf_result res; |
int result; |
int ret = NET_XMIT_POLICED; |
|
D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); |
if (p->set_tc_index) { |
switch (skb->protocol) { |
case __constant_htons(ETH_P_IP): |
skb->tc_index = ipv4_get_dsfield(skb->nh.iph); |
break; |
case __constant_htons(ETH_P_IPV6): |
skb->tc_index = ipv6_get_dsfield(skb->nh.ipv6h); |
break; |
default: |
skb->tc_index = 0; |
break; |
}; |
} |
result = TC_POLICE_OK; /* be nice to gcc */ |
if (TC_H_MAJ(skb->priority) == sch->handle) { |
skb->tc_index = TC_H_MIN(skb->priority); |
} else { |
result = tc_classify(skb,p->filter_list,&res); |
D2PRINTK("result %d class 0x%04x\n",result,res.classid); |
switch (result) { |
#ifdef CONFIG_NET_CLS_POLICE |
case TC_POLICE_SHOT: |
kfree_skb(skb); |
break; |
#if 0 |
case TC_POLICE_RECLASSIFY: |
/* FIXME: what to do here ??? */ |
#endif |
#endif |
case TC_POLICE_OK: |
skb->tc_index = TC_H_MIN(res.classid); |
break; |
case TC_POLICE_UNSPEC: |
/* fall through */ |
default: |
if (p->default_index != NO_DEFAULT_INDEX) |
skb->tc_index = p->default_index; |
break; |
}; |
} |
if ( |
#ifdef CONFIG_NET_CLS_POLICE |
result == TC_POLICE_SHOT || |
#endif |
|
((ret = p->q->enqueue(skb,p->q)) != 0)) { |
sch->stats.drops++; |
return ret; |
} |
sch->stats.bytes += skb->len; |
sch->stats.packets++; |
sch->q.qlen++; |
return ret; |
} |
|
|
static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) |
{ |
struct dsmark_qdisc_data *p = PRIV(sch); |
struct sk_buff *skb; |
int index; |
|
D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n",sch,p); |
skb = p->q->ops->dequeue(p->q); |
if (!skb) |
return NULL; |
sch->q.qlen--; |
index = skb->tc_index & (p->indices-1); |
D2PRINTK("index %d->%d\n",skb->tc_index,index); |
switch (skb->protocol) { |
case __constant_htons(ETH_P_IP): |
ipv4_change_dsfield(skb->nh.iph, |
p->mask[index],p->value[index]); |
break; |
case __constant_htons(ETH_P_IPV6): |
ipv6_change_dsfield(skb->nh.ipv6h, |
p->mask[index],p->value[index]); |
break; |
default: |
/* |
* Only complain if a change was actually attempted. |
* This way, we can send non-IP traffic through dsmark |
* and don't need yet another qdisc as a bypass. |
*/ |
if (p->mask[index] != 0xff || p->value[index]) |
printk(KERN_WARNING "dsmark_dequeue: " |
"unsupported protocol %d\n", |
htons(skb->protocol)); |
break; |
}; |
return skb; |
} |
|
|
static int dsmark_requeue(struct sk_buff *skb,struct Qdisc *sch) |
{ |
int ret; |
struct dsmark_qdisc_data *p = PRIV(sch); |
|
D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); |
if ((ret = p->q->ops->requeue(skb, p->q)) == 0) { |
sch->q.qlen++; |
return 0; |
} |
sch->stats.drops++; |
return ret; |
} |
|
|
static unsigned int dsmark_drop(struct Qdisc *sch) |
{ |
struct dsmark_qdisc_data *p = PRIV(sch); |
unsigned int len; |
|
DPRINTK("dsmark_drop(sch %p,[qdisc %p])\n",sch,p); |
if (!p->q->ops->drop) |
return 0; |
if (!(len = p->q->ops->drop(p->q))) |
return 0; |
sch->q.qlen--; |
return len; |
} |
|
|
int dsmark_init(struct Qdisc *sch,struct rtattr *opt) |
{ |
struct dsmark_qdisc_data *p = PRIV(sch); |
struct rtattr *tb[TCA_DSMARK_MAX]; |
__u16 tmp; |
|
DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); |
if (rtattr_parse(tb,TCA_DSMARK_MAX,RTA_DATA(opt),RTA_PAYLOAD(opt)) < 0 || |
!tb[TCA_DSMARK_INDICES-1] || |
RTA_PAYLOAD(tb[TCA_DSMARK_INDICES-1]) < sizeof(__u16)) |
return -EINVAL; |
memset(p,0,sizeof(*p)); |
p->filter_list = NULL; |
p->indices = *(__u16 *) RTA_DATA(tb[TCA_DSMARK_INDICES-1]); |
if (!p->indices) |
return -EINVAL; |
for (tmp = p->indices; tmp != 1; tmp >>= 1) { |
if (tmp & 1) |
return -EINVAL; |
} |
p->default_index = NO_DEFAULT_INDEX; |
if (tb[TCA_DSMARK_DEFAULT_INDEX-1]) { |
if (RTA_PAYLOAD(tb[TCA_DSMARK_DEFAULT_INDEX-1]) < sizeof(__u16)) |
return -EINVAL; |
p->default_index = |
*(__u16 *) RTA_DATA(tb[TCA_DSMARK_DEFAULT_INDEX-1]); |
} |
p->set_tc_index = !!tb[TCA_DSMARK_SET_TC_INDEX-1]; |
p->mask = kmalloc(p->indices*2,GFP_KERNEL); |
if (!p->mask) |
return -ENOMEM; |
p->value = p->mask+p->indices; |
memset(p->mask,0xff,p->indices); |
memset(p->value,0,p->indices); |
if (!(p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) |
p->q = &noop_qdisc; |
DPRINTK("dsmark_init: qdisc %p\n",p->q); |
MOD_INC_USE_COUNT; |
return 0; |
} |
|
|
static void dsmark_reset(struct Qdisc *sch) |
{ |
struct dsmark_qdisc_data *p = PRIV(sch); |
|
DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p); |
qdisc_reset(p->q); |
sch->q.qlen = 0; |
} |
|
|
static void dsmark_destroy(struct Qdisc *sch) |
{ |
struct dsmark_qdisc_data *p = PRIV(sch); |
struct tcf_proto *tp; |
|
DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n",sch,p); |
while (p->filter_list) { |
tp = p->filter_list; |
p->filter_list = tp->next; |
tcf_destroy(tp); |
} |
qdisc_destroy(p->q); |
p->q = &noop_qdisc; |
kfree(p->mask); |
MOD_DEC_USE_COUNT; |
} |
|
|
static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, |
struct sk_buff *skb, struct tcmsg *tcm) |
{ |
struct dsmark_qdisc_data *p = PRIV(sch); |
unsigned char *b = skb->tail; |
struct rtattr *rta; |
|
DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n",sch,p,cl); |
if (!cl || cl > p->indices) |
return -EINVAL; |
tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle),cl-1); |
rta = (struct rtattr *) b; |
RTA_PUT(skb,TCA_OPTIONS,0,NULL); |
RTA_PUT(skb,TCA_DSMARK_MASK,1,&p->mask[cl-1]); |
RTA_PUT(skb,TCA_DSMARK_VALUE,1,&p->value[cl-1]); |
rta->rta_len = skb->tail-b; |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb,b-skb->data); |
return -1; |
} |
|
static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) |
{ |
struct dsmark_qdisc_data *p = PRIV(sch); |
unsigned char *b = skb->tail; |
struct rtattr *rta; |
|
rta = (struct rtattr *) b; |
RTA_PUT(skb,TCA_OPTIONS,0,NULL); |
RTA_PUT(skb,TCA_DSMARK_INDICES,sizeof(__u16),&p->indices); |
if (p->default_index != NO_DEFAULT_INDEX) { |
__u16 tmp = p->default_index; |
|
RTA_PUT(skb,TCA_DSMARK_DEFAULT_INDEX, sizeof(__u16), &tmp); |
} |
if (p->set_tc_index) |
RTA_PUT(skb, TCA_DSMARK_SET_TC_INDEX, 0, NULL); |
rta->rta_len = skb->tail-b; |
return skb->len; |
|
rtattr_failure: |
skb_trim(skb,b-skb->data); |
return -1; |
} |
|
static struct Qdisc_class_ops dsmark_class_ops = |
{ |
dsmark_graft, /* graft */ |
dsmark_leaf, /* leaf */ |
dsmark_get, /* get */ |
dsmark_put, /* put */ |
dsmark_change, /* change */ |
dsmark_delete, /* delete */ |
dsmark_walk, /* walk */ |
|
dsmark_find_tcf, /* tcf_chain */ |
dsmark_bind_filter, /* bind_tcf */ |
dsmark_put, /* unbind_tcf */ |
|
dsmark_dump_class, /* dump */ |
}; |
|
struct Qdisc_ops dsmark_qdisc_ops = |
{ |
NULL, /* next */ |
&dsmark_class_ops, /* cl_ops */ |
"dsmark", |
sizeof(struct dsmark_qdisc_data), |
|
dsmark_enqueue, /* enqueue */ |
dsmark_dequeue, /* dequeue */ |
dsmark_requeue, /* requeue */ |
dsmark_drop, /* drop */ |
|
dsmark_init, /* init */ |
dsmark_reset, /* reset */ |
dsmark_destroy, /* destroy */ |
NULL, /* change */ |
|
dsmark_dump /* dump */ |
}; |
|
#ifdef MODULE |
int init_module(void) |
{ |
return register_qdisc(&dsmark_qdisc_ops); |
} |
|
|
void cleanup_module(void) |
{ |
unregister_qdisc(&dsmark_qdisc_ops); |
} |
#endif |
MODULE_LICENSE("GPL"); |
/sch_hfsc.c
0,0 → 1,1841
/* |
* Copyright (c) 2003 Patrick McHardy, <kaber@trash.net> |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version 2 |
* of the License, or (at your option) any later version. |
* |
* 2003-10-17 - Ported from altq |
*/ |
/* |
* Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. |
* |
* Permission to use, copy, modify, and distribute this software and |
* its documentation is hereby granted (including for commercial or |
* for-profit use), provided that both the copyright notice and this |
* permission notice appear in all copies of the software, derivative |
* works, or modified versions, and any portions thereof. |
* |
* THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF |
* WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS |
* SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED |
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
* DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE |
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT |
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE |
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH |
* DAMAGE. |
* |
* Carnegie Mellon encourages (but does not require) users of this |
* software to return any improvements or extensions that they make, |
* and to grant Carnegie Mellon the rights to redistribute these |
* changes without encumbrance. |
*/ |
/* |
* H-FSC is described in Proceedings of SIGCOMM'97, |
* "A Hierarchical Fair Service Curve Algorithm for Link-Sharing, |
* Real-Time and Priority Service" |
* by Ion Stoica, Hui Zhang, and T. S. Eugene Ng. |
* |
* Oleg Cherevko <olwi@aq.ml.com.ua> added the upperlimit for link-sharing. |
* when a class has an upperlimit, the fit-time is computed from the |
* upperlimit service curve. the link-sharing scheduler does not schedule |
* a class whose fit-time exceeds the current time. |
*/ |
|
#include <linux/kernel.h> |
#include <linux/config.h> |
#include <linux/module.h> |
#include <linux/types.h> |
#include <linux/errno.h> |
#include <linux/compiler.h> |
#include <linux/spinlock.h> |
#include <linux/skbuff.h> |
#include <linux/string.h> |
#include <linux/slab.h> |
#include <linux/timer.h> |
#include <linux/list.h> |
#include <linux/init.h> |
#include <linux/netdevice.h> |
#include <linux/rtnetlink.h> |
#include <linux/pkt_sched.h> |
#include <net/pkt_sched.h> |
#include <net/pkt_cls.h> |
#include <asm/system.h> |
#include <asm/div64.h> |
|
#define HFSC_DEBUG 1 |
|
/* |
* kernel internal service curve representation: |
* coordinates are given by 64 bit unsigned integers. |
* x-axis: unit is clock count. |
* y-axis: unit is byte. |
* |
* The service curve parameters are converted to the internal |
* representation. The slope values are scaled to avoid overflow. |
* the inverse slope values as well as the y-projection of the 1st |
* segment are kept in order to to avoid 64-bit divide operations |
* that are expensive on 32-bit architectures. |
*/ |
|
struct internal_sc |
{ |
u64 sm1; /* scaled slope of the 1st segment */ |
u64 ism1; /* scaled inverse-slope of the 1st segment */ |
u64 dx; /* the x-projection of the 1st segment */ |
u64 dy; /* the y-projection of the 1st segment */ |
u64 sm2; /* scaled slope of the 2nd segment */ |
u64 ism2; /* scaled inverse-slope of the 2nd segment */ |
}; |
|
/* runtime service curve */ |
struct runtime_sc |
{ |
u64 x; /* current starting position on x-axis */ |
u64 y; /* current starting position on y-axis */ |
u64 sm1; /* scaled slope of the 1st segment */ |
u64 ism1; /* scaled inverse-slope of the 1st segment */ |
u64 dx; /* the x-projection of the 1st segment */ |
u64 dy; /* the y-projection of the 1st segment */ |
u64 sm2; /* scaled slope of the 2nd segment */ |
u64 ism2; /* scaled inverse-slope of the 2nd segment */ |
}; |
|
enum hfsc_class_flags |
{ |
HFSC_RSC = 0x1, |
HFSC_FSC = 0x2, |
HFSC_USC = 0x4 |
}; |
|
struct hfsc_class |
{ |
u32 classid; /* class id */ |
unsigned int refcnt; /* usage count */ |
|
struct tc_stats stats; /* generic statistics */ |
unsigned int level; /* class level in hierarchy */ |
struct tcf_proto *filter_list; /* filter list */ |
unsigned int filter_cnt; /* filter count */ |
|
struct hfsc_sched *sched; /* scheduler data */ |
struct hfsc_class *cl_parent; /* parent class */ |
struct list_head siblings; /* sibling classes */ |
struct list_head children; /* child classes */ |
struct Qdisc *qdisc; /* leaf qdisc */ |
|
struct list_head actlist; /* active children list */ |
struct list_head alist; /* active children list member */ |
struct list_head ellist; /* eligible list member */ |
struct list_head hlist; /* hash list member */ |
struct list_head dlist; /* drop list member */ |
|
u64 cl_total; /* total work in bytes */ |
u64 cl_cumul; /* cumulative work in bytes done by |
real-time criteria */ |
|
u64 cl_d; /* deadline*/ |
u64 cl_e; /* eligible time */ |
u64 cl_vt; /* virtual time */ |
u64 cl_f; /* time when this class will fit for |
link-sharing, max(myf, cfmin) */ |
u64 cl_myf; /* my fit-time (calculated from this |
class's own upperlimit curve) */ |
u64 cl_myfadj; /* my fit-time adjustment (to cancel |
history dependence) */ |
u64 cl_cfmin; /* earliest children's fit-time (used |
with cl_myf to obtain cl_f) */ |
u64 cl_cvtmin; /* minimal virtual time among the |
children fit for link-sharing |
(monotonic within a period) */ |
u64 cl_vtadj; /* intra-period cumulative vt |
adjustment */ |
u64 cl_vtoff; /* inter-period cumulative vt offset */ |
u64 cl_cvtmax; /* max child's vt in the last period */ |
|
struct internal_sc cl_rsc; /* internal real-time service curve */ |
struct internal_sc cl_fsc; /* internal fair service curve */ |
struct internal_sc cl_usc; /* internal upperlimit service curve */ |
struct runtime_sc cl_deadline; /* deadline curve */ |
struct runtime_sc cl_eligible; /* eligible curve */ |
struct runtime_sc cl_virtual; /* virtual curve */ |
struct runtime_sc cl_ulimit; /* upperlimit curve */ |
|
unsigned long cl_flags; /* which curves are valid */ |
unsigned long cl_vtperiod; /* vt period sequence number */ |
unsigned long cl_parentperiod;/* parent's vt period sequence number*/ |
unsigned long cl_nactive; /* number of active children */ |
}; |
|
#define HFSC_HSIZE 16 |
|
struct hfsc_sched |
{ |
u16 defcls; /* default class id */ |
struct hfsc_class root; /* root class */ |
struct list_head clhash[HFSC_HSIZE]; /* class hash */ |
struct list_head eligible; /* eligible list */ |
struct list_head droplist; /* active leaf class list (for |
dropping) */ |
struct sk_buff_head requeue; /* requeued packet */ |
struct timer_list wd_timer; /* watchdog timer */ |
}; |
|
/* |
* macros |
*/ |
#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY |
#include <linux/time.h> |
#undef PSCHED_GET_TIME |
#define PSCHED_GET_TIME(stamp) \ |
do { \ |
struct timeval tv; \ |
do_gettimeofday(&tv); \ |
(stamp) = 1000000ULL * tv.tv_sec + tv.tv_usec; \ |
} while (0) |
#endif |
|
#if HFSC_DEBUG |
#define ASSERT(cond) \ |
do { \ |
if (unlikely(!(cond))) \ |
printk("assertion %s failed at %s:%i (%s)\n", \ |
#cond, __FILE__, __LINE__, __FUNCTION__); \ |
} while (0) |
#else |
#define ASSERT(cond) |
#endif /* HFSC_DEBUG */ |
|
#define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */ |
|
|
/* |
* The eligible list holds backlogged classes sorted by their eligible times. |
* There is one eligible list per hfsc instance. |
*/ |
|
static void |
ellist_insert(struct hfsc_class *cl) |
{ |
struct list_head *head = &cl->sched->eligible; |
struct hfsc_class *p; |
|
/* check the last entry first */ |
if (list_empty(head) || |
((p = list_entry(head->prev, struct hfsc_class, ellist)) && |
p->cl_e <= cl->cl_e)) { |
list_add_tail(&cl->ellist, head); |
return; |
} |
|
list_for_each_entry(p, head, ellist) { |
if (cl->cl_e < p->cl_e) { |
/* insert cl before p */ |
list_add_tail(&cl->ellist, &p->ellist); |
return; |
} |
} |
ASSERT(0); /* should not reach here */ |
} |
|
static inline void |
ellist_remove(struct hfsc_class *cl) |
{ |
list_del(&cl->ellist); |
} |
|
static void |
ellist_update(struct hfsc_class *cl) |
{ |
struct list_head *head = &cl->sched->eligible; |
struct hfsc_class *p, *last; |
|
/* |
* the eligible time of a class increases monotonically. |
* if the next entry has a larger eligible time, nothing to do. |
*/ |
if (cl->ellist.next == head || |
((p = list_entry(cl->ellist.next, struct hfsc_class, ellist)) && |
cl->cl_e <= p->cl_e)) |
return; |
|
/* check the last entry */ |
last = list_entry(head->prev, struct hfsc_class, ellist); |
if (last->cl_e <= cl->cl_e) { |
list_move_tail(&cl->ellist, head); |
return; |
} |
|
/* |
* the new position must be between the next entry |
* and the last entry |
*/ |
list_for_each_entry_continue(p, head, ellist) { |
if (cl->cl_e < p->cl_e) { |
list_move_tail(&cl->ellist, &p->ellist); |
return; |
} |
} |
ASSERT(0); /* should not reach here */ |
} |
|
/* find the class with the minimum deadline among the eligible classes */ |
static inline struct hfsc_class * |
ellist_get_mindl(struct list_head *head, u64 cur_time) |
{ |
struct hfsc_class *p, *cl = NULL; |
|
list_for_each_entry(p, head, ellist) { |
if (p->cl_e > cur_time) |
break; |
if (cl == NULL || p->cl_d < cl->cl_d) |
cl = p; |
} |
return cl; |
} |
|
/* find the class with minimum eligible time among the eligible classes */ |
static inline struct hfsc_class * |
ellist_get_minel(struct list_head *head) |
{ |
if (list_empty(head)) |
return NULL; |
return list_entry(head->next, struct hfsc_class, ellist); |
} |
|
/* |
* The active children list holds backlogged child classes sorted |
* by their virtual time. Each intermediate class has one active |
* children list. |
*/ |
static void |
actlist_insert(struct hfsc_class *cl) |
{ |
struct list_head *head = &cl->cl_parent->actlist; |
struct hfsc_class *p; |
|
/* check the last entry first */ |
if (list_empty(head) || |
((p = list_entry(head->prev, struct hfsc_class, alist)) && |
p->cl_vt <= cl->cl_vt)) { |
list_add_tail(&cl->alist, head); |
return; |
} |
|
list_for_each_entry(p, head, alist) { |
if (cl->cl_vt < p->cl_vt) { |
/* insert cl before p */ |
list_add_tail(&cl->alist, &p->alist); |
return; |
} |
} |
ASSERT(0); /* should not reach here */ |
} |
|
static inline void |
actlist_remove(struct hfsc_class *cl) |
{ |
list_del(&cl->alist); |
} |
|
static void |
actlist_update(struct hfsc_class *cl) |
{ |
struct list_head *head = &cl->cl_parent->actlist; |
struct hfsc_class *p, *last; |
|
/* |
* the virtual time of a class increases monotonically. |
* if the next entry has a larger virtual time, nothing to do. |
*/ |
if (cl->alist.next == head || |
((p = list_entry(cl->alist.next, struct hfsc_class, alist)) && |
cl->cl_vt <= p->cl_vt)) |
return; |
|
/* check the last entry */ |
last = list_entry(head->prev, struct hfsc_class, alist); |
if (last->cl_vt <= cl->cl_vt) { |
list_move_tail(&cl->alist, head); |
return; |
} |
|
/* |
* the new position must be between the next entry |
* and the last entry |
*/ |
list_for_each_entry_continue(p, head, alist) { |
if (cl->cl_vt < p->cl_vt) { |
list_move_tail(&cl->alist, &p->alist); |
return; |
} |
} |
ASSERT(0); /* should not reach here */ |
} |
|
static inline struct hfsc_class * |
actlist_firstfit(struct hfsc_class *cl, u64 cur_time) |
{ |
struct hfsc_class *p; |
|
list_for_each_entry(p, &cl->actlist, alist) { |
if (p->cl_f <= cur_time) { |
return p; |
} |
} |
return NULL; |
} |
|
/* |
* get the leaf class with the minimum vt in the hierarchy |
*/ |
static struct hfsc_class * |
actlist_get_minvt(struct hfsc_class *cl, u64 cur_time) |
{ |
	/* if the root class's cfmin is bigger than cur_time, there is nothing to do */ |
if (cl->cl_cfmin > cur_time) |
return NULL; |
|
while (cl->level > 0) { |
cl = actlist_firstfit(cl, cur_time); |
if (cl == NULL) |
return NULL; |
/* |
* update parent's cl_cvtmin. |
*/ |
if (cl->cl_parent->cl_cvtmin < cl->cl_vt) |
cl->cl_parent->cl_cvtmin = cl->cl_vt; |
} |
return cl; |
} |
|
/* |
* service curve support functions |
* |
* external service curve parameters |
* m: bps |
* d: us |
* internal service curve parameters |
* sm: (bytes/psched_us) << SM_SHIFT |
* ism: (psched_us/byte) << ISM_SHIFT |
* dx: psched_us |
* |
* Time source resolution |
* PSCHED_JIFFIES: for 48<=HZ<=1534 resolution is between 0.63us and 1.27us. |
* PSCHED_CPU: resolution is between 0.5us and 1us. |
* PSCHED_GETTIMEOFDAY: resolution is exactly 1us. |
* |
 * sm and ism are scaled so that enough effective digits are retained. |
 * SM_SHIFT and ISM_SHIFT are selected to keep at least 4 effective |
 * digits in decimal, using the following table. |
 * |
 * Note: We can afford the additional accuracy (the ALTQ hfsc keeps at |
 * most 3 effective digits) because the Linux clock resolution is |
 * bounded much more tightly. |
* |
* bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps |
* ------------+------------------------------------------------------- |
 * bytes/0.5us 6.25e-3 62.5e-3 625e-3 6250e-3 62500e-3 |
* bytes/us 12.5e-3 125e-3 1250e-3 12500e-3 125000e-3 |
* bytes/1.27us 15.875e-3 158.75e-3 1587.5e-3 15875e-3 158750e-3 |
* |
* 0.5us/byte 160 16 1.6 0.16 0.016 |
* us/byte 80 8 0.8 0.08 0.008 |
* 1.27us/byte 63 6.3 0.63 0.063 0.0063 |
*/ |
#define SM_SHIFT 20 |
#define ISM_SHIFT 18 |
|
#define SM_MASK ((1ULL << SM_SHIFT) - 1) |
#define ISM_MASK ((1ULL << ISM_SHIFT) - 1) |
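 |
/* |
 * worked example (for illustration): at 1 Mbps the table above gives |
 * 0.125 bytes/us and 8 us/byte, so with a 1 us clock |
 * sm = 0.125 * 2^20 = 131072 and ism = 8 * 2^18 = 2097152. |
 */ |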
|
static inline u64 |
seg_x2y(u64 x, u64 sm) |
{ |
u64 y; |
|
/* |
* compute |
* y = x * sm >> SM_SHIFT |
	 * but compute the upper and lower SM_SHIFT bits of x separately to avoid overflow |
*/ |
y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT); |
return y; |
} |
|
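/* |
 * inverse of seg_x2y(); ism == HT_INFINITY corresponds to a zero-rate |
 * segment (see m2ism() below), so any y > 0 maps to an infinite time. |
 */ |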
static inline u64 |
seg_y2x(u64 y, u64 ism) |
{ |
u64 x; |
|
if (y == 0) |
x = 0; |
else if (ism == HT_INFINITY) |
x = HT_INFINITY; |
else { |
x = (y >> ISM_SHIFT) * ism |
+ (((y & ISM_MASK) * ism) >> ISM_SHIFT); |
} |
return x; |
} |
|
/* Convert m (bps) into sm (bytes/psched us) */ |
static u64 |
m2sm(u32 m) |
{ |
u64 sm; |
|
sm = ((u64)m << SM_SHIFT); |
sm += PSCHED_JIFFIE2US(HZ) - 1; |
do_div(sm, PSCHED_JIFFIE2US(HZ)); |
return sm; |
} |
|
/* convert m (bps) into ism (psched us/byte) */ |
static u64 |
m2ism(u32 m) |
{ |
u64 ism; |
|
if (m == 0) |
ism = HT_INFINITY; |
else { |
ism = ((u64)PSCHED_JIFFIE2US(HZ) << ISM_SHIFT); |
ism += m - 1; |
do_div(ism, m); |
} |
return ism; |
} |
|
/* convert d (us) into dx (psched us) */ |
static u64 |
d2dx(u32 d) |
{ |
u64 dx; |
|
dx = ((u64)d * PSCHED_JIFFIE2US(HZ)); |
dx += 1000000 - 1; |
do_div(dx, 1000000); |
return dx; |
} |
|
/* convert sm (bytes/psched us) into m (bps) */ |
static u32 |
sm2m(u64 sm) |
{ |
u64 m; |
|
m = (sm * PSCHED_JIFFIE2US(HZ)) >> SM_SHIFT; |
return (u32)m; |
} |
|
/* convert dx (psched us) into d (us) */ |
static u32 |
dx2d(u64 dx) |
{ |
u64 d; |
|
d = dx * 1000000; |
do_div(d, PSCHED_JIFFIE2US(HZ)); |
return (u32)d; |
} |
|
static void |
sc2isc(struct tc_service_curve *sc, struct internal_sc *isc) |
{ |
isc->sm1 = m2sm(sc->m1); |
isc->ism1 = m2ism(sc->m1); |
isc->dx = d2dx(sc->d); |
isc->dy = seg_x2y(isc->dx, isc->sm1); |
isc->sm2 = m2sm(sc->m2); |
isc->ism2 = m2ism(sc->m2); |
} |
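 |
/* |
 * the resulting internal_sc describes a two-segment curve: slope sm1 for |
 * the first dx time units (rising by dy bytes), then slope sm2 thereafter. |
 */ |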
|
/* |
* initialize the runtime service curve with the given internal |
* service curve starting at (x, y). |
*/ |
static void |
rtsc_init(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y) |
{ |
rtsc->x = x; |
rtsc->y = y; |
rtsc->sm1 = isc->sm1; |
rtsc->ism1 = isc->ism1; |
rtsc->dx = isc->dx; |
rtsc->dy = isc->dy; |
rtsc->sm2 = isc->sm2; |
rtsc->ism2 = isc->ism2; |
} |
|
/* |
* calculate the y-projection of the runtime service curve by the |
* given x-projection value |
*/ |
static u64 |
rtsc_y2x(struct runtime_sc *rtsc, u64 y) |
{ |
u64 x; |
|
if (y < rtsc->y) |
x = rtsc->x; |
else if (y <= rtsc->y + rtsc->dy) { |
/* x belongs to the 1st segment */ |
if (rtsc->dy == 0) |
x = rtsc->x + rtsc->dx; |
else |
x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1); |
} else { |
/* x belongs to the 2nd segment */ |
x = rtsc->x + rtsc->dx |
+ seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2); |
} |
return x; |
} |
|
static u64 |
rtsc_x2y(struct runtime_sc *rtsc, u64 x) |
{ |
u64 y; |
|
if (x <= rtsc->x) |
y = rtsc->y; |
else if (x <= rtsc->x + rtsc->dx) |
/* y belongs to the 1st segment */ |
y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1); |
else |
/* y belongs to the 2nd segment */ |
y = rtsc->y + rtsc->dy |
+ seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2); |
return y; |
} |
|
/* |
* update the runtime service curve by taking the minimum of the current |
* runtime service curve and the service curve starting at (x, y). |
*/ |
static void |
rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y) |
{ |
u64 y1, y2, dx, dy; |
u32 dsm; |
|
if (isc->sm1 <= isc->sm2) { |
/* service curve is convex */ |
y1 = rtsc_x2y(rtsc, x); |
if (y1 < y) |
/* the current rtsc is smaller */ |
return; |
rtsc->x = x; |
rtsc->y = y; |
return; |
} |
|
/* |
* service curve is concave |
* compute the two y values of the current rtsc |
* y1: at x |
* y2: at (x + dx) |
*/ |
y1 = rtsc_x2y(rtsc, x); |
if (y1 <= y) { |
/* rtsc is below isc, no change to rtsc */ |
return; |
} |
|
y2 = rtsc_x2y(rtsc, x + isc->dx); |
if (y2 >= y + isc->dy) { |
/* rtsc is above isc, replace rtsc by isc */ |
rtsc->x = x; |
rtsc->y = y; |
rtsc->dx = isc->dx; |
rtsc->dy = isc->dy; |
return; |
} |
|
/* |
* the two curves intersect |
* compute the offsets (dx, dy) using the reverse |
* function of seg_x2y() |
* seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y) |
*/ |
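	/* i.e. (dx * (sm1 - sm2)) >> SM_SHIFT == y1 - y, hence the shift and division below */ |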
dx = (y1 - y) << SM_SHIFT; |
dsm = isc->sm1 - isc->sm2; |
do_div(dx, dsm); |
/* |
* check if (x, y1) belongs to the 1st segment of rtsc. |
* if so, add the offset. |
*/ |
if (rtsc->x + rtsc->dx > x) |
dx += rtsc->x + rtsc->dx - x; |
dy = seg_x2y(dx, isc->sm1); |
|
rtsc->x = x; |
rtsc->y = y; |
rtsc->dx = dx; |
rtsc->dy = dy; |
return; |
} |
|
static void |
init_ed(struct hfsc_class *cl, unsigned int next_len) |
{ |
u64 cur_time; |
|
PSCHED_GET_TIME(cur_time); |
|
/* update the deadline curve */ |
rtsc_min(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); |
|
/* |
* update the eligible curve. |
* for concave, it is equal to the deadline curve. |
* for convex, it is a linear curve with slope m2. |
*/ |
cl->cl_eligible = cl->cl_deadline; |
if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) { |
cl->cl_eligible.dx = 0; |
cl->cl_eligible.dy = 0; |
} |
|
/* compute e and d */ |
cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); |
cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); |
|
ellist_insert(cl); |
} |
|
static void |
update_ed(struct hfsc_class *cl, unsigned int next_len) |
{ |
cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); |
cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); |
|
ellist_update(cl); |
} |
|
static inline void |
update_d(struct hfsc_class *cl, unsigned int next_len) |
{ |
cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); |
} |
|
static void |
update_cfmin(struct hfsc_class *cl) |
{ |
struct hfsc_class *p; |
u64 cfmin; |
|
if (list_empty(&cl->actlist)) { |
cl->cl_cfmin = 0; |
return; |
} |
cfmin = HT_INFINITY; |
list_for_each_entry(p, &cl->actlist, alist) { |
if (p->cl_f == 0) { |
cl->cl_cfmin = 0; |
return; |
} |
if (p->cl_f < cfmin) |
cfmin = p->cl_f; |
} |
cl->cl_cfmin = cfmin; |
} |
|
static void |
init_vf(struct hfsc_class *cl, unsigned int len) |
{ |
struct hfsc_class *max_cl, *p; |
u64 vt, f, cur_time; |
int go_active; |
|
cur_time = 0; |
go_active = 1; |
for (; cl->cl_parent != NULL; cl = cl->cl_parent) { |
if (go_active && cl->cl_nactive++ == 0) |
go_active = 1; |
else |
go_active = 0; |
|
if (go_active) { |
if (!list_empty(&cl->cl_parent->actlist)) { |
max_cl = list_entry(cl->cl_parent->actlist.prev, |
struct hfsc_class, alist); |
/* |
* set vt to the average of the min and max |
* classes. if the parent's period didn't |
* change, don't decrease vt of the class. |
*/ |
vt = max_cl->cl_vt; |
if (cl->cl_parent->cl_cvtmin != 0) |
vt = (cl->cl_parent->cl_cvtmin + vt)/2; |
|
if (cl->cl_parent->cl_vtperiod != |
cl->cl_parentperiod || vt > cl->cl_vt) |
cl->cl_vt = vt; |
} else { |
/* |
* first child for a new parent backlog period. |
* add parent's cvtmax to vtoff of children |
* to make a new vt (vtoff + vt) larger than |
* the vt in the last period for all children. |
*/ |
vt = cl->cl_parent->cl_cvtmax; |
list_for_each_entry(p, &cl->cl_parent->children, |
siblings) |
p->cl_vtoff += vt; |
cl->cl_vt = 0; |
cl->cl_parent->cl_cvtmax = 0; |
cl->cl_parent->cl_cvtmin = 0; |
} |
|
/* update the virtual curve */ |
vt = cl->cl_vt + cl->cl_vtoff; |
rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt, |
cl->cl_total); |
if (cl->cl_virtual.x == vt) { |
cl->cl_virtual.x -= cl->cl_vtoff; |
cl->cl_vtoff = 0; |
} |
cl->cl_vtadj = 0; |
|
cl->cl_vtperiod++; /* increment vt period */ |
cl->cl_parentperiod = cl->cl_parent->cl_vtperiod; |
if (cl->cl_parent->cl_nactive == 0) |
cl->cl_parentperiod++; |
cl->cl_f = 0; |
|
actlist_insert(cl); |
|
if (cl->cl_flags & HFSC_USC) { |
/* class has upper limit curve */ |
if (cur_time == 0) |
PSCHED_GET_TIME(cur_time); |
|
/* update the ulimit curve */ |
rtsc_min(&cl->cl_ulimit, &cl->cl_usc, cur_time, |
cl->cl_total); |
/* compute myf */ |
cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, |
cl->cl_total); |
cl->cl_myfadj = 0; |
} |
} |
|
f = max(cl->cl_myf, cl->cl_cfmin); |
if (f != cl->cl_f) { |
cl->cl_f = f; |
update_cfmin(cl->cl_parent); |
} |
} |
} |
|
static void |
update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) |
{ |
u64 f; /* , myf_bound, delta; */ |
int go_passive = 0; |
|
if (cl->qdisc->q.qlen == 0 && cl->cl_flags & HFSC_FSC) |
go_passive = 1; |
|
for (; cl->cl_parent != NULL; cl = cl->cl_parent) { |
cl->cl_total += len; |
|
if (!(cl->cl_flags & HFSC_FSC) || cl->cl_nactive == 0) |
continue; |
|
if (go_passive && --cl->cl_nactive == 0) |
go_passive = 1; |
else |
go_passive = 0; |
|
if (go_passive) { |
/* no more active child, going passive */ |
|
/* update cvtmax of the parent class */ |
if (cl->cl_vt > cl->cl_parent->cl_cvtmax) |
cl->cl_parent->cl_cvtmax = cl->cl_vt; |
|
/* remove this class from the vt list */ |
actlist_remove(cl); |
|
update_cfmin(cl->cl_parent); |
|
continue; |
} |
|
/* |
* update vt and f |
*/ |
cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) |
- cl->cl_vtoff + cl->cl_vtadj; |
|
/* |
* if vt of the class is smaller than cvtmin, |
* the class was skipped in the past due to non-fit. |
* if so, we need to adjust vtadj. |
*/ |
if (cl->cl_vt < cl->cl_parent->cl_cvtmin) { |
cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt; |
cl->cl_vt = cl->cl_parent->cl_cvtmin; |
} |
|
/* update the vt list */ |
actlist_update(cl); |
|
if (cl->cl_flags & HFSC_USC) { |
cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit, |
cl->cl_total); |
#if 0 |
/* |
* This code causes classes to stay way under their |
* limit when multiple classes are used at gigabit |
* speed. needs investigation. -kaber |
*/ |
/* |
* if myf lags behind by more than one clock tick |
* from the current time, adjust myfadj to prevent |
* a rate-limited class from going greedy. |
* in a steady state under rate-limiting, myf |
* fluctuates within one clock tick. |
*/ |
myf_bound = cur_time - PSCHED_JIFFIE2US(1); |
if (cl->cl_myf < myf_bound) { |
delta = cur_time - cl->cl_myf; |
cl->cl_myfadj += delta; |
cl->cl_myf += delta; |
} |
#endif |
} |
|
f = max(cl->cl_myf, cl->cl_cfmin); |
if (f != cl->cl_f) { |
cl->cl_f = f; |
update_cfmin(cl->cl_parent); |
} |
} |
} |
|
static void |
set_active(struct hfsc_class *cl, unsigned int len) |
{ |
if (cl->cl_flags & HFSC_RSC) |
init_ed(cl, len); |
if (cl->cl_flags & HFSC_FSC) |
init_vf(cl, len); |
|
list_add_tail(&cl->dlist, &cl->sched->droplist); |
} |
|
static void |
set_passive(struct hfsc_class *cl) |
{ |
if (cl->cl_flags & HFSC_RSC) |
ellist_remove(cl); |
|
list_del(&cl->dlist); |
|
/* |
	 * the actlist is now handled in update_vf(), so update_vf(cl, 0, 0) |
	 * must be called explicitly to remove a class from the actlist |
*/ |
} |
|
/* |
* hack to get length of first packet in queue. |
*/ |
static unsigned int |
qdisc_peek_len(struct Qdisc *sch) |
{ |
struct sk_buff *skb; |
unsigned int len; |
|
skb = sch->dequeue(sch); |
if (skb == NULL) { |
if (net_ratelimit()) |
printk("qdisc_peek_len: non work-conserving qdisc ?\n"); |
return 0; |
} |
len = skb->len; |
if (unlikely(sch->ops->requeue(skb, sch) != NET_XMIT_SUCCESS)) { |
if (net_ratelimit()) |
printk("qdisc_peek_len: failed to requeue\n"); |
return 0; |
} |
return len; |
} |
|
static void |
hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl) |
{ |
unsigned int len = cl->qdisc->q.qlen; |
|
qdisc_reset(cl->qdisc); |
if (len > 0) { |
update_vf(cl, 0, 0); |
set_passive(cl); |
sch->q.qlen -= len; |
} |
} |
|
static void |
hfsc_adjust_levels(struct hfsc_class *cl) |
{ |
struct hfsc_class *p; |
unsigned int level; |
|
do { |
level = 0; |
list_for_each_entry(p, &cl->children, siblings) { |
if (p->level > level) |
level = p->level; |
} |
cl->level = level + 1; |
} while ((cl = cl->cl_parent) != NULL); |
} |
|
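/* fold a classid into the class hash; the mask assumes HFSC_HSIZE is a power of two */ |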
static inline unsigned int |
hfsc_hash(u32 h) |
{ |
h ^= h >> 8; |
h ^= h >> 4; |
|
return h & (HFSC_HSIZE - 1); |
} |
|
static inline struct hfsc_class * |
hfsc_find_class(u32 classid, struct Qdisc *sch) |
{ |
struct hfsc_sched *q = (struct hfsc_sched *)sch->data; |
struct hfsc_class *cl; |
|
list_for_each_entry(cl, &q->clhash[hfsc_hash(classid)], hlist) { |
if (cl->classid == classid) |
return cl; |
} |
return NULL; |
} |
|
static void |
hfsc_change_rsc(struct hfsc_class *cl, struct tc_service_curve *rsc, |
u64 cur_time) |
{ |
sc2isc(rsc, &cl->cl_rsc); |
rtsc_init(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); |
cl->cl_eligible = cl->cl_deadline; |
if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) { |
cl->cl_eligible.dx = 0; |
cl->cl_eligible.dy = 0; |
} |
cl->cl_flags |= HFSC_RSC; |
} |
|
static void |
hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc) |
{ |
sc2isc(fsc, &cl->cl_fsc); |
rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total); |
cl->cl_flags |= HFSC_FSC; |
} |
|
static void |
hfsc_change_usc(struct hfsc_class *cl, struct tc_service_curve *usc, |
u64 cur_time) |
{ |
sc2isc(usc, &cl->cl_usc); |
rtsc_init(&cl->cl_ulimit, &cl->cl_usc, cur_time, cl->cl_total); |
cl->cl_flags |= HFSC_USC; |
} |
|
static int |
hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, |
struct rtattr **tca, unsigned long *arg) |
{ |
struct hfsc_sched *q = (struct hfsc_sched *)sch->data; |
struct hfsc_class *cl = (struct hfsc_class *)*arg; |
struct hfsc_class *parent = NULL; |
struct rtattr *opt = tca[TCA_OPTIONS-1]; |
struct rtattr *tb[TCA_HFSC_MAX]; |
struct tc_service_curve *rsc = NULL, *fsc = NULL, *usc = NULL; |
u64 cur_time; |
|
if (opt == NULL || |
rtattr_parse(tb, TCA_HFSC_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt))) |
return -EINVAL; |
|
if (tb[TCA_HFSC_RSC-1]) { |
if (RTA_PAYLOAD(tb[TCA_HFSC_RSC-1]) < sizeof(*rsc)) |
return -EINVAL; |
rsc = RTA_DATA(tb[TCA_HFSC_RSC-1]); |
if (rsc->m1 == 0 && rsc->m2 == 0) |
rsc = NULL; |
} |
|
if (tb[TCA_HFSC_FSC-1]) { |
if (RTA_PAYLOAD(tb[TCA_HFSC_FSC-1]) < sizeof(*fsc)) |
return -EINVAL; |
fsc = RTA_DATA(tb[TCA_HFSC_FSC-1]); |
if (fsc->m1 == 0 && fsc->m2 == 0) |
fsc = NULL; |
} |
|
if (tb[TCA_HFSC_USC-1]) { |
if (RTA_PAYLOAD(tb[TCA_HFSC_USC-1]) < sizeof(*usc)) |
return -EINVAL; |
usc = RTA_DATA(tb[TCA_HFSC_USC-1]); |
if (usc->m1 == 0 && usc->m2 == 0) |
usc = NULL; |
} |
|
if (cl != NULL) { |
if (parentid) { |
if (cl->cl_parent && cl->cl_parent->classid != parentid) |
return -EINVAL; |
if (cl->cl_parent == NULL && parentid != TC_H_ROOT) |
return -EINVAL; |
} |
PSCHED_GET_TIME(cur_time); |
|
sch_tree_lock(sch); |
if (rsc != NULL) |
hfsc_change_rsc(cl, rsc, cur_time); |
if (fsc != NULL) |
hfsc_change_fsc(cl, fsc); |
if (usc != NULL) |
hfsc_change_usc(cl, usc, cur_time); |
|
if (cl->qdisc->q.qlen != 0) { |
if (cl->cl_flags & HFSC_RSC) |
update_ed(cl, qdisc_peek_len(cl->qdisc)); |
if (cl->cl_flags & HFSC_FSC) |
update_vf(cl, 0, cur_time); |
} |
sch_tree_unlock(sch); |
|
#ifdef CONFIG_NET_ESTIMATOR |
if (tca[TCA_RATE-1]) { |
qdisc_kill_estimator(&cl->stats); |
qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]); |
} |
#endif |
return 0; |
} |
|
if (parentid == TC_H_ROOT) |
return -EEXIST; |
|
parent = &q->root; |
if (parentid) { |
parent = hfsc_find_class(parentid, sch); |
if (parent == NULL) |
return -ENOENT; |
} |
|
if (classid == 0 || TC_H_MAJ(classid ^ sch->handle) != 0) |
return -EINVAL; |
if (hfsc_find_class(classid, sch)) |
return -EEXIST; |
|
if (rsc == NULL && fsc == NULL) |
return -EINVAL; |
|
cl = kmalloc(sizeof(struct hfsc_class), GFP_KERNEL); |
if (cl == NULL) |
return -ENOBUFS; |
memset(cl, 0, sizeof(struct hfsc_class)); |
|
if (rsc != NULL) |
hfsc_change_rsc(cl, rsc, 0); |
if (fsc != NULL) |
hfsc_change_fsc(cl, fsc); |
if (usc != NULL) |
hfsc_change_usc(cl, usc, 0); |
|
cl->refcnt = 1; |
cl->classid = classid; |
cl->sched = q; |
cl->cl_parent = parent; |
cl->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); |
if (cl->qdisc == NULL) |
cl->qdisc = &noop_qdisc; |
cl->stats.lock = &sch->dev->queue_lock; |
INIT_LIST_HEAD(&cl->children); |
INIT_LIST_HEAD(&cl->actlist); |
|
sch_tree_lock(sch); |
list_add_tail(&cl->hlist, &q->clhash[hfsc_hash(classid)]); |
list_add_tail(&cl->siblings, &parent->children); |
if (parent->level == 0) |
hfsc_purge_queue(sch, parent); |
hfsc_adjust_levels(parent); |
sch_tree_unlock(sch); |
|
#ifdef CONFIG_NET_ESTIMATOR |
if (tca[TCA_RATE-1]) |
qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]); |
#endif |
*arg = (unsigned long)cl; |
return 0; |
} |
|
static void |
hfsc_destroy_filters(struct tcf_proto **fl) |
{ |
struct tcf_proto *tp; |
|
while ((tp = *fl) != NULL) { |
*fl = tp->next; |
tcf_destroy(tp); |
} |
} |
|
static void |
hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl) |
{ |
struct hfsc_sched *q = (struct hfsc_sched *)sch->data; |
|
hfsc_destroy_filters(&cl->filter_list); |
qdisc_destroy(cl->qdisc); |
#ifdef CONFIG_NET_ESTIMATOR |
qdisc_kill_estimator(&cl->stats); |
#endif |
if (cl != &q->root) |
kfree(cl); |
} |
|
static int |
hfsc_delete_class(struct Qdisc *sch, unsigned long arg) |
{ |
struct hfsc_sched *q = (struct hfsc_sched *)sch->data; |
struct hfsc_class *cl = (struct hfsc_class *)arg; |
|
if (cl->level > 0 || cl->filter_cnt > 0 || cl == &q->root) |
return -EBUSY; |
|
sch_tree_lock(sch); |
|
list_del(&cl->hlist); |
list_del(&cl->siblings); |
hfsc_adjust_levels(cl->cl_parent); |
hfsc_purge_queue(sch, cl); |
if (--cl->refcnt == 0) |
hfsc_destroy_class(sch, cl); |
|
sch_tree_unlock(sch); |
return 0; |
} |
|
static struct hfsc_class * |
hfsc_classify(struct sk_buff *skb, struct Qdisc *sch) |
{ |
struct hfsc_sched *q = (struct hfsc_sched *)sch->data; |
struct hfsc_class *cl; |
struct tcf_result res; |
struct tcf_proto *tcf; |
int result; |
|
if (TC_H_MAJ(skb->priority ^ sch->handle) == 0 && |
(cl = hfsc_find_class(skb->priority, sch)) != NULL) |
if (cl->level == 0) |
return cl; |
|
tcf = q->root.filter_list; |
while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { |
#ifdef CONFIG_NET_CLS_POLICE |
if (result == TC_POLICE_SHOT) |
return NULL; |
#endif |
if ((cl = (struct hfsc_class *)res.class) == NULL) { |
if ((cl = hfsc_find_class(res.classid, sch)) == NULL) |
break; /* filter selected invalid classid */ |
} |
|
if (cl->level == 0) |
return cl; /* hit leaf class */ |
|
/* apply inner filter chain */ |
tcf = cl->filter_list; |
} |
|
/* classification failed, try default class */ |
cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch); |
if (cl == NULL || cl->level > 0) |
return NULL; |
|
return cl; |
} |
|
static int |
hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, |
struct Qdisc **old) |
{ |
struct hfsc_class *cl = (struct hfsc_class *)arg; |
|
if (cl == NULL) |
return -ENOENT; |
if (cl->level > 0) |
return -EINVAL; |
if (new == NULL) { |
new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); |
if (new == NULL) |
new = &noop_qdisc; |
} |
|
sch_tree_lock(sch); |
hfsc_purge_queue(sch, cl); |
*old = xchg(&cl->qdisc, new); |
sch_tree_unlock(sch); |
return 0; |
} |
|
static struct Qdisc * |
hfsc_class_leaf(struct Qdisc *sch, unsigned long arg) |
{ |
struct hfsc_class *cl = (struct hfsc_class *)arg; |
|
if (cl != NULL && cl->level == 0) |
return cl->qdisc; |
|
return NULL; |
} |
|
static unsigned long |
hfsc_get_class(struct Qdisc *sch, u32 classid) |
{ |
struct hfsc_class *cl = hfsc_find_class(classid, sch); |
|
if (cl != NULL) |
cl->refcnt++; |
|
return (unsigned long)cl; |
} |
|
static void |
hfsc_put_class(struct Qdisc *sch, unsigned long arg) |
{ |
struct hfsc_class *cl = (struct hfsc_class *)arg; |
|
if (--cl->refcnt == 0) |
hfsc_destroy_class(sch, cl); |
} |
|
static unsigned long |
hfsc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid) |
{ |
struct hfsc_class *p = (struct hfsc_class *)parent; |
struct hfsc_class *cl = hfsc_find_class(classid, sch); |
|
if (cl != NULL) { |
if (p != NULL && p->level <= cl->level) |
return 0; |
cl->filter_cnt++; |
} |
|
return (unsigned long)cl; |
} |
|
static void |
hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg) |
{ |
struct hfsc_class *cl = (struct hfsc_class *)arg; |
|
cl->filter_cnt--; |
} |
|
static struct tcf_proto ** |
hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg) |
{ |
struct hfsc_sched *q = (struct hfsc_sched *)sch->data; |
struct hfsc_class *cl = (struct hfsc_class *)arg; |
|
if (cl == NULL) |
cl = &q->root; |
|
return &cl->filter_list; |
} |
|
static int |
hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc) |
{ |
struct tc_service_curve tsc; |
|
tsc.m1 = sm2m(sc->sm1); |
tsc.d = dx2d(sc->dx); |
tsc.m2 = sm2m(sc->sm2); |
RTA_PUT(skb, attr, sizeof(tsc), &tsc); |
|
return skb->len; |
|
rtattr_failure: |
return -1; |
} |
|
static inline int |
hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl) |
{ |
if ((cl->cl_flags & HFSC_RSC) && |
(hfsc_dump_sc(skb, TCA_HFSC_RSC, &cl->cl_rsc) < 0)) |
goto rtattr_failure; |
|
if ((cl->cl_flags & HFSC_FSC) && |
(hfsc_dump_sc(skb, TCA_HFSC_FSC, &cl->cl_fsc) < 0)) |
goto rtattr_failure; |
|
if ((cl->cl_flags & HFSC_USC) && |
(hfsc_dump_sc(skb, TCA_HFSC_USC, &cl->cl_usc) < 0)) |
goto rtattr_failure; |
|
return skb->len; |
|
rtattr_failure: |
return -1; |
} |
|
static inline int |
hfsc_dump_stats(struct sk_buff *skb, struct hfsc_class *cl) |
{ |
cl->stats.qlen = cl->qdisc->q.qlen; |
if (qdisc_copy_stats(skb, &cl->stats) < 0) |
goto rtattr_failure; |
|
return skb->len; |
|
rtattr_failure: |
return -1; |
} |
|
static inline int |
hfsc_dump_xstats(struct sk_buff *skb, struct hfsc_class *cl) |
{ |
struct tc_hfsc_stats xstats; |
|
xstats.level = cl->level; |
xstats.period = cl->cl_vtperiod; |
xstats.work = cl->cl_total; |
xstats.rtwork = cl->cl_cumul; |
RTA_PUT(skb, TCA_XSTATS, sizeof(xstats), &xstats); |
|
return skb->len; |
|
rtattr_failure: |
return -1; |
} |
|
static int |
hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, |
struct tcmsg *tcm) |
{ |
struct hfsc_class *cl = (struct hfsc_class *)arg; |
unsigned char *b = skb->tail; |
struct rtattr *rta = (struct rtattr *)b; |
|
tcm->tcm_parent = cl->cl_parent ? cl->cl_parent->classid : TC_H_ROOT; |
tcm->tcm_handle = cl->classid; |
if (cl->level == 0) |
tcm->tcm_info = cl->qdisc->handle; |
|
RTA_PUT(skb, TCA_OPTIONS, 0, NULL); |
if (hfsc_dump_curves(skb, cl) < 0) |
goto rtattr_failure; |
rta->rta_len = skb->tail - b; |
|
if ((hfsc_dump_stats(skb, cl) < 0) || |
(hfsc_dump_xstats(skb, cl) < 0)) |
goto rtattr_failure; |
|
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static void |
hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) |
{ |
struct hfsc_sched *q = (struct hfsc_sched *)sch->data; |
struct hfsc_class *cl; |
unsigned int i; |
|
if (arg->stop) |
return; |
|
for (i = 0; i < HFSC_HSIZE; i++) { |
list_for_each_entry(cl, &q->clhash[i], hlist) { |
if (arg->count < arg->skip) { |
arg->count++; |
continue; |
} |
if (arg->fn(sch, (unsigned long)cl, arg) < 0) { |
arg->stop = 1; |
return; |
} |
arg->count++; |
} |
} |
} |
|
static void |
hfsc_watchdog(unsigned long arg) |
{ |
struct Qdisc *sch = (struct Qdisc *)arg; |
|
sch->flags &= ~TCQ_F_THROTTLED; |
netif_schedule(sch->dev); |
} |
|
static void |
hfsc_schedule_watchdog(struct Qdisc *sch, u64 cur_time) |
{ |
struct hfsc_sched *q = (struct hfsc_sched *)sch->data; |
struct hfsc_class *cl; |
u64 next_time = 0; |
long delay; |
|
if ((cl = ellist_get_minel(&q->eligible)) != NULL) |
next_time = cl->cl_e; |
if (q->root.cl_cfmin != 0) { |
if (next_time == 0 || next_time > q->root.cl_cfmin) |
next_time = q->root.cl_cfmin; |
} |
ASSERT(next_time != 0); |
delay = next_time - cur_time; |
delay = PSCHED_US2JIFFIE(delay); |
|
sch->flags |= TCQ_F_THROTTLED; |
mod_timer(&q->wd_timer, jiffies + delay); |
} |
|
static int |
hfsc_init_qdisc(struct Qdisc *sch, struct rtattr *opt) |
{ |
struct hfsc_sched *q = (struct hfsc_sched *)sch->data; |
struct tc_hfsc_qopt *qopt; |
unsigned int i; |
|
if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) |
return -EINVAL; |
qopt = RTA_DATA(opt); |
|
memset(q, 0, sizeof(struct hfsc_sched)); |
sch->stats.lock = &sch->dev->queue_lock; |
|
q->defcls = qopt->defcls; |
for (i = 0; i < HFSC_HSIZE; i++) |
INIT_LIST_HEAD(&q->clhash[i]); |
INIT_LIST_HEAD(&q->eligible); |
INIT_LIST_HEAD(&q->droplist); |
skb_queue_head_init(&q->requeue); |
|
q->root.refcnt = 1; |
q->root.classid = sch->handle; |
q->root.sched = q; |
q->root.qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); |
if (q->root.qdisc == NULL) |
q->root.qdisc = &noop_qdisc; |
q->root.stats.lock = &sch->dev->queue_lock; |
INIT_LIST_HEAD(&q->root.children); |
INIT_LIST_HEAD(&q->root.actlist); |
|
list_add(&q->root.hlist, &q->clhash[hfsc_hash(q->root.classid)]); |
|
init_timer(&q->wd_timer); |
q->wd_timer.function = hfsc_watchdog; |
q->wd_timer.data = (unsigned long)sch; |
|
MOD_INC_USE_COUNT; |
return 0; |
} |
|
static int |
hfsc_change_qdisc(struct Qdisc *sch, struct rtattr *opt) |
{ |
struct hfsc_sched *q = (struct hfsc_sched *)sch->data; |
struct tc_hfsc_qopt *qopt; |
|
if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) |
		return -EINVAL; |
qopt = RTA_DATA(opt); |
|
sch_tree_lock(sch); |
q->defcls = qopt->defcls; |
sch_tree_unlock(sch); |
|
return 0; |
} |
|
static void |
hfsc_reset_class(struct hfsc_class *cl) |
{ |
cl->cl_total = 0; |
cl->cl_cumul = 0; |
cl->cl_d = 0; |
cl->cl_e = 0; |
cl->cl_vt = 0; |
cl->cl_vtadj = 0; |
cl->cl_vtoff = 0; |
cl->cl_cvtmin = 0; |
cl->cl_cvtmax = 0; |
cl->cl_vtperiod = 0; |
cl->cl_parentperiod = 0; |
cl->cl_f = 0; |
cl->cl_myf = 0; |
cl->cl_myfadj = 0; |
cl->cl_cfmin = 0; |
cl->cl_nactive = 0; |
INIT_LIST_HEAD(&cl->actlist); |
qdisc_reset(cl->qdisc); |
|
if (cl->cl_flags & HFSC_RSC) |
rtsc_init(&cl->cl_deadline, &cl->cl_rsc, 0, 0); |
if (cl->cl_flags & HFSC_FSC) |
rtsc_init(&cl->cl_virtual, &cl->cl_fsc, 0, 0); |
if (cl->cl_flags & HFSC_USC) |
rtsc_init(&cl->cl_ulimit, &cl->cl_usc, 0, 0); |
} |
|
static void |
hfsc_reset_qdisc(struct Qdisc *sch) |
{ |
struct hfsc_sched *q = (struct hfsc_sched *)sch->data; |
struct hfsc_class *cl; |
unsigned int i; |
|
for (i = 0; i < HFSC_HSIZE; i++) { |
list_for_each_entry(cl, &q->clhash[i], hlist) |
hfsc_reset_class(cl); |
} |
__skb_queue_purge(&q->requeue); |
INIT_LIST_HEAD(&q->eligible); |
INIT_LIST_HEAD(&q->droplist); |
del_timer(&q->wd_timer); |
sch->flags &= ~TCQ_F_THROTTLED; |
sch->q.qlen = 0; |
} |
|
static void |
hfsc_destroy_qdisc(struct Qdisc *sch) |
{ |
struct hfsc_sched *q = (struct hfsc_sched *)sch->data; |
struct hfsc_class *cl, *next; |
unsigned int i; |
|
for (i = 0; i < HFSC_HSIZE; i++) { |
list_for_each_entry_safe(cl, next, &q->clhash[i], hlist) |
hfsc_destroy_class(sch, cl); |
} |
__skb_queue_purge(&q->requeue); |
del_timer(&q->wd_timer); |
MOD_DEC_USE_COUNT; |
} |
|
static int |
hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) |
{ |
struct hfsc_sched *q = (struct hfsc_sched *)sch->data; |
unsigned char *b = skb->tail; |
struct tc_hfsc_qopt qopt; |
|
qopt.defcls = q->defcls; |
RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); |
|
sch->stats.qlen = sch->q.qlen; |
if (qdisc_copy_stats(skb, &sch->stats) < 0) |
goto rtattr_failure; |
|
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static int |
hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) |
{ |
struct hfsc_class *cl = hfsc_classify(skb, sch); |
unsigned int len = skb->len; |
int err; |
|
if (cl == NULL) { |
kfree_skb(skb); |
sch->stats.drops++; |
return NET_XMIT_DROP; |
} |
|
err = cl->qdisc->enqueue(skb, cl->qdisc); |
if (unlikely(err != NET_XMIT_SUCCESS)) { |
cl->stats.drops++; |
sch->stats.drops++; |
return err; |
} |
|
if (cl->qdisc->q.qlen == 1) |
set_active(cl, len); |
|
cl->stats.packets++; |
cl->stats.bytes += len; |
sch->stats.packets++; |
sch->stats.bytes += len; |
sch->q.qlen++; |
|
return NET_XMIT_SUCCESS; |
} |
|
static struct sk_buff * |
hfsc_dequeue(struct Qdisc *sch) |
{ |
struct hfsc_sched *q = (struct hfsc_sched *)sch->data; |
struct hfsc_class *cl; |
struct sk_buff *skb; |
u64 cur_time; |
unsigned int next_len; |
int realtime = 0; |
|
if (sch->q.qlen == 0) |
return NULL; |
if ((skb = __skb_dequeue(&q->requeue))) |
goto out; |
|
PSCHED_GET_TIME(cur_time); |
|
/* |
* if there are eligible classes, use real-time criteria. |
* find the class with the minimum deadline among |
* the eligible classes. |
*/ |
if ((cl = ellist_get_mindl(&q->eligible, cur_time)) != NULL) { |
realtime = 1; |
} else { |
/* |
* use link-sharing criteria |
* get the class with the minimum vt in the hierarchy |
*/ |
cl = actlist_get_minvt(&q->root, cur_time); |
if (cl == NULL) { |
sch->stats.overlimits++; |
if (!netif_queue_stopped(sch->dev)) |
hfsc_schedule_watchdog(sch, cur_time); |
return NULL; |
} |
} |
|
skb = cl->qdisc->dequeue(cl->qdisc); |
if (skb == NULL) { |
if (net_ratelimit()) |
printk("HFSC: Non-work-conserving qdisc ?\n"); |
return NULL; |
} |
|
update_vf(cl, skb->len, cur_time); |
if (realtime) |
cl->cl_cumul += skb->len; |
|
if (cl->qdisc->q.qlen != 0) { |
if (cl->cl_flags & HFSC_RSC) { |
/* update ed */ |
next_len = qdisc_peek_len(cl->qdisc); |
if (realtime) |
update_ed(cl, next_len); |
else |
update_d(cl, next_len); |
} |
} else { |
/* the class becomes passive */ |
set_passive(cl); |
} |
|
out: |
sch->flags &= ~TCQ_F_THROTTLED; |
sch->q.qlen--; |
|
return skb; |
} |
|
static int |
hfsc_requeue(struct sk_buff *skb, struct Qdisc *sch) |
{ |
struct hfsc_sched *q = (struct hfsc_sched *)sch->data; |
|
__skb_queue_head(&q->requeue, skb); |
sch->q.qlen++; |
return NET_XMIT_SUCCESS; |
} |
|
static unsigned int |
hfsc_drop(struct Qdisc *sch) |
{ |
struct hfsc_sched *q = (struct hfsc_sched *)sch->data; |
struct hfsc_class *cl; |
unsigned int len; |
|
list_for_each_entry(cl, &q->droplist, dlist) { |
if (cl->qdisc->ops->drop != NULL && |
(len = cl->qdisc->ops->drop(cl->qdisc)) > 0) { |
if (cl->qdisc->q.qlen == 0) { |
update_vf(cl, 0, 0); |
set_passive(cl); |
} else { |
list_move_tail(&cl->dlist, &q->droplist); |
} |
cl->stats.drops++; |
sch->stats.drops++; |
sch->q.qlen--; |
return len; |
} |
} |
return 0; |
} |
|
static struct Qdisc_class_ops hfsc_class_ops = { |
.change = hfsc_change_class, |
.delete = hfsc_delete_class, |
.graft = hfsc_graft_class, |
.leaf = hfsc_class_leaf, |
.get = hfsc_get_class, |
.put = hfsc_put_class, |
.bind_tcf = hfsc_bind_tcf, |
.unbind_tcf = hfsc_unbind_tcf, |
.tcf_chain = hfsc_tcf_chain, |
.dump = hfsc_dump_class, |
.walk = hfsc_walk |
}; |
|
struct Qdisc_ops hfsc_qdisc_ops = { |
.id = "hfsc", |
.init = hfsc_init_qdisc, |
.change = hfsc_change_qdisc, |
.reset = hfsc_reset_qdisc, |
.destroy = hfsc_destroy_qdisc, |
.dump = hfsc_dump_qdisc, |
.enqueue = hfsc_enqueue, |
.dequeue = hfsc_dequeue, |
.requeue = hfsc_requeue, |
.drop = hfsc_drop, |
.cl_ops = &hfsc_class_ops, |
.priv_size = sizeof(struct hfsc_sched) |
}; |
|
static int __init |
hfsc_init(void) |
{ |
return register_qdisc(&hfsc_qdisc_ops); |
} |
|
static void __exit |
hfsc_cleanup(void) |
{ |
unregister_qdisc(&hfsc_qdisc_ops); |
} |
|
MODULE_LICENSE("GPL"); |
module_init(hfsc_init); |
module_exit(hfsc_cleanup); |
/sch_fifo.c
0,0 → 1,211
/* |
* net/sched/sch_fifo.c The simplest FIFO queue. |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
*/ |
|
#include <linux/config.h> |
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/if_ether.h> |
#include <linux/inet.h> |
#include <linux/netdevice.h> |
#include <linux/etherdevice.h> |
#include <linux/notifier.h> |
#include <net/ip.h> |
#include <net/route.h> |
#include <linux/skbuff.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
/* 1 band FIFO pseudo-"scheduler" */ |
|
struct fifo_sched_data |
{ |
unsigned limit; |
}; |
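 |
/* the limit is interpreted in bytes by bfifo and in packets by pfifo */ |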
|
static int |
bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) |
{ |
struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; |
|
if (sch->stats.backlog + skb->len <= q->limit) { |
__skb_queue_tail(&sch->q, skb); |
sch->stats.backlog += skb->len; |
sch->stats.bytes += skb->len; |
sch->stats.packets++; |
return 0; |
} |
sch->stats.drops++; |
#ifdef CONFIG_NET_CLS_POLICE |
if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch)) |
#endif |
kfree_skb(skb); |
return NET_XMIT_DROP; |
} |
|
static int |
bfifo_requeue(struct sk_buff *skb, struct Qdisc* sch) |
{ |
__skb_queue_head(&sch->q, skb); |
sch->stats.backlog += skb->len; |
return 0; |
} |
|
static struct sk_buff * |
bfifo_dequeue(struct Qdisc* sch) |
{ |
struct sk_buff *skb; |
|
skb = __skb_dequeue(&sch->q); |
if (skb) |
sch->stats.backlog -= skb->len; |
return skb; |
} |
|
static unsigned int |
fifo_drop(struct Qdisc* sch) |
{ |
struct sk_buff *skb; |
|
skb = __skb_dequeue_tail(&sch->q); |
if (skb) { |
unsigned int len = skb->len; |
sch->stats.backlog -= len; |
kfree_skb(skb); |
return len; |
} |
return 0; |
} |
|
static void |
fifo_reset(struct Qdisc* sch) |
{ |
skb_queue_purge(&sch->q); |
sch->stats.backlog = 0; |
} |
|
static int |
pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) |
{ |
struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; |
|
if (sch->q.qlen < q->limit) { |
__skb_queue_tail(&sch->q, skb); |
sch->stats.bytes += skb->len; |
sch->stats.packets++; |
return 0; |
} |
sch->stats.drops++; |
#ifdef CONFIG_NET_CLS_POLICE |
if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch)) |
#endif |
kfree_skb(skb); |
return NET_XMIT_DROP; |
} |
|
static int |
pfifo_requeue(struct sk_buff *skb, struct Qdisc* sch) |
{ |
__skb_queue_head(&sch->q, skb); |
return 0; |
} |
|
|
static struct sk_buff * |
pfifo_dequeue(struct Qdisc* sch) |
{ |
return __skb_dequeue(&sch->q); |
} |
|
static int fifo_init(struct Qdisc *sch, struct rtattr *opt) |
{ |
struct fifo_sched_data *q = (void*)sch->data; |
|
if (opt == NULL) { |
unsigned int limit = sch->dev->tx_queue_len ? : 1; |
|
if (sch->ops == &bfifo_qdisc_ops) |
q->limit = limit*sch->dev->mtu; |
else |
q->limit = limit; |
} else { |
struct tc_fifo_qopt *ctl = RTA_DATA(opt); |
if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) |
return -EINVAL; |
q->limit = ctl->limit; |
} |
return 0; |
} |
|
static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) |
{ |
struct fifo_sched_data *q = (void*)sch->data; |
unsigned char *b = skb->tail; |
struct tc_fifo_qopt opt; |
|
opt.limit = q->limit; |
RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); |
|
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
struct Qdisc_ops pfifo_qdisc_ops = |
{ |
NULL, |
NULL, |
"pfifo", |
sizeof(struct fifo_sched_data), |
|
pfifo_enqueue, |
pfifo_dequeue, |
pfifo_requeue, |
fifo_drop, |
|
fifo_init, |
fifo_reset, |
NULL, |
fifo_init, |
|
fifo_dump, |
}; |
|
struct Qdisc_ops bfifo_qdisc_ops = |
{ |
NULL, |
NULL, |
"bfifo", |
sizeof(struct fifo_sched_data), |
|
bfifo_enqueue, |
bfifo_dequeue, |
bfifo_requeue, |
fifo_drop, |
|
fifo_init, |
fifo_reset, |
NULL, |
fifo_init, |
fifo_dump, |
}; |
/cls_rsvp6.c
0,0 → 1,43
/* |
* net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6. |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
*/ |
|
#include <linux/module.h> |
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/if_ether.h> |
#include <linux/inet.h> |
#include <linux/netdevice.h> |
#include <linux/etherdevice.h> |
#include <linux/notifier.h> |
#include <net/ip.h> |
#include <linux/ipv6.h> |
#include <net/route.h> |
#include <linux/skbuff.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
#define RSVP_DST_LEN 4 |
#define RSVP_ID "rsvp6" |
#define RSVP_OPS cls_rsvp6_ops |
|
#include "cls_rsvp.h" |
MODULE_LICENSE("GPL"); |
/sch_sfq.c
0,0 → 1,502
/* |
* net/sched/sch_sfq.c Stochastic Fairness Queueing discipline. |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
*/ |
|
#include <linux/config.h> |
#include <linux/module.h> |
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/if_ether.h> |
#include <linux/inet.h> |
#include <linux/netdevice.h> |
#include <linux/etherdevice.h> |
#include <linux/notifier.h> |
#include <linux/init.h> |
#include <net/ip.h> |
#include <linux/ipv6.h> |
#include <net/route.h> |
#include <linux/skbuff.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
|
/* Stochastic Fairness Queuing algorithm. |
======================================= |
|
Source: |
Paul E. McKenney "Stochastic Fairness Queuing", |
IEEE INFOCOMM'90 Proceedings, San Francisco, 1990. |
|
Paul E. McKenney "Stochastic Fairness Queuing", |
"Interworking: Research and Experience", v.2, 1991, p.113-131. |
|
|
See also: |
M. Shreedhar and George Varghese "Efficient Fair |
Queuing using Deficit Round Robin", Proc. SIGCOMM 95. |
|
|
This is not the thing that is usually called (W)FQ nowadays. |
It does not use any timestamp mechanism, but instead |
processes queues in round-robin order. |
|
ADVANTAGE: |
|
- It is very cheap. Both CPU and memory requirements are minimal. |
|
DRAWBACKS: |
|
- "Stochastic" -> It is not 100% fair. |
When hash collisions occur, several flows are considered as one. |
|
- "Round-robin" -> It introduces larger delays than virtual clock |
based schemes, and should not be used for isolating interactive |
	traffic from non-interactive traffic. This means that this scheduler |
	should be used as a leaf of CBQ or P3, which puts interactive traffic |
	into a higher priority band. |
|
We still need true WFQ for top level CSZ, but using WFQ |
for the best effort traffic is absolutely pointless: |
SFQ is superior for this purpose. |
|
IMPLEMENTATION: |
This implementation limits maximal queue length to 128; |
maximal mtu to 2^15-1; number of hash buckets to 1024. |
	The only goal of these restrictions was that all the data |
	fit into one 4K page :-). Struct sfq_sched_data is |
	organized in an anti-cache manner: all the data for a bucket |
	are scattered over different locations. This is not good, |
	but it allowed me to put it into 4K. |
|
It is easy to increase these values, but not in flight. */ |
|
#define SFQ_DEPTH 128 |
#define SFQ_HASH_DIVISOR 1024 |
|
/* This type should contain at least SFQ_DEPTH*2 values */ |
typedef unsigned char sfq_index; |
|
struct sfq_head |
{ |
sfq_index next; |
sfq_index prev; |
}; |
|
struct sfq_sched_data |
{ |
/* Parameters */ |
int perturb_period; |
unsigned quantum; /* Allotment per round: MUST BE >= MTU */ |
int limit; |
|
/* Variables */ |
struct timer_list perturb_timer; |
int perturbation; |
sfq_index tail; /* Index of current slot in round */ |
sfq_index max_depth; /* Maximal depth */ |
|
sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */ |
sfq_index next[SFQ_DEPTH]; /* Active slots link */ |
short allot[SFQ_DEPTH]; /* Current allotment per slot */ |
unsigned short hash[SFQ_DEPTH]; /* Hash value indexed by slots */ |
struct sk_buff_head qs[SFQ_DEPTH]; /* Slot queue */ |
struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */ |
}; |
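 |
/* |
 * dep[] indexing convention: entries 0..SFQ_DEPTH-1 carry the links of the |
 * slots themselves, while entries SFQ_DEPTH..2*SFQ_DEPTH-1 act as list heads, |
 * one per queue depth, so dep[qlen + SFQ_DEPTH] chains all slots currently |
 * holding qlen packets (see sfq_link() below). |
 */ |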
|
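/* |
 * mix h1 into h with an (approximate) rotate by the current perturbation, |
 * so the flow-to-bucket mapping changes every perturb_period. |
 */ |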
static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1) |
{ |
int pert = q->perturbation; |
|
/* Have we any rotation primitives? If not, WHY? */ |
h ^= (h1<<pert) ^ (h1>>(0x1F - pert)); |
h ^= h>>10; |
return h & 0x3FF; |
} |
|
static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) |
{ |
u32 h, h2; |
|
switch (skb->protocol) { |
case __constant_htons(ETH_P_IP): |
{ |
struct iphdr *iph = skb->nh.iph; |
h = iph->daddr; |
h2 = iph->saddr^iph->protocol; |
if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && |
(iph->protocol == IPPROTO_TCP || |
iph->protocol == IPPROTO_UDP || |
iph->protocol == IPPROTO_ESP)) |
h2 ^= *(((u32*)iph) + iph->ihl); |
break; |
} |
case __constant_htons(ETH_P_IPV6): |
{ |
struct ipv6hdr *iph = skb->nh.ipv6h; |
h = iph->daddr.s6_addr32[3]; |
h2 = iph->saddr.s6_addr32[3]^iph->nexthdr; |
if (iph->nexthdr == IPPROTO_TCP || |
iph->nexthdr == IPPROTO_UDP || |
iph->nexthdr == IPPROTO_ESP) |
h2 ^= *(u32*)&iph[1]; |
break; |
} |
default: |
h = (u32)(unsigned long)skb->dst^skb->protocol; |
h2 = (u32)(unsigned long)skb->sk; |
} |
return sfq_fold_hash(q, h, h2); |
} |
|
extern __inline__ void sfq_link(struct sfq_sched_data *q, sfq_index x) |
{ |
sfq_index p, n; |
int d = q->qs[x].qlen + SFQ_DEPTH; |
|
p = d; |
n = q->dep[d].next; |
q->dep[x].next = n; |
q->dep[x].prev = p; |
q->dep[p].next = q->dep[n].prev = x; |
} |
|
extern __inline__ void sfq_dec(struct sfq_sched_data *q, sfq_index x) |
{ |
sfq_index p, n; |
|
n = q->dep[x].next; |
p = q->dep[x].prev; |
q->dep[p].next = n; |
q->dep[n].prev = p; |
|
if (n == p && q->max_depth == q->qs[x].qlen + 1) |
q->max_depth--; |
|
sfq_link(q, x); |
} |
|
extern __inline__ void sfq_inc(struct sfq_sched_data *q, sfq_index x) |
{ |
sfq_index p, n; |
int d; |
|
n = q->dep[x].next; |
p = q->dep[x].prev; |
q->dep[p].next = n; |
q->dep[n].prev = p; |
d = q->qs[x].qlen; |
if (q->max_depth < d) |
q->max_depth = d; |
|
sfq_link(q, x); |
} |
|
static unsigned int sfq_drop(struct Qdisc *sch) |
{ |
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; |
sfq_index d = q->max_depth; |
struct sk_buff *skb; |
unsigned int len; |
|
/* Queue is full! Find the longest slot and |
drop a packet from it */ |
|
if (d > 1) { |
sfq_index x = q->dep[d+SFQ_DEPTH].next; |
skb = q->qs[x].prev; |
len = skb->len; |
__skb_unlink(skb, &q->qs[x]); |
kfree_skb(skb); |
sfq_dec(q, x); |
sch->q.qlen--; |
sch->stats.drops++; |
return len; |
} |
|
if (d == 1) { |
/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ |
d = q->next[q->tail]; |
q->next[q->tail] = q->next[d]; |
q->allot[q->next[d]] += q->quantum; |
skb = q->qs[d].prev; |
len = skb->len; |
__skb_unlink(skb, &q->qs[d]); |
kfree_skb(skb); |
sfq_dec(q, d); |
sch->q.qlen--; |
q->ht[q->hash[d]] = SFQ_DEPTH; |
sch->stats.drops++; |
return len; |
} |
|
return 0; |
} |
|
static int |
sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) |
{ |
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; |
unsigned hash = sfq_hash(q, skb); |
sfq_index x; |
|
x = q->ht[hash]; |
if (x == SFQ_DEPTH) { |
q->ht[hash] = x = q->dep[SFQ_DEPTH].next; |
q->hash[x] = hash; |
} |
__skb_queue_tail(&q->qs[x], skb); |
sfq_inc(q, x); |
if (q->qs[x].qlen == 1) { /* The flow is new */ |
if (q->tail == SFQ_DEPTH) { /* It is the first flow */ |
q->tail = x; |
q->next[x] = x; |
q->allot[x] = q->quantum; |
} else { |
q->next[x] = q->next[q->tail]; |
q->next[q->tail] = x; |
q->tail = x; |
} |
} |
if (++sch->q.qlen < q->limit-1) { |
sch->stats.bytes += skb->len; |
sch->stats.packets++; |
return 0; |
} |
|
sfq_drop(sch); |
return NET_XMIT_CN; |
} |
|
static int |
sfq_requeue(struct sk_buff *skb, struct Qdisc* sch) |
{ |
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; |
unsigned hash = sfq_hash(q, skb); |
sfq_index x; |
|
x = q->ht[hash]; |
if (x == SFQ_DEPTH) { |
q->ht[hash] = x = q->dep[SFQ_DEPTH].next; |
q->hash[x] = hash; |
} |
__skb_queue_head(&q->qs[x], skb); |
sfq_inc(q, x); |
if (q->qs[x].qlen == 1) { /* The flow is new */ |
if (q->tail == SFQ_DEPTH) { /* It is the first flow */ |
q->tail = x; |
q->next[x] = x; |
q->allot[x] = q->quantum; |
} else { |
q->next[x] = q->next[q->tail]; |
q->next[q->tail] = x; |
q->tail = x; |
} |
} |
if (++sch->q.qlen < q->limit - 1) |
return 0; |
|
sch->stats.drops++; |
sfq_drop(sch); |
return NET_XMIT_CN; |
} |
|
|
|
|
static struct sk_buff * |
sfq_dequeue(struct Qdisc* sch) |
{ |
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; |
struct sk_buff *skb; |
sfq_index a, old_a; |
|
/* No active slots */ |
if (q->tail == SFQ_DEPTH) |
return NULL; |
|
a = old_a = q->next[q->tail]; |
|
/* Grab packet */ |
skb = __skb_dequeue(&q->qs[a]); |
sfq_dec(q, a); |
sch->q.qlen--; |
|
/* Is the slot empty? */ |
if (q->qs[a].qlen == 0) { |
q->ht[q->hash[a]] = SFQ_DEPTH; |
a = q->next[a]; |
if (a == old_a) { |
q->tail = SFQ_DEPTH; |
return skb; |
} |
q->next[q->tail] = a; |
q->allot[a] += q->quantum; |
} else if ((q->allot[a] -= skb->len) <= 0) { |
q->tail = a; |
a = q->next[a]; |
q->allot[a] += q->quantum; |
} |
return skb; |
} |
|
static void |
sfq_reset(struct Qdisc* sch) |
{ |
struct sk_buff *skb; |
|
while ((skb = sfq_dequeue(sch)) != NULL) |
kfree_skb(skb); |
} |
|
static void sfq_perturbation(unsigned long arg) |
{ |
struct Qdisc *sch = (struct Qdisc*)arg; |
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; |
|
q->perturbation = net_random()&0x1F; |
q->perturb_timer.expires = jiffies + q->perturb_period; |
|
if (q->perturb_period) { |
q->perturb_timer.expires = jiffies + q->perturb_period; |
add_timer(&q->perturb_timer); |
} |
} |
|
static int sfq_change(struct Qdisc *sch, struct rtattr *opt) |
{ |
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; |
struct tc_sfq_qopt *ctl = RTA_DATA(opt); |
|
if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) |
return -EINVAL; |
|
sch_tree_lock(sch); |
q->quantum = ctl->quantum ? : psched_mtu(sch->dev); |
q->perturb_period = ctl->perturb_period*HZ; |
if (ctl->limit) |
q->limit = min_t(u32, ctl->limit, SFQ_DEPTH); |
|
while (sch->q.qlen >= q->limit-1) |
sfq_drop(sch); |
|
del_timer(&q->perturb_timer); |
if (q->perturb_period) { |
q->perturb_timer.expires = jiffies + q->perturb_period; |
add_timer(&q->perturb_timer); |
} |
sch_tree_unlock(sch); |
return 0; |
} |
|
static int sfq_init(struct Qdisc *sch, struct rtattr *opt) |
{ |
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; |
int i; |
|
q->perturb_timer.data = (unsigned long)sch; |
q->perturb_timer.function = sfq_perturbation; |
init_timer(&q->perturb_timer); |
|
for (i=0; i<SFQ_HASH_DIVISOR; i++) |
q->ht[i] = SFQ_DEPTH; |
for (i=0; i<SFQ_DEPTH; i++) { |
skb_queue_head_init(&q->qs[i]); |
q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH; |
q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH; |
} |
q->limit = SFQ_DEPTH; |
q->max_depth = 0; |
q->tail = SFQ_DEPTH; |
if (opt == NULL) { |
q->quantum = psched_mtu(sch->dev); |
q->perturb_period = 0; |
} else { |
int err = sfq_change(sch, opt); |
if (err) |
return err; |
} |
for (i=0; i<SFQ_DEPTH; i++) |
sfq_link(q, i); |
MOD_INC_USE_COUNT; |
return 0; |
} |
|
static void sfq_destroy(struct Qdisc *sch) |
{ |
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; |
del_timer(&q->perturb_timer); |
MOD_DEC_USE_COUNT; |
} |
|
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) |
{ |
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; |
unsigned char *b = skb->tail; |
struct tc_sfq_qopt opt; |
|
opt.quantum = q->quantum; |
opt.perturb_period = q->perturb_period/HZ; |
|
opt.limit = q->limit; |
opt.divisor = SFQ_HASH_DIVISOR; |
opt.flows = q->limit; |
|
RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); |
|
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
struct Qdisc_ops sfq_qdisc_ops = |
{ |
NULL, |
NULL, |
"sfq", |
sizeof(struct sfq_sched_data), |
|
sfq_enqueue, |
sfq_dequeue, |
sfq_requeue, |
sfq_drop, |
|
sfq_init, |
sfq_reset, |
sfq_destroy, |
NULL, /* sfq_change */ |
|
sfq_dump, |
}; |
|
#ifdef MODULE |
int init_module(void) |
{ |
return register_qdisc(&sfq_qdisc_ops); |
} |
|
void cleanup_module(void) |
{ |
unregister_qdisc(&sfq_qdisc_ops); |
} |
#endif |
MODULE_LICENSE("GPL"); |
/Config.in
0,0 → 1,42
# |
# Traffic control configuration. |
# |
tristate ' CBQ packet scheduler' CONFIG_NET_SCH_CBQ |
tristate ' HTB packet scheduler' CONFIG_NET_SCH_HTB |
tristate ' CSZ packet scheduler' CONFIG_NET_SCH_CSZ |
#tristate ' H-PFQ packet scheduler' CONFIG_NET_SCH_HPFQ |
tristate ' H-FSC packet scheduler' CONFIG_NET_SCH_HFSC |
if [ "$CONFIG_ATM" = "y" -o "$CONFIG_ATM" = "m" ]; then |
dep_tristate ' ATM pseudo-scheduler' CONFIG_NET_SCH_ATM $CONFIG_ATM |
fi |
tristate ' The simplest PRIO pseudoscheduler' CONFIG_NET_SCH_PRIO |
tristate ' RED queue' CONFIG_NET_SCH_RED |
tristate ' SFQ queue' CONFIG_NET_SCH_SFQ |
tristate ' TEQL queue' CONFIG_NET_SCH_TEQL |
tristate ' TBF queue' CONFIG_NET_SCH_TBF |
tristate ' GRED queue' CONFIG_NET_SCH_GRED |
tristate ' Network delay simulator' CONFIG_NET_SCH_DELAY |
tristate ' Diffserv field marker' CONFIG_NET_SCH_DSMARK |
if [ "$CONFIG_NETFILTER" = "y" ]; then |
tristate ' Ingress Qdisc' CONFIG_NET_SCH_INGRESS |
fi |
bool ' QoS support' CONFIG_NET_QOS |
if [ "$CONFIG_NET_QOS" = "y" ]; then |
bool ' Rate estimator' CONFIG_NET_ESTIMATOR |
fi |
bool ' Packet classifier API' CONFIG_NET_CLS |
if [ "$CONFIG_NET_CLS" = "y" ]; then |
tristate ' TC index classifier' CONFIG_NET_CLS_TCINDEX |
tristate ' Routing table based classifier' CONFIG_NET_CLS_ROUTE4 |
if [ "$CONFIG_NET_CLS_ROUTE4" != "n" ]; then |
define_bool CONFIG_NET_CLS_ROUTE y |
fi |
tristate ' Firewall based classifier' CONFIG_NET_CLS_FW |
tristate ' U32 classifier' CONFIG_NET_CLS_U32 |
if [ "$CONFIG_NET_QOS" = "y" ]; then |
tristate ' Special RSVP classifier' CONFIG_NET_CLS_RSVP |
tristate ' Special RSVP classifier for IPv6' CONFIG_NET_CLS_RSVP6 |
bool ' Traffic policing (needed for in/egress)' CONFIG_NET_CLS_POLICE |
fi |
fi |
|
/cls_rsvp.c
0,0 → 1,42
/* |
* net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4. |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
*/ |
|
#include <linux/module.h> |
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/if_ether.h> |
#include <linux/inet.h> |
#include <linux/netdevice.h> |
#include <linux/etherdevice.h> |
#include <linux/notifier.h> |
#include <net/ip.h> |
#include <net/route.h> |
#include <linux/skbuff.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
#define RSVP_DST_LEN 1 |
#define RSVP_ID "rsvp" |
#define RSVP_OPS cls_rsvp_ops |
|
#include "cls_rsvp.h" |
MODULE_LICENSE("GPL"); |
/sch_delay.c
0,0 → 1,277
/* |
* net/sched/sch_delay.c Simple constant delay |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Stephen Hemminger <shemminger@osdl.org> |
*/ |
|
#include <linux/config.h> |
#include <linux/module.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
|
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/if_ether.h> |
#include <linux/inet.h> |
#include <linux/netdevice.h> |
#include <linux/etherdevice.h> |
#include <linux/notifier.h> |
#include <net/ip.h> |
#include <net/route.h> |
#include <linux/skbuff.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
/* Network delay simulator |
This scheduler adds a fixed delay to all packets. |
Similar to NISTnet and BSD Dummynet. |
|
   It uses a byte FIFO (bfifo) underneath, similar to TBF. */ |
struct dly_sched_data { |
u32 latency; |
u32 limit; |
struct timer_list timer; |
struct Qdisc *qdisc; |
}; |
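 |
/* |
 * latency is in the same units as PSCHED_TDIFF() (scheduler clock ticks); |
 * limit is handed to the inner bfifo, so it counts bytes. |
 */ |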
|
/* Time stamp put into socket buffer control block */ |
struct dly_skb_cb { |
psched_time_t queuetime; |
}; |
|
/* Enqueue packets with underlying discipline (fifo) |
* but mark them with current time first. |
*/ |
static int dly_enqueue(struct sk_buff *skb, struct Qdisc *sch) |
{ |
struct dly_sched_data *q = (struct dly_sched_data *)sch->data; |
struct dly_skb_cb *cb = (struct dly_skb_cb *)skb->cb; |
int ret; |
|
PSCHED_GET_TIME(cb->queuetime); |
|
/* Queue to underlying scheduler */ |
ret = q->qdisc->enqueue(skb, q->qdisc); |
if (ret) |
sch->stats.drops++; |
else { |
sch->q.qlen++; |
sch->stats.bytes += skb->len; |
sch->stats.packets++; |
} |
return 0; |
} |
|
/* Requeue packets but don't change time stamp */ |
static int dly_requeue(struct sk_buff *skb, struct Qdisc *sch) |
{ |
struct dly_sched_data *q = (struct dly_sched_data *)sch->data; |
int ret; |
|
ret = q->qdisc->ops->requeue(skb, q->qdisc); |
if (ret == 0) |
sch->q.qlen++; |
return ret; |
} |
|
static unsigned int dly_drop(struct Qdisc *sch) |
{ |
struct dly_sched_data *q = (struct dly_sched_data *)sch->data; |
unsigned int len; |
|
len = q->qdisc->ops->drop(q->qdisc); |
if (len) { |
sch->q.qlen--; |
sch->stats.drops++; |
} |
return len; |
} |
|
/* Dequeue a packet. |
 * If the packet needs to be held up, stop the |
 * queue and set a timer to wake up later. |
 */ |
static struct sk_buff *dly_dequeue(struct Qdisc *sch) |
{ |
struct dly_sched_data *q = (struct dly_sched_data *)sch->data; |
struct sk_buff *skb = q->qdisc->dequeue(q->qdisc); |
|
if (skb) { |
struct dly_skb_cb *cb = (struct dly_skb_cb *)skb->cb; |
psched_time_t now; |
long diff; |
|
PSCHED_GET_TIME(now); |
diff = q->latency - PSCHED_TDIFF(now, cb->queuetime); |
|
if (diff <= 0) { |
sch->q.qlen--; |
sch->flags &= ~TCQ_F_THROTTLED; |
return skb; |
} |
|
if (!netif_queue_stopped(sch->dev)) { |
long delay = PSCHED_US2JIFFIE(diff); |
if (delay <= 0) |
delay = 1; |
mod_timer(&q->timer, jiffies+delay); |
} |
|
if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) { |
sch->q.qlen--; |
sch->stats.drops++; |
} |
sch->flags |= TCQ_F_THROTTLED; |
} |
return NULL; |
} |
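 |
/* Worked example of the timing above (illustrative numbers only): with |
 * q->latency = 100000 usec and a packet stamped 40000 usec ago, |
 * diff = 60000 > 0, so the skb is put back with ->requeue(), the watchdog |
 * timer is armed for PSCHED_US2JIFFIE(60000) jiffies (at least 1, and only |
 * if the device queue is not already stopped), and the qdisc stays |
 * TCQ_F_THROTTLED until dly_timer() clears the flag and reschedules. |
 */ |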
|
static void dly_reset(struct Qdisc *sch) |
{ |
struct dly_sched_data *q = (struct dly_sched_data *)sch->data; |
|
qdisc_reset(q->qdisc); |
sch->q.qlen = 0; |
sch->flags &= ~TCQ_F_THROTTLED; |
del_timer(&q->timer); |
} |
|
static void dly_timer(unsigned long arg) |
{ |
struct Qdisc *sch = (struct Qdisc *)arg; |
|
sch->flags &= ~TCQ_F_THROTTLED; |
netif_schedule(sch->dev); |
} |
|
/* Tell Fifo the new limit. */ |
static int change_limit(struct Qdisc *q, u32 limit) |
{ |
struct rtattr *rta; |
int ret; |
|
rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); |
if (!rta) |
return -ENOMEM; |
|
rta->rta_type = RTM_NEWQDISC; |
((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit; |
ret = q->ops->change(q, rta); |
kfree(rta); |
|
return ret; |
} |
|
/* Set up the underlying FIFO discipline */ |
static int dly_change(struct Qdisc *sch, struct rtattr *opt) |
{ |
struct dly_sched_data *q = (struct dly_sched_data *)sch->data; |
struct tc_dly_qopt *qopt = RTA_DATA(opt); |
int err; |
|
if (q->qdisc == &noop_qdisc) { |
struct Qdisc *child |
= qdisc_create_dflt(sch->dev, &bfifo_qdisc_ops); |
if (!child) |
return -EINVAL; |
q->qdisc = child; |
} |
|
err = change_limit(q->qdisc, qopt->limit); |
if (err) { |
qdisc_destroy(q->qdisc); |
q->qdisc = &noop_qdisc; |
} else { |
q->latency = qopt->latency; |
q->limit = qopt->limit; |
} |
return err; |
} |
|
static int dly_init(struct Qdisc *sch, struct rtattr *opt) |
{ |
struct dly_sched_data *q = (struct dly_sched_data *)sch->data; |
int err; |
|
if (!opt) |
return -EINVAL; |
|
MOD_INC_USE_COUNT; |
|
init_timer(&q->timer); |
q->timer.function = dly_timer; |
q->timer.data = (unsigned long) sch; |
q->qdisc = &noop_qdisc; |
|
err = dly_change(sch, opt); |
if (err) |
MOD_DEC_USE_COUNT; |
|
return err; |
} |
|
static void dly_destroy(struct Qdisc *sch) |
{ |
struct dly_sched_data *q = (struct dly_sched_data *)sch->data; |
|
del_timer(&q->timer); |
qdisc_destroy(q->qdisc); |
q->qdisc = &noop_qdisc; |
|
MOD_DEC_USE_COUNT; |
} |
|
static int dly_dump(struct Qdisc *sch, struct sk_buff *skb) |
{ |
struct dly_sched_data *q = (struct dly_sched_data *)sch->data; |
unsigned char *b = skb->tail; |
struct tc_dly_qopt qopt; |
|
qopt.latency = q->latency; |
qopt.limit = q->limit; |
|
RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); |
|
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
struct Qdisc_ops dly_qdisc_ops = { |
.id = "delay", |
.priv_size = sizeof(struct dly_sched_data), |
.enqueue = dly_enqueue, |
.dequeue = dly_dequeue, |
.requeue = dly_requeue, |
.drop = dly_drop, |
.init = dly_init, |
.reset = dly_reset, |
.destroy = dly_destroy, |
.change = dly_change, |
.dump = dly_dump, |
}; |
|
#ifdef MODULE |
int init_module(void) |
{ |
return register_qdisc(&dly_qdisc_ops); |
} |
|
void cleanup_module(void) |
{ |
unregister_qdisc(&dly_qdisc_ops); |
} |
#endif |
MODULE_LICENSE("GPL"); |
/Makefile
0,0 → 1,36
# |
# Makefile for the Linux Traffic Control Unit. |
# |
|
O_TARGET := sched.o |
|
obj-y := sch_generic.o |
|
|
obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o |
obj-$(CONFIG_NET_ESTIMATOR) += estimator.o |
obj-$(CONFIG_NET_CLS) += cls_api.o |
obj-$(CONFIG_NET_CLS_POLICE) += police.o |
obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o |
obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o |
obj-$(CONFIG_NET_SCH_CSZ) += sch_csz.o |
obj-$(CONFIG_NET_SCH_DELAY) += sch_delay.o |
obj-$(CONFIG_NET_SCH_HPFQ) += sch_hpfq.o |
obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o |
obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o |
obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o |
obj-$(CONFIG_NET_SCH_RED) += sch_red.o |
obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o |
obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o |
obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o |
obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o |
obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o |
obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o |
obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o |
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o |
obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o |
obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o |
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o |
obj-$(CONFIG_NET_CLS_FW) += cls_fw.o |
|
include $(TOPDIR)/Rules.make |
/sch_csz.c
0,0 → 1,1069
/* |
* net/sched/sch_csz.c Clark-Shenker-Zhang scheduler. |
* |
* This program is free software; you can redistribute it and/or |
* modify it under the terms of the GNU General Public License |
* as published by the Free Software Foundation; either version |
* 2 of the License, or (at your option) any later version. |
* |
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
* |
*/ |
|
#include <linux/config.h> |
#include <linux/module.h> |
#include <asm/uaccess.h> |
#include <asm/system.h> |
#include <asm/bitops.h> |
#include <linux/types.h> |
#include <linux/kernel.h> |
#include <linux/sched.h> |
#include <linux/string.h> |
#include <linux/mm.h> |
#include <linux/socket.h> |
#include <linux/sockios.h> |
#include <linux/in.h> |
#include <linux/errno.h> |
#include <linux/interrupt.h> |
#include <linux/if_ether.h> |
#include <linux/inet.h> |
#include <linux/netdevice.h> |
#include <linux/etherdevice.h> |
#include <linux/notifier.h> |
#include <net/ip.h> |
#include <net/route.h> |
#include <linux/skbuff.h> |
#include <net/sock.h> |
#include <net/pkt_sched.h> |
|
|
/* Clark-Shenker-Zhang algorithm. |
======================================= |
|
SOURCE. |
|
David D. Clark, Scott Shenker and Lixia Zhang |
"Supporting Real-Time Applications in an Integrated Services Packet |
Network: Architecture and Mechanism". |
|
CBQ presents a flexible universal algorithm for packet scheduling, |
but it has pretty poor delay characteristics. |
Round-robin scheduling and link-sharing goals |
apparently contradict minimization of network delay and jitter. |
Moreover, correct handling of predictive flows seems to be |
impossible in CBQ. |
|
CSZ presents a more precise but less flexible and less efficient |
approach. As I understand it, the main idea is to create |
   a WFQ flow for each guaranteed service and to allocate |
   the rest of the bandwidth to a dummy flow-0. Flow-0 comprises |
   the predictive services and the best effort traffic; |
   it is handled by a priority scheduler, with the highest |
   priority band allocated to predictive services and the rest |
   to the best effort packets. |
|
Note that in CSZ flows are NOT limited to their bandwidth. It |
is supposed that the flow passed admission control at the edge |
of the QoS network and it doesn't need further shaping. Any |
attempt to improve the flow or to shape it to a token bucket |
at intermediate hops will introduce undesired delays and raise |
jitter. |
|
At the moment CSZ is the only scheduler that provides |
   true guaranteed service. Other schemes (including CBQ) |
   do not provide guaranteed delay, and they randomize jitter. |
   There is a proof (Sally Floyd) that delay |
   can be estimated by an IntServ compliant formula. |
   This result is formally true, but it is wrong in principle. |
   It takes into account only round-robin delays, |
   ignoring delays introduced by link sharing, i.e. overlimiting. |
Note that temporary overlimits are inevitable because |
real links are not ideal, and the real algorithm must take this |
into account. |
|
ALGORITHM. |
|
--- Notations. |
|
$B$ is link bandwidth (bits/sec). |
|
$I$ is set of all flows, including flow $0$. |
Every flow $a \in I$ has associated bandwidth slice $r_a < 1$ and |
$\sum_{a \in I} r_a = 1$. |
|
--- Flow model. |
|
   Let $m_a$ be the number of backlogged bits in flow $a$. |
   The flow is {\em active} if $m_a > 0$. |
This number is a discontinuous function of time; |
when a packet $i$ arrives: |
\[ |
m_a(t_i+0) - m_a(t_i-0) = L^i, |
\] |
where $L^i$ is the length of the arrived packet. |
The flow queue is drained continuously until $m_a == 0$: |
\[ |
{d m_a \over dt} = - { B r_a \over \sum_{b \in A} r_b}. |
\] |
   I.e. the flow rates are their allocated rates, proportionally |
   scaled to take all the available link bandwidth. Obviously, |
   this is not the only possible policy. E.g. CBQ classes |
   without borrowing would be modelled by: |
\[ |
{d m_a \over dt} = - B r_a . |
\] |
More complicated hierarchical bandwidth allocation |
policies are possible, but unfortunately, the basic |
flow equations have a simple solution only for proportional |
scaling. |
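 |
   A quick numerical illustration (numbers are arbitrary, not from the |
   paper): with $B = 10$ Mbit/sec and two active flows with $r_1 = 0.2$ |
   and $r_2 = 0.3$, proportional scaling drains the backlogs at |
   \[ |
   {d m_1 \over dt} = - {10 \cdot 0.2 \over 0.5} = -4, \qquad |
   {d m_2 \over dt} = - {10 \cdot 0.3 \over 0.5} = -6 \hbox{ Mbit/sec}, |
   \] |
   i.e. the unallocated half of the link is redistributed proportionally; |
   with the CBQ-like policy above the same flows would drain at only |
   2 and 3 Mbit/sec. |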
|
--- Departure times. |
|
   We calculate the time until the last bit of the packet is sent: |
\[ |
E_a^i(t) = { m_a(t_i) - \delta_a(t) \over r_a }, |
\] |
   where $\delta_a(t)$ is the number of bits drained since $t_i$. |
We have to evaluate $E_a^i$ for all queued packets, |
then find the packet with minimal $E_a^i$ and send it. |
|
This sounds good, but direct implementation of the algorithm |
is absolutely infeasible. Luckily, if flow rates |
are scaled proportionally, the equations have a simple solution. |
|
The differential equation for $E_a^i$ is |
\[ |
{d E_a^i (t) \over dt } = - { d \delta_a(t) \over dt} { 1 \over r_a} = |
{ B \over \sum_{b \in A} r_b} |
\] |
with initial condition |
\[ |
E_a^i (t_i) = { m_a(t_i) \over r_a } . |
\] |
|
Let's introduce an auxiliary function $R(t)$: |
|
--- Round number. |
|
Consider the following model: we rotate over active flows, |
sending $r_a B$ bits from every flow, so that we send |
   $B \sum_{a \in A} r_a$ bits per round, which takes |
$\sum_{a \in A} r_a$ seconds. |
|
Hence, $R(t)$ (round number) is a monotonically increasing |
   linear function of time while $A$ does not change |
\[ |
{ d R(t) \over dt } = { 1 \over \sum_{a \in A} r_a } |
\] |
and it is continuous when $A$ changes. |
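 |
   For instance (arbitrary numbers): with three active flows of |
   $r_a = 0.2$ each, one round moves $0.6 B$ bits and lasts $0.6$ sec, |
   so $R$ advances at $1/0.6 \approx 1.67$ per second; if one flow |
   empties, the remaining two make $R$ advance at $1/0.4 = 2.5$. |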
|
The central observation is that the quantity |
$F_a^i = R(t) + E_a^i(t)/B$ does not depend on time at all! |
   $R(t)$ does not depend on the flow, so $F_a^i$ can be |
   calculated only once, on packet arrival, and we need not |
   recalculate the $E$ numbers or re-sort the queues. |
   The number $F_a^i$ is called the finish number of the packet. |
   It is just the value of $R(t)$ at the moment when the last bit |
   of the packet is sent out. |
|
   The maximal finish number in a flow is called the finish number |
   of the flow, and the minimal one is the "start number" of the flow. |
   Clearly, a flow is active if and only if $F_a \geq R$. |
|
   When a packet of length $L_i$ bits arrives at flow $a$ at time $t_i$, |
we calculate $F_a^i$ as: |
|
   If the flow was inactive ($F_a < R$): |
$F_a^i = R(t) + {L_i \over B r_a}$ |
otherwise |
$F_a^i = F_a + {L_i \over B r_a}$ |
|
These equations complete the algorithm specification. |
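 |
   A small worked example (illustrative only): let $B = 10^6$ bit/sec and |
   $r_a = 0.25$. If $R(t_i) = 100$ and the flow is idle ($F_a < R$), a |
   1000-bit packet gets |
   $F_a^i = 100 + {1000 \over 10^6 \cdot 0.25} = 100.004$; |
   a second 1000-bit packet arriving while the flow is still backlogged |
   gets $F_a^i = 100.004 + 0.004 = 100.008$, and neither value ever needs |
   to be recomputed, whatever the set of competing flows does later. |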
|
It looks pretty hairy, but there is a simple |
procedure for solving these equations. |
   See procedure csz_update(), which is a generalization of |
   the algorithm from Chapter 3 of S. Keshav's thesis, |
   "Efficient Implementation of Fair Queueing". |
|
NOTES. |
|
* We implement only the simplest variant of CSZ, |
   where flow-0 is an explicit 4-band priority fifo. |
This is bad, but we need a "peek" operation in addition |
to "dequeue" to implement complete CSZ. |
I do not want to do that, unless it is absolutely |
necessary. |
|
 * Primitive support for token bucket filtering |
   is present too. It directly contradicts CSZ, but |
   even though the Internet spans the globe ... :-) |
   "the edges of the network" really do exist. |
|
BUGS. |
|
* Fixed point arithmetic is overcomplicated, suboptimal and even |
wrong. Check it later. */ |
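 |
/* Below is a tiny illustrative userspace sketch (not part of the original |
 * scheduler): it models only the per-packet finish-number update derived |
 * above, using floating point instead of the fixed point arithmetic used |
 * by the real code. The names are made up for illustration. |
 */ |
#if 0 |
struct model_flow { |
	double rate;		/* r_a, fraction of the link bandwidth */ |
	double finish;		/* F_a, finish number of the flow */ |
}; |
 |
/* A packet of "len" bits arrives while the round number is R on a link of |
 * B bits/sec: F_a^i = max(F_a, R) + len / (B * r_a). |
 */ |
static double model_arrival(struct model_flow *f, double R, double B, |
			    unsigned int len) |
{ |
	double base = (f->finish >= R) ? f->finish : R;	/* was it active? */ |
 |
	f->finish = base + (double)len / (B * f->rate); |
	return f->finish; |
} |
#endif |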
|
|
/* This number is arbitrary */ |
|
#define CSZ_GUARANTEED 16 |
#define CSZ_FLOWS (CSZ_GUARANTEED+4) |
|
struct csz_head |
{ |
struct csz_head *snext; |
struct csz_head *sprev; |
struct csz_head *fnext; |
struct csz_head *fprev; |
}; |
|
struct csz_flow |
{ |
struct csz_head *snext; |
struct csz_head *sprev; |
struct csz_head *fnext; |
struct csz_head *fprev; |
|
/* Parameters */ |
struct tc_ratespec rate; |
struct tc_ratespec slice; |
u32 *L_tab; /* Lookup table for L/(B*r_a) values */ |
unsigned long limit; /* Maximal length of queue */ |
#ifdef CSZ_PLUS_TBF |
struct tc_ratespec peakrate; |
__u32 buffer; /* Depth of token bucket, normalized |
as L/(B*r_a) */ |
__u32 mtu; |
#endif |
|
/* Variables */ |
#ifdef CSZ_PLUS_TBF |
unsigned long tokens; /* Tokens number: usecs */ |
psched_time_t t_tbf; |
unsigned long R_tbf; |
int throttled; |
#endif |
unsigned peeked; |
unsigned long start; /* Finish number of the first skb */ |
unsigned long finish; /* Finish number of the flow */ |
|
struct sk_buff_head q; /* FIFO queue */ |
}; |
|
#define L2R(f,L) ((f)->L_tab[(L)>>(f)->slice.cell_log]) |
|
struct csz_sched_data |
{ |
/* Parameters */ |
unsigned char rate_log; /* fixed point position for rate; |
				 * really we do not need it */ |
unsigned char R_log; /* fixed point position for round number */ |
unsigned char delta_log; /* 1<<delta_log is maximal timeout in usecs; |
* 21 <-> 2.1sec is MAXIMAL value */ |
|
/* Variables */ |
struct tcf_proto *filter_list; |
u8 prio2band[TC_PRIO_MAX+1]; |
#ifdef CSZ_PLUS_TBF |
struct timer_list wd_timer; |
long wd_expires; |
#endif |
psched_time_t t_c; /* Time check-point */ |
unsigned long R_c; /* R-number check-point */ |
unsigned long rate; /* Current sum of rates of active flows */ |
struct csz_head s; /* Flows sorted by "start" */ |
struct csz_head f; /* Flows sorted by "finish" */ |
|
	struct sk_buff_head	other[4];/* Predicted (0) and the best effort |
classes (1,2,3) */ |
struct csz_flow flow[CSZ_GUARANTEED]; /* Array of flows */ |
}; |
|
/* These routines (csz_insert_finish and csz_insert_start) are |
   the most time consuming part of the whole algorithm. |
|
   We insert into a sorted list, so the time |
   is linear in the number of active flows in the worst case. |
   Note that we do not have a very large number of guaranteed flows, |
   so logarithmic algorithms (heap etc.) are useless: |
   they are slower than the linear one when the list length is <= 32. |
|
   A heap would make sense if we used WFQ for best effort |
   flows, but SFQ is a better choice in this case. |
*/ |
|
|
/* Insert flow "this" into the list "b" before the first |
   flow with a greater finish number. |
*/ |
|
#if 0 |
/* Scan forward */ |
extern __inline__ void csz_insert_finish(struct csz_head *b, |
struct csz_flow *this) |
{ |
struct csz_head *f = b->fnext; |
unsigned long finish = this->finish; |
|
while (f != b) { |
if (((struct csz_flow*)f)->finish - finish > 0) |
break; |
f = f->fnext; |
} |
this->fnext = f; |
this->fprev = f->fprev; |
this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this; |
} |
#else |
/* Scan backward */ |
extern __inline__ void csz_insert_finish(struct csz_head *b, |
struct csz_flow *this) |
{ |
struct csz_head *f = b->fprev; |
unsigned long finish = this->finish; |
|
while (f != b) { |
if (((struct csz_flow*)f)->finish - finish <= 0) |
break; |
f = f->fprev; |
} |
this->fnext = f->fnext; |
this->fprev = f; |
this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this; |
} |
#endif |
|
/* Insert flow "this" into the list "b" before the first |
   flow with a greater start number. |
*/ |
|
extern __inline__ void csz_insert_start(struct csz_head *b, |
struct csz_flow *this) |
{ |
struct csz_head *f = b->snext; |
unsigned long start = this->start; |
|
while (f != b) { |
if (((struct csz_flow*)f)->start - start > 0) |
break; |
f = f->snext; |
} |
this->snext = f; |
this->sprev = f->sprev; |
this->snext->sprev = this->sprev->snext = (struct csz_head*)this; |
} |
|
|
/* Calculate and return current round number. |
It is another time consuming part, but |
it is impossible to avoid it. |
|
   It costs O(N), which makes the whole algorithm useful only |
   for playing with a close-to-ideal fluid model. |
|
There exist less academic, but more practical modifications, |
which might have even better characteristics (WF2Q+, HPFQ, HFSC) |
*/ |
|
static unsigned long csz_update(struct Qdisc *sch) |
{ |
struct csz_sched_data *q = (struct csz_sched_data*)sch->data; |
struct csz_flow *a; |
unsigned long F; |
unsigned long tmp; |
psched_time_t now; |
unsigned long delay; |
unsigned long R_c; |
|
PSCHED_GET_TIME(now); |
delay = PSCHED_TDIFF_SAFE(now, q->t_c, 0, goto do_reset); |
|
if (delay>>q->delta_log) { |
do_reset: |
/* Delta is too large. |
It is possible if MTU/BW > 1<<q->delta_log |
(i.e. configuration error) or because of hardware |
fault. We have no choice... |
*/ |
qdisc_reset(sch); |
return 0; |
} |
|
q->t_c = now; |
|
for (;;) { |
a = (struct csz_flow*)q->f.fnext; |
|
/* No more active flows. Reset R and exit. */ |
if (a == (struct csz_flow*)&q->f) { |
#ifdef CSZ_DEBUG |
if (q->rate) { |
printk("csz_update: rate!=0 on inactive csz\n"); |
q->rate = 0; |
} |
#endif |
q->R_c = 0; |
return 0; |
} |
|
F = a->finish; |
|
#ifdef CSZ_DEBUG |
if (q->rate == 0) { |
printk("csz_update: rate=0 on active csz\n"); |
goto do_reset; |
} |
#endif |
|
/* |
* tmp = (t - q->t_c)/q->rate; |
*/ |
|
tmp = ((delay<<(31-q->delta_log))/q->rate)>>(31-q->delta_log+q->R_log); |
|
tmp += q->R_c; |
|
/* OK, this flow (and all flows with greater |
finish numbers) is still active */ |
if (F - tmp > 0) |
break; |
|
		/* It is no longer active */ |
|
a->fprev->fnext = a->fnext; |
a->fnext->fprev = a->fprev; |
|
/* |
* q->t_c += (F - q->R_c)*q->rate |
*/ |
|
tmp = ((F-q->R_c)*q->rate)<<q->R_log; |
R_c = F; |
q->rate -= a->slice.rate; |
|
if ((long)(delay - tmp) >= 0) { |
delay -= tmp; |
continue; |
} |
delay = 0; |
} |
|
q->R_c = tmp; |
return tmp; |
} |
|
unsigned csz_classify(struct sk_buff *skb, struct csz_sched_data *q) |
{ |
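	/* Classification is effectively a stub here: every packet is mapped |
	 * to flow 0, priority band 0 -- see how csz_enqueue() below handles |
	 * flow_id >= CSZ_GUARANTEED. |
	 */ |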
return CSZ_GUARANTEED; |
} |
|
static int |
csz_enqueue(struct sk_buff *skb, struct Qdisc* sch) |
{ |
struct csz_sched_data *q = (struct csz_sched_data *)sch->data; |
unsigned flow_id = csz_classify(skb, q); |
unsigned long R; |
int prio = 0; |
struct csz_flow *this; |
|
if (flow_id >= CSZ_GUARANTEED) { |
prio = flow_id - CSZ_GUARANTEED; |
flow_id = 0; |
} |
|
this = &q->flow[flow_id]; |
if (this->q.qlen >= this->limit || this->L_tab == NULL) { |
sch->stats.drops++; |
kfree_skb(skb); |
return NET_XMIT_DROP; |
} |
|
R = csz_update(sch); |
|
if ((long)(this->finish - R) >= 0) { |
/* It was active */ |
this->finish += L2R(this,skb->len); |
} else { |
/* It is inactive; activate it */ |
this->finish = R + L2R(this,skb->len); |
q->rate += this->slice.rate; |
csz_insert_finish(&q->f, this); |
} |
|
/* If this flow was empty, remember start number |
and insert it into start queue */ |
if (this->q.qlen == 0) { |
this->start = this->finish; |
csz_insert_start(&q->s, this); |
} |
if (flow_id) |
skb_queue_tail(&this->q, skb); |
else |
skb_queue_tail(&q->other[prio], skb); |
sch->q.qlen++; |
sch->stats.bytes += skb->len; |
sch->stats.packets++; |
return 0; |
} |
|
static __inline__ struct sk_buff * |
skb_dequeue_best(struct csz_sched_data * q) |
{ |
int i; |
struct sk_buff *skb; |
|
for (i=0; i<4; i++) { |
skb = skb_dequeue(&q->other[i]); |
if (skb) { |
q->flow[0].q.qlen--; |
return skb; |
} |
} |
return NULL; |
} |
|
static __inline__ struct sk_buff * |
skb_peek_best(struct csz_sched_data * q) |
{ |
int i; |
struct sk_buff *skb; |
|
for (i=0; i<4; i++) { |
skb = skb_peek(&q->other[i]); |
if (skb) |
return skb; |
} |
return NULL; |
} |
|
#ifdef CSZ_PLUS_TBF |
|
static void csz_watchdog(unsigned long arg) |
{ |
struct Qdisc *sch = (struct Qdisc*)arg; |
|
qdisc_wakeup(sch->dev); |
} |
|
static __inline__ void |
csz_move_queue(struct csz_flow *this, long delta) |
{ |
this->fprev->fnext = this->fnext; |
this->fnext->fprev = this->fprev; |
|
this->start += delta; |
this->finish += delta; |
|
csz_insert_finish(this); |
} |
|
static __inline__ int csz_enough_tokens(struct csz_sched_data *q, |
struct csz_flow *this, |
struct sk_buff *skb) |
{ |
long toks; |
long shift; |
psched_time_t now; |
|
PSCHED_GET_TIME(now); |
|
toks = PSCHED_TDIFF(now, t_tbf) + this->tokens - L2R(q,this,skb->len); |
|
shift = 0; |
if (this->throttled) { |
		/* Remember the a posteriori delay */ |
|
unsigned long R = csz_update(q); |
shift = R - this->R_tbf; |
this->R_tbf = R; |
} |
|
if (toks >= 0) { |
/* Now we have enough tokens to proceed */ |
|
this->tokens = toks <= this->depth ? toks : this->depth; |
this->t_tbf = now; |
|
if (!this->throttled) |
return 1; |
|
		/* Flow was throttled. Update its start and finish numbers |
		   with the delay calculated a posteriori. |
*/ |
|
this->throttled = 0; |
if (shift > 0) |
csz_move_queue(this, shift); |
return 1; |
} |
|
if (!this->throttled) { |
		/* Flow has just been throttled; remember the |
		   current round number to calculate the a posteriori delay |
*/ |
this->throttled = 1; |
this->R_tbf = csz_update(q); |
} |
|
	/* Move the whole queue to the time when it will be allowed to send. |
	   We should translate time to a round number, but that is impossible, |
	   so we make the most conservative estimate, i.e. we suppose |
	   that only this flow is active and, hence, R = t. |
	   Really toks <= R <= toks/r_a. |
 |
	   This a priori shift in R will be adjusted later to reflect |
	   the real delay. We cannot avoid it because: |
	   - a throttled flow continues to be active from the viewpoint |
	     of CSZ, so it would acquire the highest priority |
	     if the start numbers were not adjusted; |
	   - eventually, the finish number would become less than the round |
	     number and the flow would be declared inactive. |
	 */ |
|
toks = -toks; |
|
	/* Remember that we should start the watchdog */ |
if (toks < q->wd_expires) |
q->wd_expires = toks; |
|
toks >>= q->R_log; |
shift += toks; |
if (shift > 0) { |
this->R_tbf += toks; |
csz_move_queue(this, shift); |
} |
csz_insert_start(this); |
return 0; |
} |
#endif |
|
|
static struct sk_buff * |
csz_dequeue(struct Qdisc* sch) |
{ |
struct csz_sched_data *q = (struct csz_sched_data *)sch->data; |
struct sk_buff *skb; |
struct csz_flow *this; |
|
#ifdef CSZ_PLUS_TBF |
q->wd_expires = 0; |
#endif |
this = (struct csz_flow*)q->s.snext; |
|
while (this != (struct csz_flow*)&q->s) { |
|
/* First of all: unlink from start list */ |
this->sprev->snext = this->snext; |
this->snext->sprev = this->sprev; |
|
if (this != &q->flow[0]) { /* Guaranteed flow */ |
skb = __skb_dequeue(&this->q); |
if (skb) { |
#ifdef CSZ_PLUS_TBF |
if (this->depth) { |
if (!csz_enough_tokens(q, this, skb)) |
continue; |
} |
#endif |
if (this->q.qlen) { |
struct sk_buff *nskb = skb_peek(&this->q); |
this->start += L2R(this,nskb->len); |
csz_insert_start(&q->s, this); |
} |
sch->q.qlen--; |
return skb; |
} |
} else { /* Predicted or best effort flow */ |
skb = skb_dequeue_best(q); |
if (skb) { |
unsigned peeked = this->peeked; |
this->peeked = 0; |
|
if (--this->q.qlen) { |
struct sk_buff *nskb; |
unsigned dequeued = L2R(this,skb->len); |
|
				/* We did not get the same thing that we |
				   peeked at earlier; adjust the start number |
*/ |
if (peeked != dequeued && peeked) |
this->start += dequeued - peeked; |
|
nskb = skb_peek_best(q); |
peeked = L2R(this,nskb->len); |
this->start += peeked; |
this->peeked = peeked; |
csz_insert_start(&q->s, this); |
} |
sch->q.qlen--; |
return skb; |
} |
} |
} |
#ifdef CSZ_PLUS_TBF |
/* We are about to return no skb. |
	   Schedule the watchdog timer if this happened because of shaping. |
*/ |
if (q->wd_expires) { |
unsigned long delay = PSCHED_US2JIFFIE(q->wd_expires); |
if (delay == 0) |
delay = 1; |
mod_timer(&q->wd_timer, jiffies + delay); |
sch->stats.overlimits++; |
} |
#endif |
return NULL; |
} |
|
static void |
csz_reset(struct Qdisc* sch) |
{ |
struct csz_sched_data *q = (struct csz_sched_data *)sch->data; |
int i; |
|
for (i=0; i<4; i++) |
skb_queue_purge(&q->other[i]); |
|
for (i=0; i<CSZ_GUARANTEED; i++) { |
struct csz_flow *this = q->flow + i; |
skb_queue_purge(&this->q); |
this->snext = this->sprev = |
this->fnext = this->fprev = (struct csz_head*)this; |
this->start = this->finish = 0; |
} |
q->s.snext = q->s.sprev = &q->s; |
q->f.fnext = q->f.fprev = &q->f; |
q->R_c = 0; |
#ifdef CSZ_PLUS_TBF |
PSCHED_GET_TIME(&q->t_tbf); |
q->tokens = q->depth; |
del_timer(&q->wd_timer); |
#endif |
sch->q.qlen = 0; |
} |
|
static void |
csz_destroy(struct Qdisc* sch) |
{ |
struct csz_sched_data *q = (struct csz_sched_data *)sch->data; |
struct tcf_proto *tp; |
|
while ((tp = q->filter_list) != NULL) { |
q->filter_list = tp->next; |
tcf_destroy(tp); |
} |
|
MOD_DEC_USE_COUNT; |
} |
|
static int csz_init(struct Qdisc *sch, struct rtattr *opt) |
{ |
struct csz_sched_data *q = (struct csz_sched_data *)sch->data; |
struct rtattr *tb[TCA_CSZ_PTAB]; |
struct tc_csz_qopt *qopt; |
int i; |
|
rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)); |
if (tb[TCA_CSZ_PARMS-1] == NULL || |
RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*qopt)) |
return -EINVAL; |
qopt = RTA_DATA(tb[TCA_CSZ_PARMS-1]); |
|
q->R_log = qopt->R_log; |
q->delta_log = qopt->delta_log; |
for (i=0; i<=TC_PRIO_MAX; i++) { |
if (qopt->priomap[i] >= CSZ_FLOWS) |
return -EINVAL; |
q->prio2band[i] = qopt->priomap[i]; |
} |
|
for (i=0; i<4; i++) |
skb_queue_head_init(&q->other[i]); |
|
for (i=0; i<CSZ_GUARANTEED; i++) { |
struct csz_flow *this = q->flow + i; |
skb_queue_head_init(&this->q); |
this->snext = this->sprev = |
this->fnext = this->fprev = (struct csz_head*)this; |
this->start = this->finish = 0; |
} |
q->s.snext = q->s.sprev = &q->s; |
q->f.fnext = q->f.fprev = &q->f; |
q->R_c = 0; |
#ifdef CSZ_PLUS_TBF |
init_timer(&q->wd_timer); |
q->wd_timer.data = (unsigned long)sch; |
q->wd_timer.function = csz_watchdog; |
#endif |
MOD_INC_USE_COUNT; |
return 0; |
} |
|
static int csz_dump(struct Qdisc *sch, struct sk_buff *skb) |
{ |
struct csz_sched_data *q = (struct csz_sched_data *)sch->data; |
unsigned char *b = skb->tail; |
struct rtattr *rta; |
struct tc_csz_qopt opt; |
|
rta = (struct rtattr*)b; |
RTA_PUT(skb, TCA_OPTIONS, 0, NULL); |
|
opt.flows = CSZ_FLOWS; |
memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1); |
RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt); |
rta->rta_len = skb->tail - b; |
|
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static int csz_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, |
struct Qdisc **old) |
{ |
return -EINVAL; |
} |
|
static struct Qdisc * csz_leaf(struct Qdisc *sch, unsigned long cl) |
{ |
return NULL; |
} |
|
|
static unsigned long csz_get(struct Qdisc *sch, u32 classid) |
{ |
struct csz_sched_data *q = (struct csz_sched_data *)sch->data; |
unsigned long band = TC_H_MIN(classid) - 1; |
|
if (band >= CSZ_FLOWS) |
return 0; |
|
if (band < CSZ_GUARANTEED && q->flow[band].L_tab == NULL) |
return 0; |
|
return band+1; |
} |
|
static unsigned long csz_bind(struct Qdisc *sch, unsigned long parent, u32 classid) |
{ |
return csz_get(sch, classid); |
} |
|
|
static void csz_put(struct Qdisc *sch, unsigned long cl) |
{ |
return; |
} |
|
static int csz_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg) |
{ |
unsigned long cl = *arg; |
struct csz_sched_data *q = (struct csz_sched_data *)sch->data; |
struct rtattr *opt = tca[TCA_OPTIONS-1]; |
struct rtattr *tb[TCA_CSZ_PTAB]; |
struct tc_csz_copt *copt; |
|
rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)); |
if (tb[TCA_CSZ_PARMS-1] == NULL || |
RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*copt)) |
return -EINVAL; |
copt = RTA_DATA(tb[TCA_CSZ_PARMS-1]); |
|
if (tb[TCA_CSZ_RTAB-1] && |
RTA_PAYLOAD(tb[TCA_CSZ_RTAB-1]) < 1024) |
return -EINVAL; |
|
if (cl) { |
struct csz_flow *a; |
cl--; |
if (cl >= CSZ_FLOWS) |
return -ENOENT; |
if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL) |
return -EINVAL; |
|
a = &q->flow[cl]; |
|
spin_lock_bh(&sch->dev->queue_lock); |
#if 0 |
a->rate_log = copt->rate_log; |
#endif |
#ifdef CSZ_PLUS_TBF |
a->limit = copt->limit; |
a->rate = copt->rate; |
a->buffer = copt->buffer; |
a->mtu = copt->mtu; |
#endif |
|
if (tb[TCA_CSZ_RTAB-1]) |
memcpy(a->L_tab, RTA_DATA(tb[TCA_CSZ_RTAB-1]), 1024); |
|
spin_unlock_bh(&sch->dev->queue_lock); |
return 0; |
} |
/* NI */ |
return 0; |
} |
|
static int csz_delete(struct Qdisc *sch, unsigned long cl) |
{ |
struct csz_sched_data *q = (struct csz_sched_data *)sch->data; |
struct csz_flow *a; |
|
cl--; |
|
if (cl >= CSZ_FLOWS) |
return -ENOENT; |
if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL) |
return -EINVAL; |
|
a = &q->flow[cl]; |
|
spin_lock_bh(&sch->dev->queue_lock); |
a->fprev->fnext = a->fnext; |
a->fnext->fprev = a->fprev; |
a->sprev->snext = a->snext; |
a->snext->sprev = a->sprev; |
a->start = a->finish = 0; |
kfree(xchg(&q->flow[cl].L_tab, NULL)); |
spin_unlock_bh(&sch->dev->queue_lock); |
|
return 0; |
} |
|
static int csz_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) |
{ |
struct csz_sched_data *q = (struct csz_sched_data *)sch->data; |
unsigned char *b = skb->tail; |
struct rtattr *rta; |
struct tc_csz_copt opt; |
|
tcm->tcm_handle = sch->handle|cl; |
|
cl--; |
|
if (cl > CSZ_FLOWS) |
goto rtattr_failure; |
|
if (cl < CSZ_GUARANTEED) { |
struct csz_flow *f = &q->flow[cl]; |
|
if (f->L_tab == NULL) |
goto rtattr_failure; |
|
rta = (struct rtattr*)b; |
RTA_PUT(skb, TCA_OPTIONS, 0, NULL); |
|
opt.limit = f->limit; |
opt.rate = f->rate; |
opt.slice = f->slice; |
memset(&opt.peakrate, 0, sizeof(opt.peakrate)); |
#ifdef CSZ_PLUS_TBF |
opt.buffer = f->buffer; |
opt.mtu = f->mtu; |
#else |
opt.buffer = 0; |
opt.mtu = 0; |
#endif |
|
RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt); |
rta->rta_len = skb->tail - b; |
} |
|
return skb->len; |
|
rtattr_failure: |
skb_trim(skb, b - skb->data); |
return -1; |
} |
|
static void csz_walk(struct Qdisc *sch, struct qdisc_walker *arg) |
{ |
struct csz_sched_data *q = (struct csz_sched_data *)sch->data; |
int prio = 0; |
|
if (arg->stop) |
return; |
|
for (prio = 0; prio < CSZ_FLOWS; prio++) { |
if (arg->count < arg->skip) { |
arg->count++; |
continue; |
} |
if (prio < CSZ_GUARANTEED && q->flow[prio].L_tab == NULL) { |
arg->count++; |
continue; |
} |
if (arg->fn(sch, prio+1, arg) < 0) { |
arg->stop = 1; |
break; |
} |
arg->count++; |
} |
} |
|
static struct tcf_proto ** csz_find_tcf(struct Qdisc *sch, unsigned long cl) |
{ |
struct csz_sched_data *q = (struct csz_sched_data *)sch->data; |
|
if (cl) |
return NULL; |
|
return &q->filter_list; |
} |
|
struct Qdisc_class_ops csz_class_ops = |
{ |
csz_graft, |
csz_leaf, |
|
csz_get, |
csz_put, |
csz_change, |
csz_delete, |
csz_walk, |
|
csz_find_tcf, |
csz_bind, |
csz_put, |
|
csz_dump_class, |
}; |
|
struct Qdisc_ops csz_qdisc_ops = |
{ |
NULL, |
&csz_class_ops, |
"csz", |
sizeof(struct csz_sched_data), |
|
csz_enqueue, |
csz_dequeue, |
NULL, |
NULL, |
|
csz_init, |
csz_reset, |
csz_destroy, |
NULL /* csz_change */, |
|
csz_dump, |
}; |
|
|
#ifdef MODULE |
int init_module(void) |
{ |
return register_qdisc(&csz_qdisc_ops); |
} |
|
void cleanup_module(void) |
{ |
unregister_qdisc(&csz_qdisc_ops); |
} |
#endif |
MODULE_LICENSE("GPL"); |