OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

Compare Revisions

  • This comparison shows the changes necessary to convert path
    /or1k/trunk/linux/linux-2.4/net/sched
    from Rev 1275 to Rev 1765

Rev 1275 → Rev 1765

/cls_tcindex.c
0,0 → 1,509
/*
* net/sched/cls_tcindex.c Packet classifier for skb->tc_index
*
* Written 1998,1999 by Werner Almesberger, EPFL ICA
*/
 
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/pkt_sched.h>
#include <net/route.h>
 
 
/*
* Not quite sure if we need all the xchgs Alexey uses when accessing things.
* Can always add them later ... :)
*/
 
/*
* Passing parameters to the root seems to be done more awkwardly than really
* necessary. At least, u32 doesn't seem to use such dirty hacks. To be
* verified. FIXME.
*/
 
#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */
#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */
 
 
#if 1 /* control */
#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define DPRINTK(format,args...)
#endif
 
#if 0 /* data */
#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define D2PRINTK(format,args...)
#endif
 
 
#define PRIV(tp) ((struct tcindex_data *) (tp)->root)
 
 
struct tcindex_filter_result {
struct tcf_police *police;
struct tcf_result res;
};
 
struct tcindex_filter {
__u16 key;
struct tcindex_filter_result result;
struct tcindex_filter *next;
};
 
 
struct tcindex_data {
struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */
struct tcindex_filter **h; /* imperfect hash; only used if !perfect;
NULL if unused */
__u16 mask; /* AND key with mask */
int shift; /* shift ANDed key to the right */
int hash; /* hash table size; 0 if undefined */
int alloc_hash; /* allocated size */
int fall_through; /* 0: only classify if explicit match */
};
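 
/*
 * Illustrative sketch, not part of the original source: the classifier key
 * is (skb->tc_index & mask) >> shift, as computed in tcindex_classify()
 * below. With an assumed mask of 0x00F0 and shift of 4, a tc_index of
 * 0x0123 yields key 2. Whether that key indexes the perfect array or the
 * imperfect hash chains depends on how tcindex_change() sized the table
 * (see PERFECT_HASH_THRESHOLD above).
 */
#if 0 /* example only; this hypothetical helper is not used by the file */
static __u16 tcindex_example_key(struct tcindex_data *p, __u16 tc_index)
{
return (tc_index & p->mask) >> p->shift;
}
#endif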
 
 
static struct tcindex_filter_result *lookup(struct tcindex_data *p,__u16 key)
{
struct tcindex_filter *f;
 
if (p->perfect)
return p->perfect[key].res.class ? p->perfect+key : NULL;
if (!p->h)
return NULL;
for (f = p->h[key % p->hash]; f; f = f->next) {
if (f->key == key)
return &f->result;
}
return NULL;
}
 
 
static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res)
{
struct tcindex_data *p = PRIV(tp);
struct tcindex_filter_result *f;
 
D2PRINTK("tcindex_classify(skb %p,tp %p,res %p),p %p\n",skb,tp,res,p);
 
f = lookup(p,(skb->tc_index & p->mask) >> p->shift);
if (!f) {
if (!p->fall_through)
return -1;
res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle),
(skb->tc_index& p->mask) >> p->shift);
res->class = 0;
D2PRINTK("alg 0x%x\n",res->classid);
return 0;
}
*res = f->res;
D2PRINTK("map 0x%x\n",res->classid);
#ifdef CONFIG_NET_CLS_POLICE
if (f->police) {
int result;
 
result = tcf_police(skb,f->police);
D2PRINTK("police %d\n",res);
return result;
}
#endif
return 0;
}
 
 
static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle)
{
struct tcindex_data *p = PRIV(tp);
struct tcindex_filter_result *r;
 
DPRINTK("tcindex_get(tp %p,handle 0x%08x)\n",tp,handle);
if (p->perfect && handle >= p->alloc_hash)
return 0;
r = lookup(PRIV(tp),handle);
return r && r->res.class ? (unsigned long) r : 0;
}
 
 
static void tcindex_put(struct tcf_proto *tp, unsigned long f)
{
DPRINTK("tcindex_put(tp %p,f 0x%lx)\n",tp,f);
}
 
 
static int tcindex_init(struct tcf_proto *tp)
{
struct tcindex_data *p;
 
DPRINTK("tcindex_init(tp %p)\n",tp);
MOD_INC_USE_COUNT;
p = kmalloc(sizeof(struct tcindex_data),GFP_KERNEL);
if (!p) {
MOD_DEC_USE_COUNT;
return -ENOMEM;
}
tp->root = p;
p->perfect = NULL;
p->h = NULL;
p->hash = 0;
p->mask = 0xffff;
p->shift = 0;
p->fall_through = 1;
return 0;
}
 
 
static int tcindex_delete(struct tcf_proto *tp, unsigned long arg)
{
struct tcindex_data *p = PRIV(tp);
struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg;
struct tcindex_filter *f = NULL;
unsigned long cl;
 
DPRINTK("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n",tp,arg,p,f);
if (p->perfect) {
if (!r->res.class)
return -ENOENT;
} else {
int i;
struct tcindex_filter **walk = NULL;
 
for (i = 0; i < p->hash; i++)
for (walk = p->h+i; *walk; walk = &(*walk)->next)
if (&(*walk)->result == r)
goto found;
return -ENOENT;
 
found:
f = *walk;
tcf_tree_lock(tp);
*walk = f->next;
tcf_tree_unlock(tp);
}
cl = __cls_set_class(&r->res.class,0);
if (cl)
tp->q->ops->cl_ops->unbind_tcf(tp->q,cl);
#ifdef CONFIG_NET_CLS_POLICE
tcf_police_release(r->police);
#endif
if (f)
kfree(f);
return 0;
}
 
 
/*
* There are no parameters for tcindex_init, so we overload tcindex_change
*/
 
 
static int tcindex_change(struct tcf_proto *tp,unsigned long base,u32 handle,
struct rtattr **tca,unsigned long *arg)
{
struct tcindex_filter_result new_filter_result = {
NULL, /* no policing */
{ 0,0 }, /* no classification */
};
struct rtattr *opt = tca[TCA_OPTIONS-1];
struct rtattr *tb[TCA_TCINDEX_MAX];
struct tcindex_data *p = PRIV(tp);
struct tcindex_filter *f;
struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg;
struct tcindex_filter **walk;
int hash,shift;
__u16 mask;
 
DPRINTK("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p,"
"p %p,r %p\n",tp,handle,tca,arg,opt,p,r);
if (arg)
DPRINTK("*arg = 0x%lx\n",*arg);
if (!opt)
return 0;
if (rtattr_parse(tb,TCA_TCINDEX_MAX,RTA_DATA(opt),RTA_PAYLOAD(opt)) < 0)
return -EINVAL;
if (!tb[TCA_TCINDEX_HASH-1]) {
hash = p->hash;
} else {
if (RTA_PAYLOAD(tb[TCA_TCINDEX_HASH-1]) < sizeof(int))
return -EINVAL;
hash = *(int *) RTA_DATA(tb[TCA_TCINDEX_HASH-1]);
}
if (!tb[TCA_TCINDEX_MASK-1]) {
mask = p->mask;
} else {
if (RTA_PAYLOAD(tb[TCA_TCINDEX_MASK-1]) < sizeof(__u16))
return -EINVAL;
mask = *(__u16 *) RTA_DATA(tb[TCA_TCINDEX_MASK-1]);
}
if (!tb[TCA_TCINDEX_SHIFT-1])
shift = p->shift;
else {
if (RTA_PAYLOAD(tb[TCA_TCINDEX_SHIFT-1]) < sizeof(int))
return -EINVAL;
shift = *(int *) RTA_DATA(tb[TCA_TCINDEX_SHIFT-1]);
}
if (p->perfect && hash <= (mask >> shift))
return -EBUSY;
if (p->perfect && hash > p->alloc_hash)
return -EBUSY;
if (p->h && hash != p->alloc_hash)
return -EBUSY;
p->hash = hash;
p->mask = mask;
p->shift = shift;
if (tb[TCA_TCINDEX_FALL_THROUGH-1]) {
if (RTA_PAYLOAD(tb[TCA_TCINDEX_FALL_THROUGH-1]) < sizeof(int))
return -EINVAL;
p->fall_through =
*(int *) RTA_DATA(tb[TCA_TCINDEX_FALL_THROUGH-1]);
}
DPRINTK("classid/police %p/%p\n",tb[TCA_TCINDEX_CLASSID-1],
tb[TCA_TCINDEX_POLICE-1]);
if (!tb[TCA_TCINDEX_CLASSID-1] && !tb[TCA_TCINDEX_POLICE-1])
return 0;
if (!hash) {
if ((mask >> shift) < PERFECT_HASH_THRESHOLD) {
p->hash = (mask >> shift)+1;
} else {
p->hash = DEFAULT_HASH_SIZE;
}
}
if (!p->perfect && !p->h) {
p->alloc_hash = p->hash;
DPRINTK("hash %d mask %d\n",p->hash,p->mask);
if (p->hash > (mask >> shift)) {
p->perfect = kmalloc(p->hash*
sizeof(struct tcindex_filter_result),GFP_KERNEL);
if (!p->perfect)
return -ENOMEM;
memset(p->perfect, 0,
p->hash * sizeof(struct tcindex_filter_result));
} else {
p->h = kmalloc(p->hash*sizeof(struct tcindex_filter *),
GFP_KERNEL);
if (!p->h)
return -ENOMEM;
memset(p->h, 0, p->hash*sizeof(struct tcindex_filter *));
}
}
/*
* Note: this could be as restrictive as
* if (handle & ~(mask >> shift))
* but then, we'd fail handles that may become valid after some
* future mask change. While this is extremely unlikely to ever
* matter, the check below is safer (and also more
* backwards-compatible).
*/
if (p->perfect && handle >= p->alloc_hash)
return -EINVAL;
if (p->perfect) {
r = p->perfect+handle;
} else {
r = lookup(p,handle);
DPRINTK("r=%p\n",r);
if (!r)
r = &new_filter_result;
}
DPRINTK("r=%p\n",r);
if (tb[TCA_TCINDEX_CLASSID-1]) {
unsigned long cl = cls_set_class(tp,&r->res.class,0);
 
if (cl)
tp->q->ops->cl_ops->unbind_tcf(tp->q,cl);
r->res.classid = *(__u32 *) RTA_DATA(tb[TCA_TCINDEX_CLASSID-1]);
r->res.class = tp->q->ops->cl_ops->bind_tcf(tp->q,base,
r->res.classid);
if (!r->res.class) {
r->res.classid = 0;
return -ENOENT;
}
}
#ifdef CONFIG_NET_CLS_POLICE
{
struct tcf_police *police;
 
police = tb[TCA_TCINDEX_POLICE-1] ?
tcf_police_locate(tb[TCA_TCINDEX_POLICE-1],NULL) : NULL;
tcf_tree_lock(tp);
police = xchg(&r->police,police);
tcf_tree_unlock(tp);
tcf_police_release(police);
}
#endif
if (r != &new_filter_result)
return 0;
f = kmalloc(sizeof(struct tcindex_filter),GFP_KERNEL);
if (!f)
return -ENOMEM;
f->key = handle;
f->result = new_filter_result;
f->next = NULL;
for (walk = p->h+(handle % p->hash); *walk; walk = &(*walk)->next)
/* nothing */;
wmb();
*walk = f;
return 0;
}
 
 
static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
{
struct tcindex_data *p = PRIV(tp);
struct tcindex_filter *f,*next;
int i;
 
DPRINTK("tcindex_walk(tp %p,walker %p),p %p\n",tp,walker,p);
if (p->perfect) {
for (i = 0; i < p->hash; i++) {
if (!p->perfect[i].res.class)
continue;
if (walker->count >= walker->skip) {
if (walker->fn(tp,
(unsigned long) (p->perfect+i), walker)
< 0) {
walker->stop = 1;
return;
}
}
walker->count++;
}
}
if (!p->h)
return;
for (i = 0; i < p->hash; i++) {
for (f = p->h[i]; f; f = next) {
next = f->next;
if (walker->count >= walker->skip) {
if (walker->fn(tp,(unsigned long) &f->result,
walker) < 0) {
walker->stop = 1;
return;
}
}
walker->count++;
}
}
}
 
 
static int tcindex_destroy_element(struct tcf_proto *tp,
unsigned long arg, struct tcf_walker *walker)
{
return tcindex_delete(tp,arg);
}
 
 
static void tcindex_destroy(struct tcf_proto *tp)
{
struct tcindex_data *p = PRIV(tp);
struct tcf_walker walker;
 
DPRINTK("tcindex_destroy(tp %p),p %p\n",tp,p);
walker.count = 0;
walker.skip = 0;
walker.fn = &tcindex_destroy_element;
tcindex_walk(tp,&walker);
if (p->perfect)
kfree(p->perfect);
if (p->h)
kfree(p->h);
kfree(p);
tp->root = NULL;
MOD_DEC_USE_COUNT;
}
 
 
static int tcindex_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct tcindex_data *p = PRIV(tp);
struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh;
unsigned char *b = skb->tail;
struct rtattr *rta;
 
DPRINTK("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n",
tp,fh,skb,t,p,r,b);
DPRINTK("p->perfect %p p->h %p\n",p->perfect,p->h);
rta = (struct rtattr *) b;
RTA_PUT(skb,TCA_OPTIONS,0,NULL);
if (!fh) {
t->tcm_handle = ~0; /* whatever ... */
RTA_PUT(skb,TCA_TCINDEX_HASH,sizeof(p->hash),&p->hash);
RTA_PUT(skb,TCA_TCINDEX_MASK,sizeof(p->mask),&p->mask);
RTA_PUT(skb,TCA_TCINDEX_SHIFT,sizeof(p->shift),&p->shift);
RTA_PUT(skb,TCA_TCINDEX_FALL_THROUGH,sizeof(p->fall_through),
&p->fall_through);
} else {
if (p->perfect) {
t->tcm_handle = r-p->perfect;
} else {
struct tcindex_filter *f;
int i;
 
t->tcm_handle = 0;
for (i = 0; !t->tcm_handle && i < p->hash; i++) {
for (f = p->h[i]; !t->tcm_handle && f;
f = f->next) {
if (&f->result == r)
t->tcm_handle = f->key;
}
}
}
DPRINTK("handle = %d\n",t->tcm_handle);
if (r->res.class)
RTA_PUT(skb, TCA_TCINDEX_CLASSID, 4, &r->res.classid);
#ifdef CONFIG_NET_CLS_POLICE
if (r->police) {
struct rtattr *p_rta = (struct rtattr *) skb->tail;
 
RTA_PUT(skb,TCA_TCINDEX_POLICE,0,NULL);
if (tcf_police_dump(skb,r->police) < 0)
goto rtattr_failure;
p_rta->rta_len = skb->tail-(u8 *) p_rta;
}
#endif
}
rta->rta_len = skb->tail-b;
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
struct tcf_proto_ops cls_tcindex_ops = {
NULL,
"tcindex",
tcindex_classify,
tcindex_init,
tcindex_destroy,
 
tcindex_get,
tcindex_put,
tcindex_change,
tcindex_delete,
tcindex_walk,
tcindex_dump
};
 
 
#ifdef MODULE
int init_module(void)
{
return register_tcf_proto_ops(&cls_tcindex_ops);
}
 
void cleanup_module(void)
{
unregister_tcf_proto_ops(&cls_tcindex_ops);
}
#endif
MODULE_LICENSE("GPL");
/cls_route.c
0,0 → 1,635
/*
* net/sched/cls_route.c ROUTE4 classifier.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
 
#include <linux/module.h>
#include <linux/config.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
 
/*
1. For now we assume that route tags < 256.
This allows us to use direct table lookups instead of hash tables.
2. For now we assume that "from TAG" and "fromdev DEV" statements
are mutually exclusive.
3. "to TAG from ANY" has higher priority than "to ANY from XXX"
*/
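 
/*
 Illustrative note, not part of the original source: the 32-bit filter
 handle built by route4_change() below packs both realms. The low 16 bits
 hold the "to" tag (0..255), with bit 15 set when no "to" tag was given;
 the high 16 bits hold the "from" tag, or iif|0x8000 for "fromdev", or
 0xFFFF when the source is wildcarded. For example, "to 0x12 from 0x34"
 yields handle 0x00340012, which to_hash()/from_hash() map to bucket
 h1 = 0x12 of head->table[] and chain h2 = 4 (0x34 & 0xF) within it.
*/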
 
struct route4_fastmap
{
struct route4_filter *filter;
u32 id;
int iif;
};
 
struct route4_head
{
struct route4_fastmap fastmap[16];
struct route4_bucket *table[256+1];
};
 
struct route4_bucket
{
struct route4_filter *ht[16+16+1];
};
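 
/* Descriptive note, not in the original source: ht[] slots 0-15 hash the
 source realm, slots 16-31 hash the input interface, and slot 32 holds
 wildcard-source filters (see from_hash() below). */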
 
struct route4_filter
{
struct route4_filter *next;
u32 id;
int iif;
 
struct tcf_result res;
#ifdef CONFIG_NET_CLS_POLICE
struct tcf_police *police;
#endif
 
u32 handle;
struct route4_bucket *bkt;
};
 
#define ROUTE4_FAILURE ((struct route4_filter*)(-1L))
 
static __inline__ int route4_fastmap_hash(u32 id, int iif)
{
return id&0xF;
}
 
static void route4_reset_fastmap(struct net_device *dev, struct route4_head *head, u32 id)
{
spin_lock_bh(&dev->queue_lock);
memset(head->fastmap, 0, sizeof(head->fastmap));
spin_unlock_bh(&dev->queue_lock);
}
 
static void __inline__
route4_set_fastmap(struct route4_head *head, u32 id, int iif,
struct route4_filter *f)
{
int h = route4_fastmap_hash(id, iif);
head->fastmap[h].id = id;
head->fastmap[h].iif = iif;
head->fastmap[h].filter = f;
}
 
static __inline__ int route4_hash_to(u32 id)
{
return id&0xFF;
}
 
static __inline__ int route4_hash_from(u32 id)
{
return (id>>16)&0xF;
}
 
static __inline__ int route4_hash_iif(int iif)
{
return 16 + ((iif>>16)&0xF);
}
 
static __inline__ int route4_hash_wild(void)
{
return 32;
}
 
#ifdef CONFIG_NET_CLS_POLICE
#define IF_ROUTE_POLICE \
if (f->police) { \
int pol_res = tcf_police(skb, f->police); \
if (pol_res >= 0) return pol_res; \
dont_cache = 1; \
continue; \
} \
if (!dont_cache)
#else
#define IF_ROUTE_POLICE
#endif
 
 
static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res)
{
struct route4_head *head = (struct route4_head*)tp->root;
struct dst_entry *dst;
struct route4_bucket *b;
struct route4_filter *f;
#ifdef CONFIG_NET_CLS_POLICE
int dont_cache = 0;
#endif
u32 id, h;
int iif;
 
if ((dst = skb->dst) == NULL)
goto failure;
 
id = dst->tclassid;
if (head == NULL)
goto old_method;
 
iif = ((struct rtable*)dst)->key.iif;
 
h = route4_fastmap_hash(id, iif);
if (id == head->fastmap[h].id &&
iif == head->fastmap[h].iif &&
(f = head->fastmap[h].filter) != NULL) {
if (f == ROUTE4_FAILURE)
goto failure;
 
*res = f->res;
return 0;
}
 
h = route4_hash_to(id);
 
restart:
if ((b = head->table[h]) != NULL) {
f = b->ht[route4_hash_from(id)];
 
for ( ; f; f = f->next) {
if (f->id == id) {
*res = f->res;
IF_ROUTE_POLICE route4_set_fastmap(head, id, iif, f);
return 0;
}
}
 
for (f = b->ht[route4_hash_iif(iif)]; f; f = f->next) {
if (f->iif == iif) {
*res = f->res;
IF_ROUTE_POLICE route4_set_fastmap(head, id, iif, f);
return 0;
}
}
 
for (f = b->ht[route4_hash_wild()]; f; f = f->next) {
*res = f->res;
IF_ROUTE_POLICE route4_set_fastmap(head, id, iif, f);
return 0;
}
 
}
if (h < 256) {
h = 256;
id &= ~0xFFFF;
goto restart;
}
 
#ifdef CONFIG_NET_CLS_POLICE
if (!dont_cache)
#endif
route4_set_fastmap(head, id, iif, ROUTE4_FAILURE);
failure:
return -1;
 
old_method:
if (id && (TC_H_MAJ(id) == 0 ||
!(TC_H_MAJ(id^tp->q->handle)))) {
res->classid = id;
res->class = 0;
return 0;
}
return -1;
}
 
static u32 to_hash(u32 id)
{
u32 h = id&0xFF;
if (id&0x8000)
h += 256;
return h;
}
 
static u32 from_hash(u32 id)
{
id &= 0xFFFF;
if (id == 0xFFFF)
return 32;
if (!(id & 0x8000)) {
if (id > 255)
return 256;
return id&0xF;
}
return 16 + (id&0xF);
}
 
static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
{
struct route4_head *head = (struct route4_head*)tp->root;
struct route4_bucket *b;
struct route4_filter *f;
unsigned h1, h2;
 
if (!head)
return 0;
 
h1 = to_hash(handle);
if (h1 > 256)
return 0;
 
h2 = from_hash(handle>>16);
if (h2 > 32)
return 0;
 
if ((b = head->table[h1]) != NULL) {
for (f = b->ht[h2]; f; f = f->next)
if (f->handle == handle)
return (unsigned long)f;
}
return 0;
}
 
static void route4_put(struct tcf_proto *tp, unsigned long f)
{
}
 
static int route4_init(struct tcf_proto *tp)
{
MOD_INC_USE_COUNT;
return 0;
}
 
static void route4_destroy(struct tcf_proto *tp)
{
struct route4_head *head = xchg(&tp->root, NULL);
int h1, h2;
 
if (head == NULL) {
MOD_DEC_USE_COUNT;
return;
}
 
for (h1=0; h1<=256; h1++) {
struct route4_bucket *b;
 
if ((b = head->table[h1]) != NULL) {
for (h2=0; h2<=32; h2++) {
struct route4_filter *f;
 
while ((f = b->ht[h2]) != NULL) {
unsigned long cl;
 
b->ht[h2] = f->next;
if ((cl = __cls_set_class(&f->res.class, 0)) != 0)
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
#ifdef CONFIG_NET_CLS_POLICE
tcf_police_release(f->police);
#endif
kfree(f);
}
}
kfree(b);
}
}
kfree(head);
MOD_DEC_USE_COUNT;
}
 
static int route4_delete(struct tcf_proto *tp, unsigned long arg)
{
struct route4_head *head = (struct route4_head*)tp->root;
struct route4_filter **fp, *f = (struct route4_filter*)arg;
unsigned h = 0;
struct route4_bucket *b;
int i;
 
if (!head || !f)
return -EINVAL;
 
h = f->handle;
b = f->bkt;
 
for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) {
if (*fp == f) {
unsigned long cl;
 
tcf_tree_lock(tp);
*fp = f->next;
tcf_tree_unlock(tp);
 
route4_reset_fastmap(tp->q->dev, head, f->id);
 
if ((cl = cls_set_class(tp, &f->res.class, 0)) != 0)
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
 
#ifdef CONFIG_NET_CLS_POLICE
tcf_police_release(f->police);
#endif
kfree(f);
 
/* Strip tree */
 
for (i=0; i<=32; i++)
if (b->ht[i])
return 0;
 
/* OK, session has no flows */
tcf_tree_lock(tp);
head->table[to_hash(h)] = NULL;
tcf_tree_unlock(tp);
 
kfree(b);
return 0;
}
}
return 0;
}
 
static int route4_change(struct tcf_proto *tp, unsigned long base,
u32 handle,
struct rtattr **tca,
unsigned long *arg)
{
struct route4_head *head = tp->root;
struct route4_filter *f, *f1, **ins_f;
struct route4_bucket *b;
struct rtattr *opt = tca[TCA_OPTIONS-1];
struct rtattr *tb[TCA_ROUTE4_MAX];
unsigned h1, h2;
int err;
 
if (opt == NULL)
return handle ? -EINVAL : 0;
 
if (rtattr_parse(tb, TCA_ROUTE4_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0)
return -EINVAL;
 
if ((f = (struct route4_filter*)*arg) != NULL) {
/* Node exists: adjust only classid */
 
if (f->handle != handle && handle)
return -EINVAL;
if (tb[TCA_ROUTE4_CLASSID-1]) {
unsigned long cl;
 
f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]);
cl = cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid));
if (cl)
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
}
#ifdef CONFIG_NET_CLS_POLICE
if (tb[TCA_ROUTE4_POLICE-1]) {
struct tcf_police *police = tcf_police_locate(tb[TCA_ROUTE4_POLICE-1], tca[TCA_RATE-1]);
 
tcf_tree_lock(tp);
police = xchg(&f->police, police);
tcf_tree_unlock(tp);
 
tcf_police_release(police);
}
#endif
return 0;
}
 
/* Now more serious part... */
 
if (head == NULL) {
head = kmalloc(sizeof(struct route4_head), GFP_KERNEL);
if (head == NULL)
return -ENOBUFS;
memset(head, 0, sizeof(struct route4_head));
 
tcf_tree_lock(tp);
tp->root = head;
tcf_tree_unlock(tp);
}
 
f = kmalloc(sizeof(struct route4_filter), GFP_KERNEL);
if (f == NULL)
return -ENOBUFS;
 
memset(f, 0, sizeof(*f));
 
err = -EINVAL;
f->handle = 0x8000;
if (tb[TCA_ROUTE4_TO-1]) {
if (handle&0x8000)
goto errout;
if (RTA_PAYLOAD(tb[TCA_ROUTE4_TO-1]) < 4)
goto errout;
f->id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_TO-1]);
if (f->id > 0xFF)
goto errout;
f->handle = f->id;
}
if (tb[TCA_ROUTE4_FROM-1]) {
u32 sid;
if (tb[TCA_ROUTE4_IIF-1])
goto errout;
if (RTA_PAYLOAD(tb[TCA_ROUTE4_FROM-1]) < 4)
goto errout;
sid = (*(u32*)RTA_DATA(tb[TCA_ROUTE4_FROM-1]));
if (sid > 0xFF)
goto errout;
f->handle |= sid<<16;
f->id |= sid<<16;
} else if (tb[TCA_ROUTE4_IIF-1]) {
if (RTA_PAYLOAD(tb[TCA_ROUTE4_IIF-1]) < 4)
goto errout;
f->iif = *(u32*)RTA_DATA(tb[TCA_ROUTE4_IIF-1]);
if (f->iif > 0x7FFF)
goto errout;
f->handle |= (f->iif|0x8000)<<16;
} else
f->handle |= 0xFFFF<<16;
 
if (handle) {
f->handle |= handle&0x7F00;
if (f->handle != handle)
goto errout;
}
 
if (tb[TCA_ROUTE4_CLASSID-1]) {
if (RTA_PAYLOAD(tb[TCA_ROUTE4_CLASSID-1]) < 4)
goto errout;
f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]);
}
 
h1 = to_hash(f->handle);
if ((b = head->table[h1]) == NULL) {
err = -ENOBUFS;
b = kmalloc(sizeof(struct route4_bucket), GFP_KERNEL);
if (b == NULL)
goto errout;
memset(b, 0, sizeof(*b));
 
tcf_tree_lock(tp);
head->table[h1] = b;
tcf_tree_unlock(tp);
}
f->bkt = b;
 
err = -EEXIST;
h2 = from_hash(f->handle>>16);
for (ins_f = &b->ht[h2]; (f1=*ins_f) != NULL; ins_f = &f1->next) {
if (f->handle < f1->handle)
break;
if (f1->handle == f->handle)
goto errout;
}
 
cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid));
#ifdef CONFIG_NET_CLS_POLICE
if (tb[TCA_ROUTE4_POLICE-1])
f->police = tcf_police_locate(tb[TCA_ROUTE4_POLICE-1], tca[TCA_RATE-1]);
#endif
 
f->next = f1;
tcf_tree_lock(tp);
*ins_f = f;
tcf_tree_unlock(tp);
 
route4_reset_fastmap(tp->q->dev, head, f->id);
*arg = (unsigned long)f;
return 0;
 
errout:
if (f)
kfree(f);
return err;
}
 
static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct route4_head *head = tp->root;
unsigned h, h1;
 
if (head == NULL)
arg->stop = 1;
 
if (arg->stop)
return;
 
for (h = 0; h <= 256; h++) {
struct route4_bucket *b = head->table[h];
 
if (b) {
for (h1 = 0; h1 <= 32; h1++) {
struct route4_filter *f;
 
for (f = b->ht[h1]; f; f = f->next) {
if (arg->count < arg->skip) {
arg->count++;
continue;
}
if (arg->fn(tp, (unsigned long)f, arg) < 0) {
arg->stop = 1;
break;
}
arg->count++;
}
}
}
}
}
 
static int route4_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct route4_filter *f = (struct route4_filter*)fh;
unsigned char *b = skb->tail;
struct rtattr *rta;
u32 id;
 
if (f == NULL)
return skb->len;
 
t->tcm_handle = f->handle;
 
rta = (struct rtattr*)b;
RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
 
if (!(f->handle&0x8000)) {
id = f->id&0xFF;
RTA_PUT(skb, TCA_ROUTE4_TO, sizeof(id), &id);
}
if (f->handle&0x80000000) {
if ((f->handle>>16) != 0xFFFF)
RTA_PUT(skb, TCA_ROUTE4_IIF, sizeof(f->iif), &f->iif);
} else {
id = f->id>>16;
RTA_PUT(skb, TCA_ROUTE4_FROM, sizeof(id), &id);
}
if (f->res.classid)
RTA_PUT(skb, TCA_ROUTE4_CLASSID, 4, &f->res.classid);
#ifdef CONFIG_NET_CLS_POLICE
if (f->police) {
struct rtattr * p_rta = (struct rtattr*)skb->tail;
 
RTA_PUT(skb, TCA_ROUTE4_POLICE, 0, NULL);
 
if (tcf_police_dump(skb, f->police) < 0)
goto rtattr_failure;
 
p_rta->rta_len = skb->tail - (u8*)p_rta;
}
#endif
 
rta->rta_len = skb->tail - b;
#ifdef CONFIG_NET_CLS_POLICE
if (f->police) {
if (qdisc_copy_stats(skb, &f->police->stats))
goto rtattr_failure;
}
#endif
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
struct tcf_proto_ops cls_route4_ops = {
NULL,
"route",
route4_classify,
route4_init,
route4_destroy,
 
route4_get,
route4_put,
route4_change,
route4_delete,
route4_walk,
route4_dump
};
 
#ifdef MODULE
int init_module(void)
{
return register_tcf_proto_ops(&cls_route4_ops);
}
 
void cleanup_module(void)
{
unregister_tcf_proto_ops(&cls_route4_ops);
}
#endif
MODULE_LICENSE("GPL");
/cls_rsvp.h
0,0 → 1,698
/*
* net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
 
/*
Compared to the general packet classification problem,
RSVP needs only several relatively simple rules:
 
* (dst, protocol) are always specified,
so that we are able to hash them.
* src may be exact, or may be wildcard, so that
we can keep a hash table plus one wildcard entry.
* source port (or flow label) is important only if src is given.
 
IMPLEMENTATION.
 
We use a two-level hash table: the top level is keyed by
destination address and protocol ID; every bucket contains a list
of "rsvp sessions", identified by destination address, protocol and
DPI (= "Destination Port ID"): a triple (key, mask, offset).

Every session has a smaller hash table keyed by source address
(cf. RSVP flowspec) plus one wildcard entry for wildcard reservations.
Each of its buckets is again a list of "RSVP flows", selected by
source address and SPI (= "Source Port ID" here, rather than
"security parameter index"): a triple (key, mask, offset).
 
 
NOTE 1. All packets with IPv6 extension headers (except AH and ESP)
and all fragmented packets go to the best-effort traffic class.
 
 
NOTE 2. Two "port id"s seem to be redundant; rfc2207 requires
only one "Generalized Port Identifier". So for classic
ah, esp (and udp, tcp) both *pi should coincide, or one of them
should be a wildcard.

At first sight, this redundancy is just a waste of CPU
resources. But DPI and SPI add the possibility of assigning different
priorities to GPIs. See also note 4 about tunnels below.
 
 
NOTE 3. One complication is the case of tunneled packets.
We implement it as follows: if the first lookup
matches a special session with a non-zero "tunnelhdr" value,
the flowid doesn't contain the true flow ID, but the tunnel ID (1...255).
In this case, we pull tunnelhdr bytes and restart the lookup
with the tunnel ID added to the list of keys. Simple and stupid 8)8)
It's enough for PIMREG and IPIP.
 
 
NOTE 4. Two GPIs make it possible to parse even GRE packets.
F.e. DPI can select ETH_P_IP (and necessary flags to make
tunnelhdr correct) in GRE protocol field and SPI matches
GRE key. Is it not nice? 8)8)
 
 
Well, as a result, despite its simplicity, we get a pretty
powerful classification engine. */
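 
/*
 Illustrative note, not part of the original source: the lookup performed
 by rsvp_classify() below is therefore

 h1 = hash_dst(dst, protocol, tunnelid) -> session chain ht[h1]
 h2 = hash_src(src) -> filter chain s->ht[h2]

 with s->ht[16] reserved for wildcard-source reservations, and the whole
 procedure restarted with the tunnel ID mixed in once a matching filter
 has a non-zero tunnelhdr.
*/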
 
#include <linux/config.h>
 
struct rsvp_head
{
u32 tmap[256/32];
u32 hgenerator;
u8 tgenerator;
struct rsvp_session *ht[256];
};
 
struct rsvp_session
{
struct rsvp_session *next;
u32 dst[RSVP_DST_LEN];
struct tc_rsvp_gpi dpi;
u8 protocol;
u8 tunnelid;
/* 16 (src,sport) hash slots, and one wildcard source slot */
struct rsvp_filter *ht[16+1];
};
 
 
struct rsvp_filter
{
struct rsvp_filter *next;
u32 src[RSVP_DST_LEN];
struct tc_rsvp_gpi spi;
u8 tunnelhdr;
 
struct tcf_result res;
#ifdef CONFIG_NET_CLS_POLICE
struct tcf_police *police;
#endif
 
u32 handle;
struct rsvp_session *sess;
};
 
static __inline__ unsigned hash_dst(u32 *dst, u8 protocol, u8 tunnelid)
{
unsigned h = dst[RSVP_DST_LEN-1];
h ^= h>>16;
h ^= h>>8;
return (h ^ protocol ^ tunnelid) & 0xFF;
}
 
static __inline__ unsigned hash_src(u32 *src)
{
unsigned h = src[RSVP_DST_LEN-1];
h ^= h>>16;
h ^= h>>8;
h ^= h>>4;
return h & 0xF;
}
 
#ifdef CONFIG_NET_CLS_POLICE
#define RSVP_POLICE() \
if (f->police) { \
int pol_res = tcf_police(skb, f->police); \
if (pol_res < 0) continue; \
if (pol_res) return pol_res; \
}
#else
#define RSVP_POLICE()
#endif
 
 
static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res)
{
struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht;
struct rsvp_session *s;
struct rsvp_filter *f;
unsigned h1, h2;
u32 *dst, *src;
u8 protocol;
u8 tunnelid = 0;
u8 *xprt;
#if RSVP_DST_LEN == 4
struct ipv6hdr *nhptr = skb->nh.ipv6h;
#else
struct iphdr *nhptr = skb->nh.iph;
#endif
 
restart:
 
#if RSVP_DST_LEN == 4
src = &nhptr->saddr.s6_addr32[0];
dst = &nhptr->daddr.s6_addr32[0];
protocol = nhptr->nexthdr;
xprt = ((u8*)nhptr) + sizeof(struct ipv6hdr);
#else
src = &nhptr->saddr;
dst = &nhptr->daddr;
protocol = nhptr->protocol;
xprt = ((u8*)nhptr) + (nhptr->ihl<<2);
if (nhptr->frag_off&__constant_htons(IP_MF|IP_OFFSET))
return -1;
#endif
 
h1 = hash_dst(dst, protocol, tunnelid);
h2 = hash_src(src);
 
for (s = sht[h1]; s; s = s->next) {
if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
protocol == s->protocol &&
!(s->dpi.mask & (*(u32*)(xprt+s->dpi.offset)^s->dpi.key))
#if RSVP_DST_LEN == 4
&& dst[0] == s->dst[0]
&& dst[1] == s->dst[1]
&& dst[2] == s->dst[2]
#endif
&& tunnelid == s->tunnelid) {
 
for (f = s->ht[h2]; f; f = f->next) {
if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] &&
!(f->spi.mask & (*(u32*)(xprt+f->spi.offset)^f->spi.key))
#if RSVP_DST_LEN == 4
&& src[0] == f->src[0]
&& src[1] == f->src[1]
&& src[2] == f->src[2]
#endif
) {
*res = f->res;
 
RSVP_POLICE();
 
matched:
if (f->tunnelhdr == 0)
return 0;
 
tunnelid = f->res.classid;
nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr));
goto restart;
}
}
 
/* And wildcard bucket... */
for (f = s->ht[16]; f; f = f->next) {
*res = f->res;
RSVP_POLICE();
goto matched;
}
return -1;
}
}
return -1;
}
 
static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle)
{
struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht;
struct rsvp_session *s;
struct rsvp_filter *f;
unsigned h1 = handle&0xFF;
unsigned h2 = (handle>>8)&0xFF;
 
if (h2 > 16)
return 0;
 
for (s = sht[h1]; s; s = s->next) {
for (f = s->ht[h2]; f; f = f->next) {
if (f->handle == handle)
return (unsigned long)f;
}
}
return 0;
}
 
static void rsvp_put(struct tcf_proto *tp, unsigned long f)
{
}
 
static int rsvp_init(struct tcf_proto *tp)
{
struct rsvp_head *data;
 
MOD_INC_USE_COUNT;
data = kmalloc(sizeof(struct rsvp_head), GFP_KERNEL);
if (data) {
memset(data, 0, sizeof(struct rsvp_head));
tp->root = data;
return 0;
}
MOD_DEC_USE_COUNT;
return -ENOBUFS;
}
 
static void rsvp_destroy(struct tcf_proto *tp)
{
struct rsvp_head *data = xchg(&tp->root, NULL);
struct rsvp_session **sht;
int h1, h2;
 
if (data == NULL)
return;
 
sht = data->ht;
 
for (h1=0; h1<256; h1++) {
struct rsvp_session *s;
 
while ((s = sht[h1]) != NULL) {
sht[h1] = s->next;
 
for (h2=0; h2<=16; h2++) {
struct rsvp_filter *f;
 
while ((f = s->ht[h2]) != NULL) {
unsigned long cl;
 
s->ht[h2] = f->next;
if ((cl = __cls_set_class(&f->res.class, 0)) != 0)
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
#ifdef CONFIG_NET_CLS_POLICE
tcf_police_release(f->police);
#endif
kfree(f);
}
}
kfree(s);
}
}
kfree(data);
MOD_DEC_USE_COUNT;
}
 
static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
{
struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg;
unsigned h = f->handle;
struct rsvp_session **sp;
struct rsvp_session *s = f->sess;
int i;
 
for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) {
if (*fp == f) {
unsigned long cl;
 
 
tcf_tree_lock(tp);
*fp = f->next;
tcf_tree_unlock(tp);
 
if ((cl = cls_set_class(tp, &f->res.class, 0)) != 0)
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
 
#ifdef CONFIG_NET_CLS_POLICE
tcf_police_release(f->police);
#endif
 
kfree(f);
 
/* Strip tree */
 
for (i=0; i<=16; i++)
if (s->ht[i])
return 0;
 
/* OK, session has no flows */
for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF];
*sp; sp = &(*sp)->next) {
if (*sp == s) {
tcf_tree_lock(tp);
*sp = s->next;
tcf_tree_unlock(tp);
 
kfree(s);
return 0;
}
}
 
return 0;
}
}
return 0;
}
 
static unsigned gen_handle(struct tcf_proto *tp, unsigned salt)
{
struct rsvp_head *data = tp->root;
int i = 0xFFFF;
 
while (i-- > 0) {
u32 h;
if ((data->hgenerator += 0x10000) == 0)
data->hgenerator = 0x10000;
h = data->hgenerator|salt;
if (rsvp_get(tp, h) == 0)
return h;
}
return 0;
}
 
static int tunnel_bts(struct rsvp_head *data)
{
int n = data->tgenerator>>5;
u32 b = 1<<(data->tgenerator&0x1F);
if (data->tmap[n]&b)
return 0;
data->tmap[n] |= b;
return 1;
}
 
static void tunnel_recycle(struct rsvp_head *data)
{
struct rsvp_session **sht = data->ht;
u32 tmap[256/32];
int h1, h2;
 
memset(tmap, 0, sizeof(tmap));
 
for (h1=0; h1<256; h1++) {
struct rsvp_session *s;
for (s = sht[h1]; s; s = s->next) {
for (h2=0; h2<=16; h2++) {
struct rsvp_filter *f;
 
for (f = s->ht[h2]; f; f = f->next) {
if (f->tunnelhdr == 0)
continue;
data->tgenerator = f->res.classid;
tunnel_bts(data);
}
}
}
}
 
memcpy(data->tmap, tmap, sizeof(tmap));
}
 
static u32 gen_tunnel(struct rsvp_head *data)
{
int i, k;
 
for (k=0; k<2; k++) {
for (i=255; i>0; i--) {
if (++data->tgenerator == 0)
data->tgenerator = 1;
if (tunnel_bts(data))
return data->tgenerator;
}
tunnel_recycle(data);
}
return 0;
}
 
static int rsvp_change(struct tcf_proto *tp, unsigned long base,
u32 handle,
struct rtattr **tca,
unsigned long *arg)
{
struct rsvp_head *data = tp->root;
struct rsvp_filter *f, **fp;
struct rsvp_session *s, **sp;
struct tc_rsvp_pinfo *pinfo = NULL;
struct rtattr *opt = tca[TCA_OPTIONS-1];
struct rtattr *tb[TCA_RSVP_MAX];
unsigned h1, h2;
u32 *dst;
int err;
 
if (opt == NULL)
return handle ? -EINVAL : 0;
 
if (rtattr_parse(tb, TCA_RSVP_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0)
return -EINVAL;
 
if ((f = (struct rsvp_filter*)*arg) != NULL) {
/* Node exists: adjust only classid */
 
if (f->handle != handle && handle)
return -EINVAL;
if (tb[TCA_RSVP_CLASSID-1]) {
unsigned long cl;
 
f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]);
cl = cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid));
if (cl)
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
}
#ifdef CONFIG_NET_CLS_POLICE
if (tb[TCA_RSVP_POLICE-1]) {
struct tcf_police *police = tcf_police_locate(tb[TCA_RSVP_POLICE-1], tca[TCA_RATE-1]);
 
tcf_tree_lock(tp);
police = xchg(&f->police, police);
tcf_tree_unlock(tp);
 
tcf_police_release(police);
}
#endif
return 0;
}
 
/* Now more serious part... */
if (handle)
return -EINVAL;
if (tb[TCA_RSVP_DST-1] == NULL)
return -EINVAL;
 
f = kmalloc(sizeof(struct rsvp_filter), GFP_KERNEL);
if (f == NULL)
return -ENOBUFS;
 
memset(f, 0, sizeof(*f));
h2 = 16;
if (tb[TCA_RSVP_SRC-1]) {
err = -EINVAL;
if (RTA_PAYLOAD(tb[TCA_RSVP_SRC-1]) != sizeof(f->src))
goto errout;
memcpy(f->src, RTA_DATA(tb[TCA_RSVP_SRC-1]), sizeof(f->src));
h2 = hash_src(f->src);
}
if (tb[TCA_RSVP_PINFO-1]) {
err = -EINVAL;
if (RTA_PAYLOAD(tb[TCA_RSVP_PINFO-1]) < sizeof(struct tc_rsvp_pinfo))
goto errout;
pinfo = RTA_DATA(tb[TCA_RSVP_PINFO-1]);
f->spi = pinfo->spi;
f->tunnelhdr = pinfo->tunnelhdr;
}
if (tb[TCA_RSVP_CLASSID-1]) {
err = -EINVAL;
if (RTA_PAYLOAD(tb[TCA_RSVP_CLASSID-1]) != 4)
goto errout;
f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]);
}
 
err = -EINVAL;
if (RTA_PAYLOAD(tb[TCA_RSVP_DST-1]) != sizeof(f->src))
goto errout;
dst = RTA_DATA(tb[TCA_RSVP_DST-1]);
h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0);
 
err = -ENOMEM;
if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0)
goto errout;
 
if (f->tunnelhdr) {
err = -EINVAL;
if (f->res.classid > 255)
goto errout;
 
err = -ENOMEM;
if (f->res.classid == 0 &&
(f->res.classid = gen_tunnel(data)) == 0)
goto errout;
}
 
for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) {
if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
pinfo && pinfo->protocol == s->protocol &&
memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0
#if RSVP_DST_LEN == 4
&& dst[0] == s->dst[0]
&& dst[1] == s->dst[1]
&& dst[2] == s->dst[2]
#endif
&& pinfo->tunnelid == s->tunnelid) {
 
insert:
/* OK, we found appropriate session */
 
fp = &s->ht[h2];
 
f->sess = s;
if (f->tunnelhdr == 0)
cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid));
#ifdef CONFIG_NET_CLS_POLICE
if (tb[TCA_RSVP_POLICE-1])
f->police = tcf_police_locate(tb[TCA_RSVP_POLICE-1], tca[TCA_RATE-1]);
#endif
 
for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next)
if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask)
break;
f->next = *fp;
wmb();
*fp = f;
 
*arg = (unsigned long)f;
return 0;
}
}
 
/* No session found. Create new one. */
 
err = -ENOBUFS;
s = kmalloc(sizeof(struct rsvp_session), GFP_KERNEL);
if (s == NULL)
goto errout;
memset(s, 0, sizeof(*s));
memcpy(s->dst, dst, sizeof(s->dst));
 
if (pinfo) {
s->dpi = pinfo->dpi;
s->protocol = pinfo->protocol;
s->tunnelid = pinfo->tunnelid;
}
for (sp = &data->ht[h1]; *sp; sp = &(*sp)->next) {
if (((*sp)->dpi.mask&s->dpi.mask) != s->dpi.mask)
break;
}
s->next = *sp;
wmb();
*sp = s;
goto insert;
 
errout:
if (f)
kfree(f);
return err;
}
 
static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct rsvp_head *head = tp->root;
unsigned h, h1;
 
if (arg->stop)
return;
 
for (h = 0; h < 256; h++) {
struct rsvp_session *s;
 
for (s = head->ht[h]; s; s = s->next) {
for (h1 = 0; h1 <= 16; h1++) {
struct rsvp_filter *f;
 
for (f = s->ht[h1]; f; f = f->next) {
if (arg->count < arg->skip) {
arg->count++;
continue;
}
if (arg->fn(tp, (unsigned long)f, arg) < 0) {
arg->stop = 1;
break;
}
arg->count++;
}
}
}
}
}
 
static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct rsvp_filter *f = (struct rsvp_filter*)fh;
struct rsvp_session *s;
unsigned char *b = skb->tail;
struct rtattr *rta;
struct tc_rsvp_pinfo pinfo;
 
if (f == NULL)
return skb->len;
s = f->sess;
 
t->tcm_handle = f->handle;
 
 
rta = (struct rtattr*)b;
RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
 
RTA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst);
pinfo.dpi = s->dpi;
pinfo.spi = f->spi;
pinfo.protocol = s->protocol;
pinfo.tunnelid = s->tunnelid;
pinfo.tunnelhdr = f->tunnelhdr;
RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo);
if (f->res.classid)
RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid);
if (((f->handle>>8)&0xFF) != 16)
RTA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src);
#ifdef CONFIG_NET_CLS_POLICE
if (f->police) {
struct rtattr * p_rta = (struct rtattr*)skb->tail;
 
RTA_PUT(skb, TCA_RSVP_POLICE, 0, NULL);
 
if (tcf_police_dump(skb, f->police) < 0)
goto rtattr_failure;
 
p_rta->rta_len = skb->tail - (u8*)p_rta;
}
#endif
 
rta->rta_len = skb->tail - b;
#ifdef CONFIG_NET_CLS_POLICE
if (f->police) {
if (qdisc_copy_stats(skb, &f->police->stats))
goto rtattr_failure;
}
#endif
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
struct tcf_proto_ops RSVP_OPS = {
NULL,
RSVP_ID,
rsvp_classify,
rsvp_init,
rsvp_destroy,
 
rsvp_get,
rsvp_put,
rsvp_change,
rsvp_delete,
rsvp_walk,
rsvp_dump
};
 
#ifdef MODULE
int init_module(void)
{
return register_tcf_proto_ops(&RSVP_OPS);
}
 
void cleanup_module(void)
{
unregister_tcf_proto_ops(&RSVP_OPS);
}
#endif
/sch_cbq.c
0,0 → 1,2115
/*
* net/sched/sch_cbq.c Class-Based Queueing discipline.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
*/
 
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
 
 
/* Class-Based Queueing (CBQ) algorithm.
=======================================
 
Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
Management Models for Packet Networks",
IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
 
[2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
 
[3] Sally Floyd, "Notes on Class-Based Queueing: Setting
Parameters", 1996
 
[4] Sally Floyd and Michael Speer, "Experimental Results
for Class-Based Queueing", 1998, not published.
 
-----------------------------------------------------------------------
 
Algorithm skeleton was taken from NS simulator cbq.cc.
If someone wants to check this code against the LBL version,
he should take into account that ONLY the skeleton was borrowed,
the implementation is different. Particularly:
 
--- The WRR algorithm is different. Our version looks more
reasonable (I hope) and works when quanta are allowed to be
less than the MTU, which is always the case when real-time classes
have small rates. Note that the statement of [3] is
incomplete; delay may actually be estimated even if the class
per-round allotment is less than the MTU. Namely, if the per-round
allotment is W*r_i, and r_1+...+r_k = r < 1
 
delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
 
In the worst case we have IntServ estimate with D = W*r+k*MTU
and C = MTU*r. The proof (if correct at all) is trivial.
 
 
--- It seems that cbq-2.0 is not very accurate. At least, I cannot
interpret some places, which look like wrong translations
from NS. Anyone is advised to find these differences
and explain to me why I am wrong 8).
 
--- Linux has no EOI event, so we cannot estimate true class
idle time. The workaround is to consider the next dequeue event
as a sign that the previous packet is finished. This is wrong because of
internal device queueing, but on a permanently loaded link it is true.
Moreover, combined with the clock integrator, this scheme looks
very close to an ideal solution. */
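 
/* Illustrative numeric instance of the bound above, not part of the
 original source and assuming the bracket denotes rounding up: with two
 classes of rate share r_1 = r_2 = 0.25 (so r = 0.5, k = 2), a per-round
 base W = 3000 bytes and MTU = 1500 bytes, the per-round allotment
 W*r_1 = 750 bytes is below the MTU, and

 delay_1 <= (ceil(1500/750)*1500 + 1500 + 2*1500)/B = 7500/B,

 i.e. roughly 60 ms if B is 1 Mbit/s. */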
 
struct cbq_sched_data;
 
 
struct cbq_class
{
struct cbq_class *next; /* hash table link */
struct cbq_class *next_alive; /* next class with backlog in this priority band */
 
/* Parameters */
u32 classid;
unsigned char priority; /* class priority */
unsigned char priority2; /* priority to be used after overlimit */
unsigned char ewma_log; /* time constant for idle time calculation */
unsigned char ovl_strategy;
#ifdef CONFIG_NET_CLS_POLICE
unsigned char police;
#endif
 
u32 defmap;
 
/* Link-sharing scheduler parameters */
long maxidle; /* Class parameters: see below. */
long offtime;
long minidle;
u32 avpkt;
struct qdisc_rate_table *R_tab;
 
/* Overlimit strategy parameters */
void (*overlimit)(struct cbq_class *cl);
long penalty;
 
/* General scheduler (WRR) parameters */
long allot;
long quantum; /* Allotment per WRR round */
long weight; /* Relative allotment: see below */
 
struct Qdisc *qdisc; /* Ptr to CBQ discipline */
struct cbq_class *split; /* Ptr to split node */
struct cbq_class *share; /* Ptr to LS parent in the class tree */
struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
struct cbq_class *borrow; /* NULL if class is bandwidth limited;
parent otherwise */
struct cbq_class *sibling; /* Sibling chain */
struct cbq_class *children; /* Pointer to children chain */
 
struct Qdisc *q; /* Elementary queueing discipline */
 
 
/* Variables */
unsigned char cpriority; /* Effective priority */
unsigned char delayed;
unsigned char level; /* level of the class in hierarchy:
0 for leaf classes, and maximal
level of children + 1 for nodes.
*/
 
psched_time_t last; /* Last end of service */
psched_time_t undertime;
long avgidle;
long deficit; /* Saved deficit for WRR */
unsigned long penalized;
struct tc_stats stats;
struct tc_cbq_xstats xstats;
 
struct tcf_proto *filter_list;
 
int refcnt;
int filters;
 
struct cbq_class *defaults[TC_PRIO_MAX+1];
};
 
struct cbq_sched_data
{
struct cbq_class *classes[16]; /* Hash table of all classes */
int nclasses[TC_CBQ_MAXPRIO+1];
unsigned quanta[TC_CBQ_MAXPRIO+1];
 
struct cbq_class link;
 
unsigned activemask;
struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
with backlog */
 
#ifdef CONFIG_NET_CLS_POLICE
struct cbq_class *rx_class;
#endif
struct cbq_class *tx_class;
struct cbq_class *tx_borrowed;
int tx_len;
psched_time_t now; /* Cached timestamp */
psched_time_t now_rt; /* Cached real time */
unsigned pmask;
 
struct timer_list delay_timer;
struct timer_list wd_timer; /* Watchdog timer,
started when CBQ has
backlog, but cannot
transmit just now */
long wd_expires;
int toplevel;
u32 hgenerator;
};
 
 
#define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log])
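/* The L2T ("length to time") macro above looks up the transmission time of
 a len-byte packet in the class's rate table R_tab (descriptive note, not
 in the original source). */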
 
 
static __inline__ unsigned cbq_hash(u32 h)
{
h ^= h>>8;
h ^= h>>4;
return h&0xF;
}
 
static __inline__ struct cbq_class *
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
{
struct cbq_class *cl;
 
for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next)
if (cl->classid == classid)
return cl;
return NULL;
}
 
#ifdef CONFIG_NET_CLS_POLICE
 
static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
{
struct cbq_class *cl, *new;
 
for (cl = this->tparent; cl; cl = cl->tparent)
if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
return new;
 
return NULL;
}
 
#endif
 
/* Classify packet. The procedure is pretty complicated, but
it allows us to combine link sharing and priority scheduling
transparently.
 
Namely, you can put link sharing rules (f.e. route based) at the root of CBQ,
so that it resolves to split nodes. Then packets are classified
by logical priority, or a more specific classifier may be attached
to the split node.
*/
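 
/* Descriptive note, not in the original source: when the attached filter
 returns a classid with a zero major number, the minor number is treated
 as a logical priority and resolved through the split node's defaults[]
 table (defmap below), falling back to the TC_PRIO_BESTEFFORT slot. */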
 
static struct cbq_class *
cbq_classify(struct sk_buff *skb, struct Qdisc *sch)
{
struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
struct cbq_class *head = &q->link;
struct cbq_class **defmap;
struct cbq_class *cl = NULL;
u32 prio = skb->priority;
struct tcf_result res;
 
/*
* Step 1. If skb->priority points to one of our classes, use it.
*/
if (TC_H_MAJ(prio^sch->handle) == 0 &&
(cl = cbq_class_lookup(q, prio)) != NULL)
return cl;
 
for (;;) {
int result = 0;
 
defmap = head->defaults;
 
/*
* Step 2+n. Apply classifier.
*/
if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0)
goto fallback;
 
if ((cl = (void*)res.class) == NULL) {
if (TC_H_MAJ(res.classid))
cl = cbq_class_lookup(q, res.classid);
else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
cl = defmap[TC_PRIO_BESTEFFORT];
 
if (cl == NULL || cl->level >= head->level)
goto fallback;
}
 
#ifdef CONFIG_NET_CLS_POLICE
switch (result) {
case TC_POLICE_RECLASSIFY:
return cbq_reclassify(skb, cl);
case TC_POLICE_SHOT:
return NULL;
default:
break;
}
#endif
if (cl->level == 0)
return cl;
 
/*
* Step 3+n. If the classifier selected a link sharing class,
* apply the agency-specific classifier.
* Repeat this procedure until we hit a leaf node.
*/
head = cl;
}
 
fallback:
cl = head;
 
/*
* Step 4. No success...
*/
if (TC_H_MAJ(prio) == 0 &&
!(cl = head->defaults[prio&TC_PRIO_MAX]) &&
!(cl = head->defaults[TC_PRIO_BESTEFFORT]))
return head;
 
return cl;
}
 
/*
A packet has just been enqueued on an empty class.
cbq_activate_class adds the class to the tail of the active class list
of its priority band.
*/
 
static __inline__ void cbq_activate_class(struct cbq_class *cl)
{
struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
int prio = cl->cpriority;
struct cbq_class *cl_tail;
 
cl_tail = q->active[prio];
q->active[prio] = cl;
 
if (cl_tail != NULL) {
cl->next_alive = cl_tail->next_alive;
cl_tail->next_alive = cl;
} else {
cl->next_alive = cl;
q->activemask |= (1<<prio);
}
}
 
/*
Unlink class from active chain.
Note that this same procedure is done directly in cbq_dequeue*
during round-robin procedure.
*/
 
static void cbq_deactivate_class(struct cbq_class *this)
{
struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data;
int prio = this->cpriority;
struct cbq_class *cl;
struct cbq_class *cl_prev = q->active[prio];
 
do {
cl = cl_prev->next_alive;
if (cl == this) {
cl_prev->next_alive = cl->next_alive;
cl->next_alive = NULL;
 
if (cl == q->active[prio]) {
q->active[prio] = cl_prev;
if (cl == q->active[prio]) {
q->active[prio] = NULL;
q->activemask &= ~(1<<prio);
return;
}
}
 
cl = cl_prev->next_alive;
return;
}
} while ((cl_prev = cl) != q->active[prio]);
}
 
static void
cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
int toplevel = q->toplevel;
 
if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
psched_time_t now;
psched_tdiff_t incr;
 
PSCHED_GET_TIME(now);
incr = PSCHED_TDIFF(now, q->now_rt);
PSCHED_TADD2(q->now, incr, now);
 
do {
if (PSCHED_TLESS(cl->undertime, now)) {
q->toplevel = cl->level;
return;
}
} while ((cl=cl->borrow) != NULL && toplevel > cl->level);
}
}
 
static int
cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
struct cbq_class *cl = cbq_classify(skb, sch);
int len = skb->len;
int ret = NET_XMIT_POLICED;
 
#ifdef CONFIG_NET_CLS_POLICE
q->rx_class = cl;
#endif
if (cl) {
#ifdef CONFIG_NET_CLS_POLICE
cl->q->__parent = sch;
#endif
if ((ret = cl->q->enqueue(skb, cl->q)) == 0) {
sch->q.qlen++;
sch->stats.packets++;
sch->stats.bytes+=len;
cbq_mark_toplevel(q, cl);
if (!cl->next_alive)
cbq_activate_class(cl);
return 0;
}
}
 
sch->stats.drops++;
if (cl == NULL)
kfree_skb(skb);
else {
cbq_mark_toplevel(q, cl);
cl->stats.drops++;
}
return ret;
}
 
static int
cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
struct cbq_class *cl;
int ret;
 
if ((cl = q->tx_class) == NULL) {
kfree_skb(skb);
sch->stats.drops++;
return NET_XMIT_CN;
}
q->tx_class = NULL;
 
cbq_mark_toplevel(q, cl);
 
#ifdef CONFIG_NET_CLS_POLICE
q->rx_class = cl;
cl->q->__parent = sch;
#endif
if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
sch->q.qlen++;
if (!cl->next_alive)
cbq_activate_class(cl);
return 0;
}
sch->stats.drops++;
cl->stats.drops++;
return ret;
}
 
/* Overlimit actions */
 
/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
 
static void cbq_ovl_classic(struct cbq_class *cl)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
 
if (!cl->delayed) {
delay += cl->offtime;
 
/*
Class goes to sleep, so that it will have no
chance to work avgidle. Let's forgive it 8)
 
BTW cbq-2.0 has a bug in this
place: apparently they forgot to shift it by cl->ewma_log.
*/
if (cl->avgidle < 0)
delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
if (cl->avgidle < cl->minidle)
cl->avgidle = cl->minidle;
if (delay <= 0)
delay = 1;
PSCHED_TADD2(q->now, delay, cl->undertime);
 
cl->xstats.overactions++;
cl->delayed = 1;
}
if (q->wd_expires == 0 || q->wd_expires > delay)
q->wd_expires = delay;
 
/* Dirty work! We must schedule wakeups based on
real available rate, rather than leaf rate,
which may be tiny (even zero).
*/
if (q->toplevel == TC_CBQ_MAXLEVEL) {
struct cbq_class *b;
psched_tdiff_t base_delay = q->wd_expires;
 
for (b = cl->borrow; b; b = b->borrow) {
delay = PSCHED_TDIFF(b->undertime, q->now);
if (delay < base_delay) {
if (delay <= 0)
delay = 1;
base_delay = delay;
}
}
 
q->wd_expires = base_delay;
}
}
 
/* TC_CBQ_OVL_RCLASSIC: penalize classes in the hierarchy by offtime when
they go overlimit
*/
 
static void cbq_ovl_rclassic(struct cbq_class *cl)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
struct cbq_class *this = cl;
 
do {
if (cl->level > q->toplevel) {
cl = NULL;
break;
}
} while ((cl = cl->borrow) != NULL);
 
if (cl == NULL)
cl = this;
cbq_ovl_classic(cl);
}
 
/* TC_CBQ_OVL_DELAY: delay the class until it goes underlimit */
 
static void cbq_ovl_delay(struct cbq_class *cl)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
 
if (!cl->delayed) {
unsigned long sched = jiffies;
 
delay += cl->offtime;
if (cl->avgidle < 0)
delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
if (cl->avgidle < cl->minidle)
cl->avgidle = cl->minidle;
PSCHED_TADD2(q->now, delay, cl->undertime);
 
if (delay > 0) {
sched += PSCHED_US2JIFFIE(delay) + cl->penalty;
cl->penalized = sched;
cl->cpriority = TC_CBQ_MAXPRIO;
q->pmask |= (1<<TC_CBQ_MAXPRIO);
if (del_timer(&q->delay_timer) &&
(long)(q->delay_timer.expires - sched) > 0)
q->delay_timer.expires = sched;
add_timer(&q->delay_timer);
cl->delayed = 1;
cl->xstats.overactions++;
return;
}
delay = 1;
}
if (q->wd_expires == 0 || q->wd_expires > delay)
q->wd_expires = delay;
}
 
/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
 
static void cbq_ovl_lowprio(struct cbq_class *cl)
{
struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
 
cl->penalized = jiffies + cl->penalty;
 
if (cl->cpriority != cl->priority2) {
cl->cpriority = cl->priority2;
q->pmask |= (1<<cl->cpriority);
cl->xstats.overactions++;
}
cbq_ovl_classic(cl);
}
 
/* TC_CBQ_OVL_DROP: penalize class by dropping */
 
static void cbq_ovl_drop(struct cbq_class *cl)
{
if (cl->q->ops->drop)
if (cl->q->ops->drop(cl->q))
cl->qdisc->q.qlen--;
cl->xstats.overactions++;
cbq_ovl_classic(cl);
}
 
static void cbq_watchdog(unsigned long arg)
{
struct Qdisc *sch = (struct Qdisc*)arg;
 
sch->flags &= ~TCQ_F_THROTTLED;
netif_schedule(sch->dev);
}
 
static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio)
{
struct cbq_class *cl;
struct cbq_class *cl_prev = q->active[prio];
unsigned long now = jiffies;
unsigned long sched = now;
 
if (cl_prev == NULL)
return now;
 
do {
cl = cl_prev->next_alive;
if ((long)(now - cl->penalized) > 0) {
cl_prev->next_alive = cl->next_alive;
cl->next_alive = NULL;
cl->cpriority = cl->priority;
cl->delayed = 0;
cbq_activate_class(cl);
 
if (cl == q->active[prio]) {
q->active[prio] = cl_prev;
if (cl == q->active[prio]) {
q->active[prio] = NULL;
return 0;
}
}
 
cl = cl_prev->next_alive;
} else if ((long)(sched - cl->penalized) > 0)
sched = cl->penalized;
} while ((cl_prev = cl) != q->active[prio]);
 
return (long)(sched - now);
}
 
static void cbq_undelay(unsigned long arg)
{
struct Qdisc *sch = (struct Qdisc*)arg;
struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
long delay = 0;
unsigned pmask;
 
pmask = q->pmask;
q->pmask = 0;
 
while (pmask) {
int prio = ffz(~pmask);
long tmp;
 
pmask &= ~(1<<prio);
 
tmp = cbq_undelay_prio(q, prio);
if (tmp > 0) {
q->pmask |= 1<<prio;
if (tmp < delay || delay == 0)
delay = tmp;
}
}
 
if (delay) {
q->delay_timer.expires = jiffies + delay;
add_timer(&q->delay_timer);
}
 
sch->flags &= ~TCQ_F_THROTTLED;
netif_schedule(sch->dev);
}
 
 
#ifdef CONFIG_NET_CLS_POLICE
 
static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
{
int len = skb->len;
struct Qdisc *sch = child->__parent;
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
struct cbq_class *cl = q->rx_class;
 
q->rx_class = NULL;
 
if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
 
cbq_mark_toplevel(q, cl);
 
q->rx_class = cl;
cl->q->__parent = sch;
 
if (cl->q->enqueue(skb, cl->q) == 0) {
sch->q.qlen++;
sch->stats.packets++;
sch->stats.bytes+=len;
if (!cl->next_alive)
cbq_activate_class(cl);
return 0;
}
sch->stats.drops++;
return 0;
}
 
sch->stats.drops++;
return -1;
}
#endif
 
/*
This is a mission-critical procedure.

We "regenerate" the toplevel cutoff if the transmitting class
has backlog and is not regulated. This is not part of the
original CBQ description, but it looks more reasonable.
Probably it is wrong; this question needs further investigation.
*/
 
static __inline__ void
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
struct cbq_class *borrowed)
{
if (cl && q->toplevel >= borrowed->level) {
if (cl->q->q.qlen > 1) {
do {
if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
q->toplevel = borrowed->level;
return;
}
} while ((borrowed=borrowed->borrow) != NULL);
}
#if 0
/* It is not necessary now. Uncommenting it
will save CPU cycles, but decrease fairness.
*/
q->toplevel = TC_CBQ_MAXLEVEL;
#endif
}
}
 
static void
cbq_update(struct cbq_sched_data *q)
{
struct cbq_class *this = q->tx_class;
struct cbq_class *cl = this;
int len = q->tx_len;
 
q->tx_class = NULL;
 
for ( ; cl; cl = cl->share) {
long avgidle = cl->avgidle;
long idle;
 
cl->stats.packets++;
cl->stats.bytes += len;
 
/*
(now - last) is total time between packet right edges.
(last_pktlen/rate) is "virtual" busy time, so that
 
idle = (now - last) - last_pktlen/rate
*/
 
idle = PSCHED_TDIFF(q->now, cl->last);
if ((unsigned long)idle > 128*1024*1024) {
avgidle = cl->maxidle;
} else {
idle -= L2T(cl, len);
 
/* true_avgidle := (1-W)*true_avgidle + W*idle,
where W=2^{-ewma_log}. But cl->avgidle is scaled:
cl->avgidle == true_avgidle/W,
hence:
*/
avgidle += idle - (avgidle>>cl->ewma_log);
}
 
if (avgidle <= 0) {
/* Overlimit or at-limit */
 
if (avgidle < cl->minidle)
avgidle = cl->minidle;
 
cl->avgidle = avgidle;
 
/* Calculate expected time, when this class
will be allowed to send.
It will occur, when:
(1-W)*true_avgidle + W*delay = 0, i.e.
idle = (1/W - 1)*(-true_avgidle)
or
idle = (1 - W)*(-cl->avgidle);
*/
idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
 
/*
That is not all.
To maintain the rate allocated to the class,
we add to undertime the virtual clock time
necessary to complete the transmitted packet.
(len/phys_bandwidth has already been charged
by the time cbq_update runs.)
*/
 
idle -= L2T(&q->link, len);
idle += L2T(cl, len);
 
PSCHED_AUDIT_TDIFF(idle);
 
PSCHED_TADD2(q->now, idle, cl->undertime);
} else {
/* Underlimit */
 
PSCHED_SET_PASTPERFECT(cl->undertime);
if (avgidle > cl->maxidle)
cl->avgidle = cl->maxidle;
else
cl->avgidle = avgidle;
}
cl->last = q->now;
}
 
cbq_update_toplevel(q, this, q->tx_borrowed);
}
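 
/* The avgidle bookkeeping above is a scaled EWMA: with W = 2^-ewma_log the
true filter is true_avgidle = (1-W)*true_avgidle + W*idle, and storing
avgidle = true_avgidle/W turns the update into
avgidle += idle - (avgidle >> ewma_log). A minimal standalone sketch of that
update, assuming plain long arithmetic and no kernel types, is kept under
#if 0 so it does not affect the build. */
#if 0
/* Scaled EWMA update as used above: avgidle == true_avgidle / W,
with W = 2^-ewma_log. Plain user-space arithmetic, for illustration only. */
static long ewma_scaled_update(long avgidle, long idle, int ewma_log)
{
return avgidle + idle - (avgidle >> ewma_log);
}
 
/* Example: with ewma_log = 3 (W = 1/8), avgidle = 800 and idle = 40,
the update yields 800 + 40 - 100 = 740, i.e. the class has been busier
than its average and avgidle drifts down toward the overlimit region. */
#endif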
 
static __inline__ struct cbq_class *
cbq_under_limit(struct cbq_class *cl)
{
struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
struct cbq_class *this_cl = cl;
 
if (cl->tparent == NULL)
return cl;
 
if (PSCHED_IS_PASTPERFECT(cl->undertime) ||
!PSCHED_TLESS(q->now, cl->undertime)) {
cl->delayed = 0;
return cl;
}
 
do {
/* This is a very suspicious place. The overlimit
action is now generated for unbounded classes
only when the link is completely congested.
Although this agrees with the ancestor-only
paradigm, it looks rather dubious. In particular,
it means that this chunk of code will either
never be called or will strongly amplify
burstiness. Dangerous and silly, but no better
solution exists.
*/
if ((cl = cl->borrow) == NULL) {
this_cl->stats.overlimits++;
this_cl->overlimit(this_cl);
return NULL;
}
if (cl->level > q->toplevel)
return NULL;
} while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
PSCHED_TLESS(q->now, cl->undertime));
 
cl->delayed = 0;
return cl;
}
 
static __inline__ struct sk_buff *
cbq_dequeue_prio(struct Qdisc *sch, int prio)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
struct cbq_class *cl_tail, *cl_prev, *cl;
struct sk_buff *skb;
int deficit;
 
cl_tail = cl_prev = q->active[prio];
cl = cl_prev->next_alive;
 
do {
deficit = 0;
 
/* Start round */
do {
struct cbq_class *borrow = cl;
 
if (cl->q->q.qlen &&
(borrow = cbq_under_limit(cl)) == NULL)
goto skip_class;
 
if (cl->deficit <= 0) {
/* Class exhausted its allotment per
this round. Switch to the next one.
*/
deficit = 1;
cl->deficit += cl->quantum;
goto next_class;
}
 
skb = cl->q->dequeue(cl->q);
 
/* The class did not give us any skb :-(
This can happen even if cl->q->q.qlen != 0,
e.g. if cl->q is a "tbf" qdisc.
*/
if (skb == NULL)
goto skip_class;
 
cl->deficit -= skb->len;
q->tx_class = cl;
q->tx_borrowed = borrow;
if (borrow != cl) {
#ifndef CBQ_XSTATS_BORROWS_BYTES
borrow->xstats.borrows++;
cl->xstats.borrows++;
#else
borrow->xstats.borrows += skb->len;
cl->xstats.borrows += skb->len;
#endif
}
q->tx_len = skb->len;
 
if (cl->deficit <= 0) {
q->active[prio] = cl;
cl = cl->next_alive;
cl->deficit += cl->quantum;
}
return skb;
 
skip_class:
if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
/* Class is empty or penalized.
Unlink it from active chain.
*/
cl_prev->next_alive = cl->next_alive;
cl->next_alive = NULL;
 
/* Did cl_tail point to it? */
if (cl == cl_tail) {
/* Repair it! */
cl_tail = cl_prev;
 
/* Was it the last class in this band? */
if (cl == cl_tail) {
/* Kill the band! */
q->active[prio] = NULL;
q->activemask &= ~(1<<prio);
if (cl->q->q.qlen)
cbq_activate_class(cl);
return NULL;
}
 
q->active[prio] = cl_tail;
}
if (cl->q->q.qlen)
cbq_activate_class(cl);
 
cl = cl_prev;
}
 
next_class:
cl_prev = cl;
cl = cl->next_alive;
} while (cl_prev != cl_tail);
} while (deficit);
 
q->active[prio] = cl_prev;
 
return NULL;
}
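 
/* cbq_dequeue_prio above implements weighted round-robin with a per-class
deficit: a class may send while its deficit is positive, and a class whose
deficit is exhausted is recharged by its quantum and skipped until the next
round. The sketch under #if 0 (build-neutral, hypothetical names, plain
arrays instead of the circular next_alive list) shows just that accounting. */
#if 0
struct drr_class {
int deficit; /* bytes the class may still send this round */
int quantum; /* bytes added when the class is recharged */
};
 
/* Returns the index of the class allowed to send 'len' bytes, or -1 if
every class had to be recharged (i.e. a new round must start). */
static int drr_pick(struct drr_class *cls, int ncls, int len)
{
int i;
for (i = 0; i < ncls; i++) {
if (cls[i].deficit <= 0) {
cls[i].deficit += cls[i].quantum; /* recharge, try the next class */
continue;
}
cls[i].deficit -= len; /* charge the transmitted bytes */
return i;
}
return -1;
}
#endif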
 
static __inline__ struct sk_buff *
cbq_dequeue_1(struct Qdisc *sch)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
struct sk_buff *skb;
unsigned activemask;
 
activemask = q->activemask&0xFF;
while (activemask) {
int prio = ffz(~activemask);
activemask &= ~(1<<prio);
skb = cbq_dequeue_prio(sch, prio);
if (skb)
return skb;
}
return NULL;
}
 
static struct sk_buff *
cbq_dequeue(struct Qdisc *sch)
{
struct sk_buff *skb;
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
psched_time_t now;
psched_tdiff_t incr;
 
PSCHED_GET_TIME(now);
incr = PSCHED_TDIFF(now, q->now_rt);
 
if (q->tx_class) {
psched_tdiff_t incr2;
/* Time integrator. We calculate EOS time
by adding the expected packet transmission time.
If real time is greater, we warp the artificial clock,
so that:
 
cbq_time = max(real_time, work);
*/
incr2 = L2T(&q->link, q->tx_len);
PSCHED_TADD(q->now, incr2);
cbq_update(q);
if ((incr -= incr2) < 0)
incr = 0;
}
PSCHED_TADD(q->now, incr);
q->now_rt = now;
 
for (;;) {
q->wd_expires = 0;
 
skb = cbq_dequeue_1(sch);
if (skb) {
sch->q.qlen--;
sch->flags &= ~TCQ_F_THROTTLED;
return skb;
}
 
/* All the classes are overlimit.
 
This can happen if:
 
1. The scheduler is empty.
2. The toplevel cutoff inhibited borrowing.
3. The root class is overlimit.
 
Reset conditions 2 and 3 and retry.
 
Note that NS and cbq-2.0 are buggy here: peeking at
an arbitrary class is appropriate for ancestor-only
sharing, but not for the toplevel algorithm.
 
Our version is better but slower, because it requires
two passes; that is unavoidable with top-level sharing.
*/
 
if (q->toplevel == TC_CBQ_MAXLEVEL &&
PSCHED_IS_PASTPERFECT(q->link.undertime))
break;
 
q->toplevel = TC_CBQ_MAXLEVEL;
PSCHED_SET_PASTPERFECT(q->link.undertime);
}
 
/* No packets in the scheduler, or nobody wants to give them to us :-(
Sigh... start the watchdog timer in the latter case. */
 
if (sch->q.qlen) {
sch->stats.overlimits++;
if (q->wd_expires && !netif_queue_stopped(sch->dev)) {
long delay = PSCHED_US2JIFFIE(q->wd_expires);
if (delay <= 0)
delay = 1;
mod_timer(&q->wd_timer, jiffies + delay);
sch->flags |= TCQ_F_THROTTLED;
}
}
return NULL;
}
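 
/* The time integrator above advances q->now by whichever is larger: the real
elapsed time or the virtual transmission time of the last packet, so the CBQ
clock never runs ahead of the work actually done. A build-neutral sketch with
plain integer clocks (hypothetical name) is kept under #if 0. */
#if 0
/* Advance a virtual clock so that it tracks max(real time, work done).
'now' is the virtual clock, 'real_elapsed' the wall-clock delta and
'tx_time' the idealized transmission time of the packet just sent. */
static long advance_cbq_clock(long now, long real_elapsed, long tx_time)
{
return now + (real_elapsed > tx_time ? real_elapsed : tx_time);
}
#endif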
 
/* CBQ class maintenance routines */
 
static void cbq_adjust_levels(struct cbq_class *this)
{
if (this == NULL)
return;
 
do {
int level = 0;
struct cbq_class *cl;
 
if ((cl = this->children) != NULL) {
do {
if (cl->level > level)
level = cl->level;
} while ((cl = cl->sibling) != this->children);
}
this->level = level+1;
} while ((this = this->tparent) != NULL);
}
 
static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
{
struct cbq_class *cl;
unsigned h;
 
if (q->quanta[prio] == 0)
return;
 
for (h=0; h<16; h++) {
for (cl = q->classes[h]; cl; cl = cl->next) {
/* BUG: beware! This expression suffers from
arithmetic overflow!
*/
if (cl->priority == prio) {
cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
q->quanta[prio];
}
if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
cl->quantum = cl->qdisc->dev->mtu/2 + 1;
}
}
}
}
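 
/* The warning above concerns the product weight*allot*nclasses overflowing a
long. The sketch under #if 0 shows the same normalization done with a 64-bit
intermediate; names and types are illustrative only and nothing here is used
by the scheduler. */
#if 0
/* quantum = weight * allot * nclasses / quanta, computed with a 64-bit
intermediate so the product cannot wrap for realistic parameters. */
static long quantum_64(unsigned long weight, unsigned long allot,
unsigned long nclasses, unsigned long quanta)
{
unsigned long long num = (unsigned long long)weight * allot * nclasses;
return quanta ? (long)(num / quanta) : 0;
}
#endif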
 
static void cbq_sync_defmap(struct cbq_class *cl)
{
struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
struct cbq_class *split = cl->split;
unsigned h;
int i;
 
if (split == NULL)
return;
 
for (i=0; i<=TC_PRIO_MAX; i++) {
if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
split->defaults[i] = NULL;
}
 
for (i=0; i<=TC_PRIO_MAX; i++) {
int level = split->level;
 
if (split->defaults[i])
continue;
 
for (h=0; h<16; h++) {
struct cbq_class *c;
 
for (c = q->classes[h]; c; c = c->next) {
if (c->split == split && c->level < level &&
c->defmap&(1<<i)) {
split->defaults[i] = c;
level = c->level;
}
}
}
}
}
 
static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
{
struct cbq_class *split = NULL;
 
if (splitid == 0) {
if ((split = cl->split) == NULL)
return;
splitid = split->classid;
}
 
if (split == NULL || split->classid != splitid) {
for (split = cl->tparent; split; split = split->tparent)
if (split->classid == splitid)
break;
}
 
if (split == NULL)
return;
 
if (cl->split != split) {
cl->defmap = 0;
cbq_sync_defmap(cl);
cl->split = split;
cl->defmap = def&mask;
} else
cl->defmap = (cl->defmap&~mask)|(def&mask);
 
cbq_sync_defmap(cl);
}
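 
/* cbq_change_defmap updates only the bits selected by 'mask' using the
classic merge (old & ~mask) | (new & mask). A one-function, build-neutral
sketch with hypothetical names is kept under #if 0. */
#if 0
/* Replace only the bits of 'old' selected by 'mask' with those of 'new_bits'.
E.g. old = 0xF0, new_bits = 0x0F, mask = 0x03 gives 0xF3. */
static unsigned merge_bits(unsigned old, unsigned new_bits, unsigned mask)
{
return (old & ~mask) | (new_bits & mask);
}
#endif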
 
static void cbq_unlink_class(struct cbq_class *this)
{
struct cbq_class *cl, **clp;
struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data;
 
for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) {
if (cl == this) {
*clp = cl->next;
cl->next = NULL;
break;
}
}
 
if (this->tparent) {
clp=&this->sibling;
cl = *clp;
do {
if (cl == this) {
*clp = cl->sibling;
break;
}
clp = &cl->sibling;
} while ((cl = *clp) != this->sibling);
 
if (this->tparent->children == this) {
this->tparent->children = this->sibling;
if (this->sibling == this)
this->tparent->children = NULL;
}
} else {
BUG_TRAP(this->sibling == this);
}
}
 
static void cbq_link_class(struct cbq_class *this)
{
struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data;
unsigned h = cbq_hash(this->classid);
struct cbq_class *parent = this->tparent;
 
this->sibling = this;
this->next = q->classes[h];
q->classes[h] = this;
 
if (parent == NULL)
return;
 
if (parent->children == NULL) {
parent->children = this;
} else {
this->sibling = parent->children->sibling;
parent->children->sibling = this;
}
}
 
static unsigned int cbq_drop(struct Qdisc* sch)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
struct cbq_class *cl, *cl_head;
int prio;
unsigned int len;
 
for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
if ((cl_head = q->active[prio]) == NULL)
continue;
 
cl = cl_head;
do {
if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
sch->q.qlen--;
return len;
}
} while ((cl = cl->next_alive) != cl_head);
}
return 0;
}
 
static void
cbq_reset(struct Qdisc* sch)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
struct cbq_class *cl;
int prio;
unsigned h;
 
q->activemask = 0;
q->pmask = 0;
q->tx_class = NULL;
q->tx_borrowed = NULL;
del_timer(&q->wd_timer);
del_timer(&q->delay_timer);
q->toplevel = TC_CBQ_MAXLEVEL;
PSCHED_GET_TIME(q->now);
q->now_rt = q->now;
 
for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
q->active[prio] = NULL;
 
for (h = 0; h < 16; h++) {
for (cl = q->classes[h]; cl; cl = cl->next) {
qdisc_reset(cl->q);
 
cl->next_alive = NULL;
PSCHED_SET_PASTPERFECT(cl->undertime);
cl->avgidle = cl->maxidle;
cl->deficit = cl->quantum;
cl->cpriority = cl->priority;
}
}
sch->q.qlen = 0;
}
 
 
static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
{
if (lss->change&TCF_CBQ_LSS_FLAGS) {
cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
}
if (lss->change&TCF_CBQ_LSS_EWMA)
cl->ewma_log = lss->ewma_log;
if (lss->change&TCF_CBQ_LSS_AVPKT)
cl->avpkt = lss->avpkt;
if (lss->change&TCF_CBQ_LSS_MINIDLE)
cl->minidle = -(long)lss->minidle;
if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
cl->maxidle = lss->maxidle;
cl->avgidle = lss->maxidle;
}
if (lss->change&TCF_CBQ_LSS_OFFTIME)
cl->offtime = lss->offtime;
return 0;
}
 
static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
q->nclasses[cl->priority]--;
q->quanta[cl->priority] -= cl->weight;
cbq_normalize_quanta(q, cl->priority);
}
 
static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
q->nclasses[cl->priority]++;
q->quanta[cl->priority] += cl->weight;
cbq_normalize_quanta(q, cl->priority);
}
 
static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
 
if (wrr->allot)
cl->allot = wrr->allot;
if (wrr->weight)
cl->weight = wrr->weight;
if (wrr->priority) {
cl->priority = wrr->priority-1;
cl->cpriority = cl->priority;
if (cl->priority >= cl->priority2)
cl->priority2 = TC_CBQ_MAXPRIO-1;
}
 
cbq_addprio(q, cl);
return 0;
}
 
static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
{
switch (ovl->strategy) {
case TC_CBQ_OVL_CLASSIC:
cl->overlimit = cbq_ovl_classic;
break;
case TC_CBQ_OVL_DELAY:
cl->overlimit = cbq_ovl_delay;
break;
case TC_CBQ_OVL_LOWPRIO:
if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
ovl->priority2-1 <= cl->priority)
return -EINVAL;
cl->priority2 = ovl->priority2-1;
cl->overlimit = cbq_ovl_lowprio;
break;
case TC_CBQ_OVL_DROP:
cl->overlimit = cbq_ovl_drop;
break;
case TC_CBQ_OVL_RCLASSIC:
cl->overlimit = cbq_ovl_rclassic;
break;
default:
return -EINVAL;
}
cl->penalty = (ovl->penalty*HZ)/1000;
return 0;
}
 
#ifdef CONFIG_NET_CLS_POLICE
static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
{
cl->police = p->police;
 
if (cl->q->handle) {
if (p->police == TC_POLICE_RECLASSIFY)
cl->q->reshape_fail = cbq_reshape_fail;
else
cl->q->reshape_fail = NULL;
}
return 0;
}
#endif
 
static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
{
cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
return 0;
}
 
static int cbq_init(struct Qdisc *sch, struct rtattr *opt)
{
struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
struct rtattr *tb[TCA_CBQ_MAX];
struct tc_ratespec *r;
 
if (rtattr_parse(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0 ||
tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
return -EINVAL;
 
if (tb[TCA_CBQ_LSSOPT-1] &&
RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
return -EINVAL;
 
r = RTA_DATA(tb[TCA_CBQ_RATE-1]);
 
MOD_INC_USE_COUNT;
if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL) {
MOD_DEC_USE_COUNT;
return -EINVAL;
}
 
q->link.refcnt = 1;
q->link.sibling = &q->link;
q->link.classid = sch->handle;
q->link.qdisc = sch;
if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
q->link.q = &noop_qdisc;
 
q->link.priority = TC_CBQ_MAXPRIO-1;
q->link.priority2 = TC_CBQ_MAXPRIO-1;
q->link.cpriority = TC_CBQ_MAXPRIO-1;
q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
q->link.overlimit = cbq_ovl_classic;
q->link.allot = psched_mtu(sch->dev);
q->link.quantum = q->link.allot;
q->link.weight = q->link.R_tab->rate.rate;
 
q->link.ewma_log = TC_CBQ_DEF_EWMA;
q->link.avpkt = q->link.allot/2;
q->link.minidle = -0x7FFFFFFF;
q->link.stats.lock = &sch->dev->queue_lock;
 
init_timer(&q->wd_timer);
q->wd_timer.data = (unsigned long)sch;
q->wd_timer.function = cbq_watchdog;
init_timer(&q->delay_timer);
q->delay_timer.data = (unsigned long)sch;
q->delay_timer.function = cbq_undelay;
q->toplevel = TC_CBQ_MAXLEVEL;
PSCHED_GET_TIME(q->now);
q->now_rt = q->now;
 
cbq_link_class(&q->link);
 
if (tb[TCA_CBQ_LSSOPT-1])
cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
 
cbq_addprio(q, &q->link);
return 0;
}
 
static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb->tail;
 
RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb->tail;
struct tc_cbq_lssopt opt;
 
opt.flags = 0;
if (cl->borrow == NULL)
opt.flags |= TCF_CBQ_LSS_BOUNDED;
if (cl->share == NULL)
opt.flags |= TCF_CBQ_LSS_ISOLATED;
opt.ewma_log = cl->ewma_log;
opt.level = cl->level;
opt.avpkt = cl->avpkt;
opt.maxidle = cl->maxidle;
opt.minidle = (u32)(-cl->minidle);
opt.offtime = cl->offtime;
opt.change = ~0;
RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb->tail;
struct tc_cbq_wrropt opt;
 
opt.flags = 0;
opt.allot = cl->allot;
opt.priority = cl->priority+1;
opt.cpriority = cl->cpriority+1;
opt.weight = cl->weight;
RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb->tail;
struct tc_cbq_ovl opt;
 
opt.strategy = cl->ovl_strategy;
opt.priority2 = cl->priority2+1;
opt.penalty = (cl->penalty*1000)/HZ;
RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb->tail;
struct tc_cbq_fopt opt;
 
if (cl->split || cl->defmap) {
opt.split = cl->split ? cl->split->classid : 0;
opt.defmap = cl->defmap;
opt.defchange = ~0;
RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
}
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
#ifdef CONFIG_NET_CLS_POLICE
static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb->tail;
struct tc_cbq_police opt;
 
if (cl->police) {
opt.police = cl->police;
RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
}
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
#endif
 
static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
{
if (cbq_dump_lss(skb, cl) < 0 ||
cbq_dump_rate(skb, cl) < 0 ||
cbq_dump_wrr(skb, cl) < 0 ||
cbq_dump_ovl(skb, cl) < 0 ||
#ifdef CONFIG_NET_CLS_POLICE
cbq_dump_police(skb, cl) < 0 ||
#endif
cbq_dump_fopt(skb, cl) < 0)
return -1;
return 0;
}
 
int cbq_copy_xstats(struct sk_buff *skb, struct tc_cbq_xstats *st)
{
RTA_PUT(skb, TCA_XSTATS, sizeof(*st), st);
return 0;
 
rtattr_failure:
return -1;
}
 
 
static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
unsigned char *b = skb->tail;
struct rtattr *rta;
 
rta = (struct rtattr*)b;
RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
if (cbq_dump_attr(skb, &q->link) < 0)
goto rtattr_failure;
rta->rta_len = skb->tail - b;
spin_lock_bh(&sch->dev->queue_lock);
q->link.xstats.avgidle = q->link.avgidle;
if (cbq_copy_xstats(skb, &q->link.xstats)) {
spin_unlock_bh(&sch->dev->queue_lock);
goto rtattr_failure;
}
spin_unlock_bh(&sch->dev->queue_lock);
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
struct cbq_class *cl = (struct cbq_class*)arg;
unsigned char *b = skb->tail;
struct rtattr *rta;
 
if (cl->tparent)
tcm->tcm_parent = cl->tparent->classid;
else
tcm->tcm_parent = TC_H_ROOT;
tcm->tcm_handle = cl->classid;
tcm->tcm_info = cl->q->handle;
 
rta = (struct rtattr*)b;
RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
if (cbq_dump_attr(skb, cl) < 0)
goto rtattr_failure;
rta->rta_len = skb->tail - b;
cl->stats.qlen = cl->q->q.qlen;
if (qdisc_copy_stats(skb, &cl->stats))
goto rtattr_failure;
spin_lock_bh(&sch->dev->queue_lock);
cl->xstats.avgidle = cl->avgidle;
cl->xstats.undertime = 0;
if (!PSCHED_IS_PASTPERFECT(cl->undertime))
cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now);
q->link.xstats.avgidle = q->link.avgidle;
if (cbq_copy_xstats(skb, &cl->xstats)) {
spin_unlock_bh(&sch->dev->queue_lock);
goto rtattr_failure;
}
spin_unlock_bh(&sch->dev->queue_lock);
 
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct Qdisc **old)
{
struct cbq_class *cl = (struct cbq_class*)arg;
 
if (cl) {
if (new == NULL) {
if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)) == NULL)
return -ENOBUFS;
} else {
#ifdef CONFIG_NET_CLS_POLICE
if (cl->police == TC_POLICE_RECLASSIFY)
new->reshape_fail = cbq_reshape_fail;
#endif
}
sch_tree_lock(sch);
*old = cl->q;
cl->q = new;
sch->q.qlen -= (*old)->q.qlen;
qdisc_reset(*old);
sch_tree_unlock(sch);
 
return 0;
}
return -ENOENT;
}
 
static struct Qdisc *
cbq_leaf(struct Qdisc *sch, unsigned long arg)
{
struct cbq_class *cl = (struct cbq_class*)arg;
 
return cl ? cl->q : NULL;
}
 
static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
struct cbq_class *cl = cbq_class_lookup(q, classid);
 
if (cl) {
cl->refcnt++;
return (unsigned long)cl;
}
return 0;
}
 
static void cbq_destroy_filters(struct cbq_class *cl)
{
struct tcf_proto *tp;
 
while ((tp = cl->filter_list) != NULL) {
cl->filter_list = tp->next;
tcf_destroy(tp);
}
}
 
static void cbq_destroy_class(struct cbq_class *cl)
{
cbq_destroy_filters(cl);
qdisc_destroy(cl->q);
qdisc_put_rtab(cl->R_tab);
#ifdef CONFIG_NET_ESTIMATOR
qdisc_kill_estimator(&cl->stats);
#endif
kfree(cl);
}
 
static void
cbq_destroy(struct Qdisc* sch)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
struct cbq_class *cl;
unsigned h;
 
#ifdef CONFIG_NET_CLS_POLICE
q->rx_class = NULL;
#endif
for (h = 0; h < 16; h++) {
for (cl = q->classes[h]; cl; cl = cl->next)
cbq_destroy_filters(cl);
}
 
for (h = 0; h < 16; h++) {
struct cbq_class *next;
 
for (cl = q->classes[h]; cl; cl = next) {
next = cl->next;
if (cl != &q->link)
cbq_destroy_class(cl);
}
}
 
qdisc_put_rtab(q->link.R_tab);
MOD_DEC_USE_COUNT;
}
 
static void cbq_put(struct Qdisc *sch, unsigned long arg)
{
struct cbq_class *cl = (struct cbq_class*)arg;
 
if (--cl->refcnt == 0) {
#ifdef CONFIG_NET_CLS_POLICE
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
 
spin_lock_bh(&sch->dev->queue_lock);
if (q->rx_class == cl)
q->rx_class = NULL;
spin_unlock_bh(&sch->dev->queue_lock);
#endif
 
cbq_destroy_class(cl);
}
}
 
static int
cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
unsigned long *arg)
{
int err;
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
struct cbq_class *cl = (struct cbq_class*)*arg;
struct rtattr *opt = tca[TCA_OPTIONS-1];
struct rtattr *tb[TCA_CBQ_MAX];
struct cbq_class *parent;
struct qdisc_rate_table *rtab = NULL;
 
if (opt==NULL ||
rtattr_parse(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)))
return -EINVAL;
 
if (tb[TCA_CBQ_OVL_STRATEGY-1] &&
RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl))
return -EINVAL;
 
if (tb[TCA_CBQ_FOPT-1] &&
RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt))
return -EINVAL;
 
if (tb[TCA_CBQ_RATE-1] &&
RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
return -EINVAL;
 
if (tb[TCA_CBQ_LSSOPT-1] &&
RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
return -EINVAL;
 
if (tb[TCA_CBQ_WRROPT-1] &&
RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt))
return -EINVAL;
 
#ifdef CONFIG_NET_CLS_POLICE
if (tb[TCA_CBQ_POLICE-1] &&
RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police))
return -EINVAL;
#endif
 
if (cl) {
/* Check parent */
if (parentid) {
if (cl->tparent && cl->tparent->classid != parentid)
return -EINVAL;
if (!cl->tparent && parentid != TC_H_ROOT)
return -EINVAL;
}
 
if (tb[TCA_CBQ_RATE-1]) {
rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
if (rtab == NULL)
return -EINVAL;
}
 
/* Change class parameters */
sch_tree_lock(sch);
 
if (cl->next_alive != NULL)
cbq_deactivate_class(cl);
 
if (rtab) {
rtab = xchg(&cl->R_tab, rtab);
qdisc_put_rtab(rtab);
}
 
if (tb[TCA_CBQ_LSSOPT-1])
cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
 
if (tb[TCA_CBQ_WRROPT-1]) {
cbq_rmprio(q, cl);
cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
}
 
if (tb[TCA_CBQ_OVL_STRATEGY-1])
cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
 
#ifdef CONFIG_NET_CLS_POLICE
if (tb[TCA_CBQ_POLICE-1])
cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
#endif
 
if (tb[TCA_CBQ_FOPT-1])
cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
 
if (cl->q->q.qlen)
cbq_activate_class(cl);
 
sch_tree_unlock(sch);
 
#ifdef CONFIG_NET_ESTIMATOR
if (tca[TCA_RATE-1]) {
qdisc_kill_estimator(&cl->stats);
qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]);
}
#endif
return 0;
}
 
if (parentid == TC_H_ROOT)
return -EINVAL;
 
if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
tb[TCA_CBQ_LSSOPT-1] == NULL)
return -EINVAL;
 
rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
if (rtab == NULL)
return -EINVAL;
 
if (classid) {
err = -EINVAL;
if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
goto failure;
} else {
int i;
classid = TC_H_MAKE(sch->handle,0x8000);
 
for (i=0; i<0x8000; i++) {
if (++q->hgenerator >= 0x8000)
q->hgenerator = 1;
if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
break;
}
err = -ENOSR;
if (i >= 0x8000)
goto failure;
classid = classid|q->hgenerator;
}
 
parent = &q->link;
if (parentid) {
parent = cbq_class_lookup(q, parentid);
err = -EINVAL;
if (parent == NULL)
goto failure;
}
 
err = -ENOBUFS;
cl = kmalloc(sizeof(*cl), GFP_KERNEL);
if (cl == NULL)
goto failure;
memset(cl, 0, sizeof(*cl));
cl->R_tab = rtab;
rtab = NULL;
cl->refcnt = 1;
if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
cl->q = &noop_qdisc;
cl->classid = classid;
cl->tparent = parent;
cl->qdisc = sch;
cl->allot = parent->allot;
cl->quantum = cl->allot;
cl->weight = cl->R_tab->rate.rate;
cl->stats.lock = &sch->dev->queue_lock;
 
sch_tree_lock(sch);
cbq_link_class(cl);
cl->borrow = cl->tparent;
if (cl->tparent != &q->link)
cl->share = cl->tparent;
cbq_adjust_levels(parent);
cl->minidle = -0x7FFFFFFF;
cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
if (cl->ewma_log==0)
cl->ewma_log = q->link.ewma_log;
if (cl->maxidle==0)
cl->maxidle = q->link.maxidle;
if (cl->avpkt==0)
cl->avpkt = q->link.avpkt;
cl->overlimit = cbq_ovl_classic;
if (tb[TCA_CBQ_OVL_STRATEGY-1])
cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
#ifdef CONFIG_NET_CLS_POLICE
if (tb[TCA_CBQ_POLICE-1])
cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
#endif
if (tb[TCA_CBQ_FOPT-1])
cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
sch_tree_unlock(sch);
 
#ifdef CONFIG_NET_ESTIMATOR
if (tca[TCA_RATE-1])
qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]);
#endif
 
*arg = (unsigned long)cl;
return 0;
 
failure:
qdisc_put_rtab(rtab);
return err;
}
 
static int cbq_delete(struct Qdisc *sch, unsigned long arg)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
struct cbq_class *cl = (struct cbq_class*)arg;
 
if (cl->filters || cl->children || cl == &q->link)
return -EBUSY;
 
sch_tree_lock(sch);
 
if (cl->next_alive)
cbq_deactivate_class(cl);
 
if (q->tx_borrowed == cl)
q->tx_borrowed = q->tx_class;
if (q->tx_class == cl) {
q->tx_class = NULL;
q->tx_borrowed = NULL;
}
#ifdef CONFIG_NET_CLS_POLICE
if (q->rx_class == cl)
q->rx_class = NULL;
#endif
 
cbq_unlink_class(cl);
cbq_adjust_levels(cl->tparent);
cl->defmap = 0;
cbq_sync_defmap(cl);
 
cbq_rmprio(q, cl);
sch_tree_unlock(sch);
 
if (--cl->refcnt == 0)
cbq_destroy_class(cl);
 
return 0;
}
 
static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
struct cbq_class *cl = (struct cbq_class *)arg;
 
if (cl == NULL)
cl = &q->link;
 
return &cl->filter_list;
}
 
static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
struct cbq_class *p = (struct cbq_class*)parent;
struct cbq_class *cl = cbq_class_lookup(q, classid);
 
if (cl) {
if (p && p->level <= cl->level)
return 0;
cl->filters++;
return (unsigned long)cl;
}
return 0;
}
 
static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
struct cbq_class *cl = (struct cbq_class*)arg;
 
cl->filters--;
}
 
static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
unsigned h;
 
if (arg->stop)
return;
 
for (h = 0; h < 16; h++) {
struct cbq_class *cl;
 
for (cl = q->classes[h]; cl; cl = cl->next) {
if (arg->count < arg->skip) {
arg->count++;
continue;
}
if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
arg->stop = 1;
return;
}
arg->count++;
}
}
}
 
static struct Qdisc_class_ops cbq_class_ops =
{
cbq_graft,
cbq_leaf,
cbq_get,
cbq_put,
cbq_change_class,
cbq_delete,
cbq_walk,
 
cbq_find_tcf,
cbq_bind_filter,
cbq_unbind_filter,
 
cbq_dump_class,
};
 
struct Qdisc_ops cbq_qdisc_ops =
{
NULL,
&cbq_class_ops,
"cbq",
sizeof(struct cbq_sched_data),
 
cbq_enqueue,
cbq_dequeue,
cbq_requeue,
cbq_drop,
 
cbq_init,
cbq_reset,
cbq_destroy,
NULL /* cbq_change */,
 
cbq_dump,
};
 
#ifdef MODULE
int init_module(void)
{
return register_qdisc(&cbq_qdisc_ops);
}
 
void cleanup_module(void)
{
unregister_qdisc(&cbq_qdisc_ops);
}
#endif
MODULE_LICENSE("GPL");
/sch_teql.c
0,0 → 1,496
/* net/sched/sch_teql.c "True" (or "trivial") link equalizer.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
 
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <linux/init.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
 
/*
How to set it up.
-----------------
 
After loading this module you will find a new device teqlN
and a new qdisc with the same name. To join a slave to the equalizer
you just attach this qdisc to a device, e.g.
 
# tc qdisc add dev eth0 root teql0
# tc qdisc add dev eth1 root teql0
 
That's all. Full PnP 8)
 
Applicability.
--------------
 
1. Slave devices MUST be active devices, i.e., they must raise the tbusy
signal and generate EOI events. If you want to equalize virtual devices
like tunnels, use a normal eql device.
2. This device puts no limitations on physical slave characteristics,
e.g. it will equalize a 9600 baud line and 100Mb ethernet perfectly :-)
Certainly, a large difference in link speeds will make the resulting
equalized link unusable, because of heavy packet reordering.
I estimate the upper useful difference at ~10 times.
3. If the slave requires address resolution, only protocols using the
neighbour cache (IPv4/IPv6) will work over the equalized link.
Other protocols are still allowed to use the slave device directly,
which will not break load balancing, though native slave
traffic will have the highest priority. */
 
struct teql_master
{
struct Qdisc_ops qops;
struct net_device dev;
struct Qdisc *slaves;
struct net_device_stats stats;
};
 
struct teql_sched_data
{
struct Qdisc *next;
struct teql_master *m;
struct neighbour *ncache;
struct sk_buff_head q;
};
 
#define NEXT_SLAVE(q) (((struct teql_sched_data*)((q)->data))->next)
 
#define FMASK (IFF_BROADCAST|IFF_POINTOPOINT)
 
/* "teql*" qdisc routines */
 
static int
teql_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
struct net_device *dev = sch->dev;
struct teql_sched_data *q = (struct teql_sched_data *)sch->data;
 
__skb_queue_tail(&q->q, skb);
if (q->q.qlen <= dev->tx_queue_len) {
sch->stats.bytes += skb->len;
sch->stats.packets++;
return 0;
}
 
__skb_unlink(skb, &q->q);
kfree_skb(skb);
sch->stats.drops++;
return NET_XMIT_DROP;
}
 
static int
teql_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
struct teql_sched_data *q = (struct teql_sched_data *)sch->data;
 
__skb_queue_head(&q->q, skb);
return 0;
}
 
static struct sk_buff *
teql_dequeue(struct Qdisc* sch)
{
struct teql_sched_data *dat = (struct teql_sched_data *)sch->data;
struct sk_buff *skb;
 
skb = __skb_dequeue(&dat->q);
if (skb == NULL) {
struct net_device *m = dat->m->dev.qdisc->dev;
if (m) {
dat->m->slaves = sch;
netif_wake_queue(m);
}
}
sch->q.qlen = dat->q.qlen + dat->m->dev.qdisc->q.qlen;
return skb;
}
 
static __inline__ void
teql_neigh_release(struct neighbour *n)
{
if (n)
neigh_release(n);
}
 
static void
teql_reset(struct Qdisc* sch)
{
struct teql_sched_data *dat = (struct teql_sched_data *)sch->data;
 
skb_queue_purge(&dat->q);
sch->q.qlen = 0;
teql_neigh_release(xchg(&dat->ncache, NULL));
}
 
static void
teql_destroy(struct Qdisc* sch)
{
struct Qdisc *q, *prev;
struct teql_sched_data *dat = (struct teql_sched_data *)sch->data;
struct teql_master *master = dat->m;
 
if ((prev = master->slaves) != NULL) {
do {
q = NEXT_SLAVE(prev);
if (q == sch) {
NEXT_SLAVE(prev) = NEXT_SLAVE(q);
if (q == master->slaves) {
master->slaves = NEXT_SLAVE(q);
if (q == master->slaves) {
master->slaves = NULL;
spin_lock_bh(&master->dev.queue_lock);
qdisc_reset(master->dev.qdisc);
spin_unlock_bh(&master->dev.queue_lock);
}
}
skb_queue_purge(&dat->q);
teql_neigh_release(xchg(&dat->ncache, NULL));
break;
}
} while ((prev = q) != master->slaves);
}
 
MOD_DEC_USE_COUNT;
}
 
static int teql_qdisc_init(struct Qdisc *sch, struct rtattr *opt)
{
struct net_device *dev = sch->dev;
struct teql_master *m = (struct teql_master*)sch->ops;
struct teql_sched_data *q = (struct teql_sched_data *)sch->data;
 
if (dev->hard_header_len > m->dev.hard_header_len)
return -EINVAL;
 
if (&m->dev == dev)
return -ELOOP;
 
q->m = m;
 
skb_queue_head_init(&q->q);
 
if (m->slaves) {
if (m->dev.flags & IFF_UP) {
if ((m->dev.flags&IFF_POINTOPOINT && !(dev->flags&IFF_POINTOPOINT))
|| (m->dev.flags&IFF_BROADCAST && !(dev->flags&IFF_BROADCAST))
|| (m->dev.flags&IFF_MULTICAST && !(dev->flags&IFF_MULTICAST))
|| dev->mtu < m->dev.mtu)
return -EINVAL;
} else {
if (!(dev->flags&IFF_POINTOPOINT))
m->dev.flags &= ~IFF_POINTOPOINT;
if (!(dev->flags&IFF_BROADCAST))
m->dev.flags &= ~IFF_BROADCAST;
if (!(dev->flags&IFF_MULTICAST))
m->dev.flags &= ~IFF_MULTICAST;
if (dev->mtu < m->dev.mtu)
m->dev.mtu = dev->mtu;
}
q->next = NEXT_SLAVE(m->slaves);
NEXT_SLAVE(m->slaves) = sch;
} else {
q->next = sch;
m->slaves = sch;
m->dev.mtu = dev->mtu;
m->dev.flags = (m->dev.flags&~FMASK)|(dev->flags&FMASK);
}
MOD_INC_USE_COUNT;
return 0;
}
 
/* "teql*" netdevice routines */
 
static int
__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev)
{
struct teql_sched_data *q = (void*)dev->qdisc->data;
struct neighbour *mn = skb->dst->neighbour;
struct neighbour *n = q->ncache;
 
if (mn->tbl == NULL)
return -EINVAL;
if (n && n->tbl == mn->tbl &&
memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) {
atomic_inc(&n->refcnt);
} else {
n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev);
if (IS_ERR(n))
return PTR_ERR(n);
}
if (neigh_event_send(n, skb_res) == 0) {
int err;
read_lock(&n->lock);
err = dev->hard_header(skb, dev, ntohs(skb->protocol), n->ha, NULL, skb->len);
read_unlock(&n->lock);
if (err < 0) {
neigh_release(n);
return -EINVAL;
}
teql_neigh_release(xchg(&q->ncache, n));
return 0;
}
neigh_release(n);
return (skb_res == NULL) ? -EAGAIN : 1;
}
 
static __inline__ int
teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev)
{
if (dev->hard_header == NULL ||
skb->dst == NULL ||
skb->dst->neighbour == NULL)
return 0;
return __teql_resolve(skb, skb_res, dev);
}
 
static int teql_master_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct teql_master *master = (void*)dev->priv;
struct Qdisc *start, *q;
int busy;
int nores;
int len = skb->len;
struct sk_buff *skb_res = NULL;
 
start = master->slaves;
 
restart:
nores = 0;
busy = 0;
 
if ((q = start) == NULL)
goto drop;
 
do {
struct net_device *slave = q->dev;
if (slave->qdisc_sleeping != q)
continue;
if (netif_queue_stopped(slave) || ! netif_running(slave)) {
busy = 1;
continue;
}
 
switch (teql_resolve(skb, skb_res, slave)) {
case 0:
if (spin_trylock(&slave->xmit_lock)) {
slave->xmit_lock_owner = smp_processor_id();
if (!netif_queue_stopped(slave) &&
slave->hard_start_xmit(skb, slave) == 0) {
slave->xmit_lock_owner = -1;
spin_unlock(&slave->xmit_lock);
master->slaves = NEXT_SLAVE(q);
netif_wake_queue(dev);
master->stats.tx_packets++;
master->stats.tx_bytes += len;
return 0;
}
slave->xmit_lock_owner = -1;
spin_unlock(&slave->xmit_lock);
}
if (netif_queue_stopped(dev))
busy = 1;
break;
case 1:
master->slaves = NEXT_SLAVE(q);
return 0;
default:
nores = 1;
break;
}
__skb_pull(skb, skb->nh.raw - skb->data);
} while ((q = NEXT_SLAVE(q)) != start);
 
if (nores && skb_res == NULL) {
skb_res = skb;
goto restart;
}
 
if (busy) {
netif_stop_queue(dev);
return 1;
}
master->stats.tx_errors++;
 
drop:
master->stats.tx_dropped++;
dev_kfree_skb(skb);
return 0;
}
 
static int teql_master_open(struct net_device *dev)
{
struct Qdisc * q;
struct teql_master *m = (void*)dev->priv;
int mtu = 0xFFFE;
unsigned flags = IFF_NOARP|IFF_MULTICAST;
 
if (m->slaves == NULL)
return -EUNATCH;
 
flags = FMASK;
 
q = m->slaves;
do {
struct net_device *slave = q->dev;
 
if (slave == NULL)
return -EUNATCH;
 
if (slave->mtu < mtu)
mtu = slave->mtu;
if (slave->hard_header_len > LL_MAX_HEADER)
return -EINVAL;
 
/* If all the slaves are BROADCAST, master is BROADCAST
If all the slaves are PtP, master is PtP
Otherwise, master is NBMA.
*/
if (!(slave->flags&IFF_POINTOPOINT))
flags &= ~IFF_POINTOPOINT;
if (!(slave->flags&IFF_BROADCAST))
flags &= ~IFF_BROADCAST;
if (!(slave->flags&IFF_MULTICAST))
flags &= ~IFF_MULTICAST;
} while ((q = NEXT_SLAVE(q)) != m->slaves);
 
m->dev.mtu = mtu;
m->dev.flags = (m->dev.flags&~FMASK) | flags;
netif_start_queue(&m->dev);
MOD_INC_USE_COUNT;
return 0;
}
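 
/* teql_master_open derives the master's properties as the intersection of
its slaves: the MTU is the minimum slave MTU, and a capability flag survives
only if every slave has it. A build-neutral sketch over a plain array of
(mtu, flags) pairs is kept under #if 0; the names are illustrative only and
the function assumes n >= 1. */
#if 0
struct slave_caps {
int mtu;
unsigned flags;
};
 
/* Combine slave capabilities: minimum MTU, AND of the flag sets. */
static struct slave_caps combine_slaves(const struct slave_caps *s, int n)
{
struct slave_caps m;
int i;
m.mtu = s[0].mtu;
m.flags = s[0].flags;
for (i = 1; i < n; i++) {
if (s[i].mtu < m.mtu)
m.mtu = s[i].mtu;
m.flags &= s[i].flags; /* a flag survives only if all slaves have it */
}
return m;
}
#endif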
 
static int teql_master_close(struct net_device *dev)
{
netif_stop_queue(dev);
MOD_DEC_USE_COUNT;
return 0;
}
 
static struct net_device_stats *teql_master_stats(struct net_device *dev)
{
struct teql_master *m = (void*)dev->priv;
return &m->stats;
}
 
static int teql_master_mtu(struct net_device *dev, int new_mtu)
{
struct teql_master *m = (void*)dev->priv;
struct Qdisc *q;
 
if (new_mtu < 68)
return -EINVAL;
 
q = m->slaves;
if (q) {
do {
if (new_mtu > q->dev->mtu)
return -EINVAL;
} while ((q=NEXT_SLAVE(q)) != m->slaves);
}
 
dev->mtu = new_mtu;
return 0;
}
 
static int teql_master_init(struct net_device *dev)
{
dev->open = teql_master_open;
dev->hard_start_xmit = teql_master_xmit;
dev->stop = teql_master_close;
dev->get_stats = teql_master_stats;
dev->change_mtu = teql_master_mtu;
dev->type = ARPHRD_VOID;
dev->mtu = 1500;
dev->tx_queue_len = 100;
dev->flags = IFF_NOARP;
dev->hard_header_len = LL_MAX_HEADER;
return 0;
}
 
static struct teql_master the_master = {
{
NULL,
NULL,
"",
sizeof(struct teql_sched_data),
 
teql_enqueue,
teql_dequeue,
teql_requeue,
NULL,
 
teql_qdisc_init,
teql_reset,
teql_destroy,
NULL,
},};
 
 
#ifdef MODULE
int init_module(void)
#else
int __init teql_init(void)
#endif
{
int err;
 
rtnl_lock();
 
the_master.dev.priv = (void*)&the_master;
err = dev_alloc_name(&the_master.dev, "teql%d");
if (err < 0) {
rtnl_unlock();
return err;
}
memcpy(the_master.qops.id, the_master.dev.name, IFNAMSIZ);
the_master.dev.init = teql_master_init;
 
err = register_netdevice(&the_master.dev);
if (err == 0) {
err = register_qdisc(&the_master.qops);
if (err)
unregister_netdevice(&the_master.dev);
}
rtnl_unlock();
return err;
}
 
#ifdef MODULE
void cleanup_module(void)
{
rtnl_lock();
unregister_qdisc(&the_master.qops);
unregister_netdevice(&the_master.dev);
rtnl_unlock();
}
#endif
MODULE_LICENSE("GPL");
/sch_api.c
0,0 → 1,1256
/*
* net/sched/sch_api.c Packet scheduler API.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Fixes:
*
* Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
* Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
* Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
*/
 
#include <linux/config.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/kmod.h>
 
#include <net/sock.h>
#include <net/pkt_sched.h>
 
#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
 
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
struct Qdisc *q, unsigned long cl, int event);
 
/*
 
Short review.
-------------
 
This file consists of two interrelated parts:
 
1. The queueing discipline manager frontend.
2. The traffic class manager frontend.
 
Generally, a queueing discipline ("qdisc") is a black box
that is able to enqueue packets and to dequeue them (when
the device is ready to send something) in an order and at times
determined by the algorithm hidden inside it.
 
qdiscs are divided into two categories:
- "queues", which have no internal structure visible from outside.
- "schedulers", which split all the packets into "traffic classes",
using "packet classifiers" (see cls_api.c)
 
In turn, classes may have child qdiscs (as a rule, queues)
attached to them etc. etc. etc.
 
The goal of the routines in this file is to translate
the information supplied by the user in the form of handles
into a form more intelligible to the kernel, to perform some sanity
checks and the part of the work that is common to all qdiscs,
and to provide rtnetlink notifications.
 
All the real intelligent work is done inside the qdisc modules.
 
 
 
Every discipline has two major routines: enqueue and dequeue.
 
---dequeue
 
dequeue usually returns an skb to send. It is allowed to return NULL,
but that does not mean the queue is empty; it just means that the
discipline does not want to send anything this time.
The queue is really empty only if q->q.qlen == 0.
For complicated disciplines with multiple queues q->q is not
a real packet queue, but q->q.qlen must nevertheless be valid.
 
---enqueue
 
enqueue returns 0 if the packet was enqueued successfully.
If a packet (this one or another one) was dropped, it returns
a non-zero error code:
NET_XMIT_DROP - this packet was dropped
Expected action: do not back off, but wait until the queue clears.
NET_XMIT_CN - this packet was probably enqueued, but another one was dropped.
Expected action: back off or ignore
NET_XMIT_POLICED - dropped by police.
Expected action: back off, or report an error to real-time apps.
 
Auxiliary routines:
 
---requeue
 
requeues a previously dequeued packet. It is used for non-standard or
just buggy devices, which can defer output even if dev->tbusy=0.
 
---reset
 
returns the qdisc to its initial state: purges all buffers, clears all
timers, counters (except for statistics) etc.
 
---init
 
initializes a newly created qdisc.
 
---destroy
 
destroys resources allocated by init and during the lifetime of the qdisc.
 
---change
 
changes qdisc parameters.
*/
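 
/* As a rough illustration of the enqueue contract described above, the
sketch under #if 0 shows how a caller might map the return codes to the
expected actions. The constants are assumed to be the NET_XMIT_* values and
the handler is hypothetical; it is not part of the scheduler API. */
#if 0
static void handle_enqueue_result(int ret)
{
switch (ret) {
case 0:
/* enqueued successfully, nothing to do */
break;
case NET_XMIT_DROP:
/* this packet was dropped: do not back off, wait for the queue to clear */
break;
case NET_XMIT_CN:
/* congestion: another packet may have been dropped; back off or ignore */
break;
case NET_XMIT_POLICED:
/* dropped by policing: back off, or report an error to real-time apps */
break;
}
}
#endif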
 
/* Protects list of registered TC modules. It is pure SMP lock. */
static rwlock_t qdisc_mod_lock = RW_LOCK_UNLOCKED;
 
 
/************************************************
* Queueing disciplines manipulation. *
************************************************/
 
 
/* The list of all installed queueing disciplines. */
 
static struct Qdisc_ops *qdisc_base = NULL;
 
/* Register/unregister queueing discipline */
 
int register_qdisc(struct Qdisc_ops *qops)
{
struct Qdisc_ops *q, **qp;
 
write_lock(&qdisc_mod_lock);
for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) {
if (strcmp(qops->id, q->id) == 0) {
write_unlock(&qdisc_mod_lock);
return -EEXIST;
}
}
 
if (qops->enqueue == NULL)
qops->enqueue = noop_qdisc_ops.enqueue;
if (qops->requeue == NULL)
qops->requeue = noop_qdisc_ops.requeue;
if (qops->dequeue == NULL)
qops->dequeue = noop_qdisc_ops.dequeue;
 
qops->next = NULL;
*qp = qops;
write_unlock(&qdisc_mod_lock);
return 0;
}
 
int unregister_qdisc(struct Qdisc_ops *qops)
{
struct Qdisc_ops *q, **qp;
int err = -ENOENT;
 
write_lock(&qdisc_mod_lock);
for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
if (q == qops)
break;
if (q) {
*qp = q->next;
q->next = NULL;
err = 0;
}
write_unlock(&qdisc_mod_lock);
return err;
}
 
/* We know the handle. Find the qdisc among all qdiscs attached to the device
(root qdisc, all its children, children of children etc.)
*/
 
struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
struct Qdisc *q;
 
for (q = dev->qdisc_list; q; q = q->next) {
if (q->handle == handle)
return q;
}
return NULL;
}
 
struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
unsigned long cl;
struct Qdisc *leaf;
struct Qdisc_class_ops *cops = p->ops->cl_ops;
 
if (cops == NULL)
return NULL;
cl = cops->get(p, classid);
 
if (cl == 0)
return NULL;
leaf = cops->leaf(p, cl);
cops->put(p, cl);
return leaf;
}
 
/* Find queueing discipline by name */
 
struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
{
struct Qdisc_ops *q = NULL;
 
if (kind) {
read_lock(&qdisc_mod_lock);
for (q = qdisc_base; q; q = q->next) {
if (rtattr_strcmp(kind, q->id) == 0)
break;
}
read_unlock(&qdisc_mod_lock);
}
return q;
}
 
static struct qdisc_rate_table *qdisc_rtab_list;
 
struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
struct qdisc_rate_table *rtab;
 
for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
rtab->refcnt++;
return rtab;
}
}
 
if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
return NULL;
 
rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
if (rtab) {
rtab->rate = *r;
rtab->refcnt = 1;
memcpy(rtab->data, RTA_DATA(tab), 1024);
rtab->next = qdisc_rtab_list;
qdisc_rtab_list = rtab;
}
return rtab;
}
 
void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
struct qdisc_rate_table *rtab, **rtabp;
 
if (!tab || --tab->refcnt)
return;
 
for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
if (rtab == tab) {
*rtabp = rtab->next;
kfree(rtab);
return;
}
}
}
 
 
/* Allocate a unique handle from the space managed by the kernel */
 
u32 qdisc_alloc_handle(struct net_device *dev)
{
int i = 0x10000;
static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 
do {
autohandle += TC_H_MAKE(0x10000U, 0);
if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
autohandle = TC_H_MAKE(0x80000000U, 0);
} while (qdisc_lookup(dev, autohandle) && --i > 0);
 
return i>0 ? autohandle : 0;
}
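 
/* Handles are 32-bit "major:minor" pairs: the major (qdisc) number lives in
the upper 16 bits and the minor (class) number in the lower 16 bits, which is
why the allocator above steps by TC_H_MAKE(0x10000U, 0). A build-neutral
sketch of the packing and unpacking, assuming that layout and using
hypothetical helper names, is kept under #if 0. */
#if 0
/* Mirrors TC_H_MAKE/TC_H_MAJ/TC_H_MIN: major in the upper 16 bits,
minor in the lower 16 bits. 'maj' is already shifted, as in TC_H_MAKE. */
static unsigned make_handle(unsigned maj, unsigned min)
{
return (maj & 0xFFFF0000U) | (min & 0x0000FFFFU);
}
 
static unsigned handle_major(unsigned h) { return h & 0xFFFF0000U; }
static unsigned handle_minor(unsigned h) { return h & 0x0000FFFFU; }
 
/* Example: make_handle(0x80010000U, 0x2) == 0x80010002;
handle_major() of it is 0x80010000 and handle_minor() is 0x2. */
#endif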
 
/* Attach toplevel qdisc to device dev */
 
static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
struct Qdisc *oqdisc;
 
if (dev->flags & IFF_UP)
dev_deactivate(dev);
 
write_lock(&qdisc_tree_lock);
spin_lock_bh(&dev->queue_lock);
if (qdisc && qdisc->flags&TCQ_F_INGRES) {
oqdisc = dev->qdisc_ingress;
/* Prune old scheduler */
if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
/* delete */
qdisc_reset(oqdisc);
dev->qdisc_ingress = NULL;
} else { /* new */
dev->qdisc_ingress = qdisc;
}
 
} else {
 
oqdisc = dev->qdisc_sleeping;
 
/* Prune old scheduler */
if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
qdisc_reset(oqdisc);
 
/* ... and graft new one */
if (qdisc == NULL)
qdisc = &noop_qdisc;
dev->qdisc_sleeping = qdisc;
dev->qdisc = &noop_qdisc;
}
 
spin_unlock_bh(&dev->queue_lock);
write_unlock(&qdisc_tree_lock);
 
if (dev->flags & IFF_UP)
dev_activate(dev);
 
return oqdisc;
}
 
 
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
to device "dev".
 
Old qdisc is not destroyed but returned in *old.
*/
 
int qdisc_graft(struct net_device *dev, struct Qdisc *parent, u32 classid,
struct Qdisc *new, struct Qdisc **old)
{
int err = 0;
struct Qdisc *q = *old;
 
 
if (parent == NULL) {
if (q && q->flags&TCQ_F_INGRES) {
*old = dev_graft_qdisc(dev, q);
} else {
*old = dev_graft_qdisc(dev, new);
}
} else {
struct Qdisc_class_ops *cops = parent->ops->cl_ops;
 
err = -EINVAL;
 
if (cops) {
unsigned long cl = cops->get(parent, classid);
if (cl) {
err = cops->graft(parent, cl, new, old);
cops->put(parent, cl);
}
}
}
return err;
}
 
/*
Allocate and initialize new qdisc.
 
Parameters are passed via opt.
*/
 
static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
int err;
struct rtattr *kind = tca[TCA_KIND-1];
struct Qdisc *sch = NULL;
struct Qdisc_ops *ops;
int size;
 
ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
if (ops==NULL && tca[TCA_KIND-1] != NULL) {
char module_name[4 + IFNAMSIZ + 1];
 
if (RTA_PAYLOAD(kind) <= IFNAMSIZ) {
sprintf(module_name, "sch_%s", (char*)RTA_DATA(kind));
request_module (module_name);
ops = qdisc_lookup_ops(kind);
}
}
#endif
 
err = -EINVAL;
if (ops == NULL)
goto err_out;
 
size = sizeof(*sch) + ops->priv_size;
 
sch = kmalloc(size, GFP_KERNEL);
err = -ENOBUFS;
if (!sch)
goto err_out;
 
/* Grrr... Resolve race condition with module unload */
 
err = -EINVAL;
if (ops != qdisc_lookup_ops(kind))
goto err_out;
 
memset(sch, 0, size);
 
skb_queue_head_init(&sch->q);
 
if (handle == TC_H_INGRESS)
sch->flags |= TCQ_F_INGRES;
 
sch->ops = ops;
sch->enqueue = ops->enqueue;
sch->dequeue = ops->dequeue;
sch->dev = dev;
atomic_set(&sch->refcnt, 1);
sch->stats.lock = &dev->queue_lock;
if (handle == 0) {
handle = qdisc_alloc_handle(dev);
err = -ENOMEM;
if (handle == 0)
goto err_out;
}
 
if (handle == TC_H_INGRESS)
sch->handle = TC_H_MAKE(TC_H_INGRESS, 0);
else
sch->handle = handle;
 
if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
write_lock(&qdisc_tree_lock);
sch->next = dev->qdisc_list;
dev->qdisc_list = sch;
write_unlock(&qdisc_tree_lock);
#ifdef CONFIG_NET_ESTIMATOR
if (tca[TCA_RATE-1])
qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
#endif
return sch;
}
 
err_out:
*errp = err;
if (sch)
kfree(sch);
return NULL;
}
 
static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
{
if (tca[TCA_OPTIONS-1]) {
int err;
 
if (sch->ops->change == NULL)
return -EINVAL;
err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
if (err)
return err;
}
#ifdef CONFIG_NET_ESTIMATOR
if (tca[TCA_RATE-1]) {
qdisc_kill_estimator(&sch->stats);
qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
}
#endif
return 0;
}
 
struct check_loop_arg
{
struct qdisc_walker w;
struct Qdisc *p;
int depth;
};
 
static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
 
static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
struct check_loop_arg arg;
 
if (q->ops->cl_ops == NULL)
return 0;
 
arg.w.stop = arg.w.skip = arg.w.count = 0;
arg.w.fn = check_loop_fn;
arg.depth = depth;
arg.p = p;
q->ops->cl_ops->walk(q, &arg.w);
return arg.w.stop ? -ELOOP : 0;
}
 
static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
struct Qdisc *leaf;
struct Qdisc_class_ops *cops = q->ops->cl_ops;
struct check_loop_arg *arg = (struct check_loop_arg *)w;
 
leaf = cops->leaf(q, cl);
if (leaf) {
if (leaf == arg->p || arg->depth > 7)
return -ELOOP;
return check_loop(leaf, arg->p, arg->depth + 1);
}
return 0;
}
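 
/* check_loop above prevents grafting a qdisc q under a parent p that already
lies somewhere inside q's own subtree: it walks q's classes and bails out if
it reaches p or exceeds a small depth bound. The sketch under #if 0 shows the
same bounded-depth subtree search over a hypothetical tree of nodes; nothing
here is used by the real walker. */
#if 0
struct node {
struct node **children;
int nchildren;
};
 
/* Returns non-zero if 'target' occurs in the subtree of 'root' or the
recursion exceeds 'max_depth' levels (treated as a loop, like -ELOOP). */
static int would_loop(struct node *root, struct node *target,
int depth, int max_depth)
{
int i;
if (root == target || depth > max_depth)
return 1;
for (i = 0; i < root->nchildren; i++)
if (would_loop(root->children[i], target, depth + 1, max_depth))
return 1;
return 0;
}
#endif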
 
/*
* Delete/get qdisc.
*/
 
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
struct tcmsg *tcm = NLMSG_DATA(n);
struct rtattr **tca = arg;
struct net_device *dev;
u32 clid = tcm->tcm_parent;
struct Qdisc *q = NULL;
struct Qdisc *p = NULL;
int err;
 
if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
return -ENODEV;
 
if (clid) {
if (clid != TC_H_ROOT) {
if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
return -ENOENT;
q = qdisc_leaf(p, clid);
} else { /* ingress */
q = dev->qdisc_ingress;
}
} else {
q = dev->qdisc_sleeping;
}
if (!q)
return -ENOENT;
 
if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
return -EINVAL;
} else {
if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
return -ENOENT;
}
 
if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
return -EINVAL;
 
if (n->nlmsg_type == RTM_DELQDISC) {
if (!clid)
return -EINVAL;
if (q->handle == 0)
return -ENOENT;
if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
return err;
if (q) {
qdisc_notify(skb, n, clid, q, NULL);
spin_lock_bh(&dev->queue_lock);
qdisc_destroy(q);
spin_unlock_bh(&dev->queue_lock);
}
} else {
qdisc_notify(skb, n, clid, NULL, q);
}
return 0;
}
 
/*
Create/change qdisc.
*/
 
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
struct tcmsg *tcm = NLMSG_DATA(n);
struct rtattr **tca = arg;
struct net_device *dev;
u32 clid = tcm->tcm_parent;
struct Qdisc *q = NULL;
struct Qdisc *p = NULL;
int err;
 
if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
return -ENODEV;
 
if (clid) {
if (clid != TC_H_ROOT) {
if (clid != TC_H_INGRESS) {
if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
return -ENOENT;
q = qdisc_leaf(p, clid);
} else { /*ingress */
q = dev->qdisc_ingress;
}
} else {
q = dev->qdisc_sleeping;
}
 
/* It may be default qdisc, ignore it */
if (q && q->handle == 0)
q = NULL;
 
if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
if (tcm->tcm_handle) {
if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
return -EEXIST;
if (TC_H_MIN(tcm->tcm_handle))
return -EINVAL;
if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
goto create_n_graft;
if (n->nlmsg_flags&NLM_F_EXCL)
return -EEXIST;
if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
return -EINVAL;
if (q == p ||
(p && check_loop(q, p, 0)))
return -ELOOP;
atomic_inc(&q->refcnt);
goto graft;
} else {
if (q == NULL)
goto create_n_graft;
 
/* This magic test requires explanation.
*
* We know that some child q is already
* attached to this parent and we have a choice:
* either to change it or to create/graft a new one.
*
* 1. We are allowed to create/graft only
* if both CREATE and REPLACE flags are set.
*
* 2. If EXCL is set, the requestor wanted to say
* that the qdisc tcm_handle is not expected
* to exist, so we choose create/graft too.
*
* 3. The last case is when no flags are set.
* Alas, it is a sort of hole in the API; we
* cannot decide what to do unambiguously.
* For now we select create/graft if the
* user gave a KIND that does not match the existing one.
*/
if ((n->nlmsg_flags&NLM_F_CREATE) &&
(n->nlmsg_flags&NLM_F_REPLACE) &&
((n->nlmsg_flags&NLM_F_EXCL) ||
(tca[TCA_KIND-1] &&
rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
goto create_n_graft;
}
}
} else {
if (!tcm->tcm_handle)
return -EINVAL;
q = qdisc_lookup(dev, tcm->tcm_handle);
}
 
/* Change qdisc parameters */
if (q == NULL)
return -ENOENT;
if (n->nlmsg_flags&NLM_F_EXCL)
return -EEXIST;
if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
return -EINVAL;
err = qdisc_change(q, tca);
if (err == 0)
qdisc_notify(skb, n, clid, NULL, q);
return err;
 
create_n_graft:
if (!(n->nlmsg_flags&NLM_F_CREATE))
return -ENOENT;
if (clid == TC_H_INGRESS)
q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
else
q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
if (q == NULL)
return err;
 
graft:
if (1) {
struct Qdisc *old_q = NULL;
err = qdisc_graft(dev, p, clid, q, &old_q);
if (err) {
if (q) {
spin_lock_bh(&dev->queue_lock);
qdisc_destroy(q);
spin_unlock_bh(&dev->queue_lock);
}
return err;
}
qdisc_notify(skb, n, clid, old_q, q);
if (old_q) {
spin_lock_bh(&dev->queue_lock);
qdisc_destroy(old_q);
spin_unlock_bh(&dev->queue_lock);
}
}
return 0;
}
 
int qdisc_copy_stats(struct sk_buff *skb, struct tc_stats *st)
{
spin_lock_bh(st->lock);
RTA_PUT(skb, TCA_STATS, (char*)&st->lock - (char*)st, st);
spin_unlock_bh(st->lock);
return 0;
 
rtattr_failure:
spin_unlock_bh(st->lock);
return -1;
}
 
 
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
u32 pid, u32 seq, unsigned flags, int event)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
unsigned char *b = skb->tail;
 
nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
nlh->nlmsg_flags = flags;
tcm = NLMSG_DATA(nlh);
tcm->tcm_family = AF_UNSPEC;
tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
tcm->tcm_parent = clid;
tcm->tcm_handle = q->handle;
tcm->tcm_info = atomic_read(&q->refcnt);
RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
if (q->ops->dump && q->ops->dump(q, skb) < 0)
goto rtattr_failure;
q->stats.qlen = q->q.qlen;
if (qdisc_copy_stats(skb, &q->stats))
goto rtattr_failure;
nlh->nlmsg_len = skb->tail - b;
return skb->len;
 
nlmsg_failure:
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
u32 clid, struct Qdisc *old, struct Qdisc *new)
{
struct sk_buff *skb;
u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
 
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
 
if (old && old->handle) {
if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
goto err_out;
}
if (new) {
if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
goto err_out;
}
 
if (skb->len)
return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
 
err_out:
kfree_skb(skb);
return -EINVAL;
}
 
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
int idx, q_idx;
int s_idx, s_q_idx;
struct net_device *dev;
struct Qdisc *q;
 
s_idx = cb->args[0];
s_q_idx = q_idx = cb->args[1];
read_lock(&dev_base_lock);
for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
if (idx < s_idx)
continue;
if (idx > s_idx)
s_q_idx = 0;
read_lock(&qdisc_tree_lock);
for (q = dev->qdisc_list, q_idx = 0; q;
q = q->next, q_idx++) {
if (q_idx < s_q_idx)
continue;
if (tc_fill_qdisc(skb, q, 0, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
read_unlock(&qdisc_tree_lock);
goto done;
}
}
read_unlock(&qdisc_tree_lock);
}
 
done:
read_unlock(&dev_base_lock);
 
cb->args[0] = idx;
cb->args[1] = q_idx;
 
return skb->len;
}
 
 
 
/************************************************
* Traffic classes manipulation. *
************************************************/
 
 
 
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
struct tcmsg *tcm = NLMSG_DATA(n);
struct rtattr **tca = arg;
struct net_device *dev;
struct Qdisc *q = NULL;
struct Qdisc_class_ops *cops;
unsigned long cl = 0;
unsigned long new_cl;
u32 pid = tcm->tcm_parent;
u32 clid = tcm->tcm_handle;
u32 qid = TC_H_MAJ(clid);
int err;
 
if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
return -ENODEV;
 
/*
parent == TC_H_UNSPEC - unspecified parent.
parent == TC_H_ROOT - class is root, which has no parent.
parent == X:0 - parent is root class.
parent == X:Y - parent is a node in hierarchy.
parent == 0:Y - parent is X:Y, where X:0 is qdisc.
 
handle == 0:0 - generate handle from kernel pool.
handle == 0:Y - class is X:Y, where X:0 is qdisc.
handle == X:Y - clear.
handle == X:0 - root class.
*/
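 
/*
* Editorial note (not in the original source): a concrete example of the
* handle arithmetic below, assuming the standard TC_H_* macros. A request
* with parent 1:0 (pid 0x00010000) and handle 0:5 (clid 0x00000005) gives
* qid = TC_H_MAJ(pid) = 0x00010000, and the class handle is then completed
* to clid = TC_H_MAKE(qid, clid) = 0x00010005, i.e. class 1:5.
*/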
 
/* Step 1. Determine qdisc handle X:0 */
 
if (pid != TC_H_ROOT) {
u32 qid1 = TC_H_MAJ(pid);
 
if (qid && qid1) {
/* If both majors are known, they must be identical. */
if (qid != qid1)
return -EINVAL;
} else if (qid1) {
qid = qid1;
} else if (qid == 0)
qid = dev->qdisc_sleeping->handle;
 
/* Now qid is a genuine qdisc handle, consistent
with both parent and child.
 
TC_H_MAJ(pid) may still be unspecified; complete it now.
*/
if (pid)
pid = TC_H_MAKE(qid, pid);
} else {
if (qid == 0)
qid = dev->qdisc_sleeping->handle;
}
 
/* OK. Locate qdisc */
if ((q = qdisc_lookup(dev, qid)) == NULL)
return -ENOENT;
 
/* And check that it supports classes */
cops = q->ops->cl_ops;
if (cops == NULL)
return -EINVAL;
 
/* Now try to get class */
if (clid == 0) {
if (pid == TC_H_ROOT)
clid = qid;
} else
clid = TC_H_MAKE(qid, clid);
 
if (clid)
cl = cops->get(q, clid);
 
if (cl == 0) {
err = -ENOENT;
if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
goto out;
} else {
switch (n->nlmsg_type) {
case RTM_NEWTCLASS:
err = -EEXIST;
if (n->nlmsg_flags&NLM_F_EXCL)
goto out;
break;
case RTM_DELTCLASS:
err = cops->delete(q, cl);
if (err == 0)
tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
goto out;
case RTM_GETTCLASS:
err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
goto out;
default:
err = -EINVAL;
goto out;
}
}
 
new_cl = cl;
err = cops->change(q, clid, pid, tca, &new_cl);
if (err == 0)
tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
 
out:
if (cl)
cops->put(q, cl);
 
return err;
}
 
 
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
unsigned long cl,
u32 pid, u32 seq, unsigned flags, int event)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
unsigned char *b = skb->tail;
 
nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
nlh->nlmsg_flags = flags;
tcm = NLMSG_DATA(nlh);
tcm->tcm_family = AF_UNSPEC;
tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
tcm->tcm_parent = q->handle;
tcm->tcm_handle = q->handle;
tcm->tcm_info = 0;
RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
if (q->ops->cl_ops->dump && q->ops->cl_ops->dump(q, cl, skb, tcm) < 0)
goto rtattr_failure;
nlh->nlmsg_len = skb->tail - b;
return skb->len;
 
nlmsg_failure:
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
struct Qdisc *q, unsigned long cl, int event)
{
struct sk_buff *skb;
u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
 
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
 
if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
kfree_skb(skb);
return -EINVAL;
}
 
return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}
 
struct qdisc_dump_args
{
struct qdisc_walker w;
struct sk_buff *skb;
struct netlink_callback *cb;
};
 
static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
 
return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}
 
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
int t;
int s_t;
struct net_device *dev;
struct Qdisc *q;
struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
struct qdisc_dump_args arg;
 
if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
return 0;
if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
return 0;
 
s_t = cb->args[0];
 
read_lock(&qdisc_tree_lock);
for (q=dev->qdisc_list, t=0; q; q = q->next, t++) {
if (t < s_t) continue;
if (!q->ops->cl_ops) continue;
if (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle)
continue;
if (t > s_t)
memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
arg.w.fn = qdisc_class_dump;
arg.skb = skb;
arg.cb = cb;
arg.w.stop = 0;
arg.w.skip = cb->args[1];
arg.w.count = 0;
q->ops->cl_ops->walk(q, &arg.w);
cb->args[1] = arg.w.count;
if (arg.w.stop)
break;
}
read_unlock(&qdisc_tree_lock);
 
cb->args[0] = t;
 
dev_put(dev);
return skb->len;
}
 
int psched_us_per_tick = 1;
int psched_tick_per_us = 1;
 
#ifdef CONFIG_PROC_FS
static int psched_read_proc(char *buffer, char **start, off_t offset,
int length, int *eof, void *data)
{
int len;
 
len = sprintf(buffer, "%08x %08x %08x %08x\n",
psched_tick_per_us, psched_us_per_tick,
1000000, HZ);
 
len -= offset;
 
if (len > length)
len = length;
if(len < 0)
len = 0;
 
*start = buffer + offset;
*eof = 1;
 
return len;
}
#endif
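 
/*
* Editorial note (not in the original source): the four hex words printed
* above are, in order, psched_tick_per_us, psched_us_per_tick, 1000000 and
* HZ. User space (e.g. the tc utility) typically reads /proc/net/psched to
* learn these conversion factors when translating rates and times into the
* kernel's scheduler clock units.
*/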
 
#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY
int psched_tod_diff(int delta_sec, int bound)
{
int delta;
 
if (bound <= 1000000 || delta_sec > (0x7FFFFFFF/1000000)-1)
return bound;
delta = delta_sec * 1000000;
if (delta > bound)
delta = bound;
return delta;
}
#endif
 
psched_time_t psched_time_base;
 
#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
psched_tdiff_t psched_clock_per_hz;
int psched_clock_scale;
#endif
 
#ifdef PSCHED_WATCHER
PSCHED_WATCHER psched_time_mark;
 
static void psched_tick(unsigned long);
 
static struct timer_list psched_timer =
{ function: psched_tick };
 
static void psched_tick(unsigned long dummy)
{
#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
psched_time_t dummy_stamp;
PSCHED_GET_TIME(dummy_stamp);
/* It is OK up to 4GHz cpu */
psched_timer.expires = jiffies + 1*HZ;
#else
unsigned long now = jiffies;
psched_time_base += ((u64)(now-psched_time_mark))<<PSCHED_JSCALE;
psched_time_mark = now;
psched_timer.expires = now + 60*60*HZ;
#endif
add_timer(&psched_timer);
}
#endif
 
#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
int __init psched_calibrate_clock(void)
{
psched_time_t stamp, stamp1;
struct timeval tv, tv1;
psched_tdiff_t delay;
long rdelay;
unsigned long stop;
 
#ifdef PSCHED_WATCHER
psched_tick(0);
#endif
stop = jiffies + HZ/10;
PSCHED_GET_TIME(stamp);
do_gettimeofday(&tv);
while (time_before(jiffies, stop)) {
barrier();
cpu_relax();
}
PSCHED_GET_TIME(stamp1);
do_gettimeofday(&tv1);
 
delay = PSCHED_TDIFF(stamp1, stamp);
rdelay = tv1.tv_usec - tv.tv_usec;
rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
if (rdelay > delay)
return -1;
delay /= rdelay;
psched_tick_per_us = delay;
while ((delay>>=1) != 0)
psched_clock_scale++;
psched_us_per_tick = 1<<psched_clock_scale;
psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
return 0;
}
#endif
 
int __init pktsched_init(void)
{
struct rtnetlink_link *link_p;
 
#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
if (psched_calibrate_clock() < 0)
return -1;
#elif PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES
psched_tick_per_us = HZ<<PSCHED_JSCALE;
psched_us_per_tick = 1000000;
#ifdef PSCHED_WATCHER
psched_tick(0);
#endif
#endif
 
link_p = rtnetlink_links[PF_UNSPEC];
 
/* Set up rtnetlink links. It is done here to avoid
exporting a large number of public symbols.
*/
 
if (link_p) {
link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
}
 
#define INIT_QDISC(name) { \
extern struct Qdisc_ops name##_qdisc_ops; \
register_qdisc(& name##_qdisc_ops); \
}
 
INIT_QDISC(pfifo);
INIT_QDISC(bfifo);
 
#ifdef CONFIG_NET_SCH_CBQ
INIT_QDISC(cbq);
#endif
#ifdef CONFIG_NET_SCH_HTB
INIT_QDISC(htb);
#endif
#ifdef CONFIG_NET_SCH_CSZ
INIT_QDISC(csz);
#endif
#ifdef CONFIG_NET_SCH_HPFQ
INIT_QDISC(hpfq);
#endif
#ifdef CONFIG_NET_SCH_HFSC
INIT_QDISC(hfsc);
#endif
#ifdef CONFIG_NET_SCH_RED
INIT_QDISC(red);
#endif
#ifdef CONFIG_NET_SCH_GRED
INIT_QDISC(gred);
#endif
#ifdef CONFIG_NET_SCH_INGRESS
INIT_QDISC(ingress);
#endif
#ifdef CONFIG_NET_SCH_DSMARK
INIT_QDISC(dsmark);
#endif
#ifdef CONFIG_NET_SCH_SFQ
INIT_QDISC(sfq);
#endif
#ifdef CONFIG_NET_SCH_TBF
INIT_QDISC(tbf);
#endif
#ifdef CONFIG_NET_SCH_TEQL
teql_init();
#endif
#ifdef CONFIG_NET_SCH_PRIO
INIT_QDISC(prio);
#endif
#ifdef CONFIG_NET_SCH_ATM
INIT_QDISC(atm);
#endif
#ifdef CONFIG_NET_CLS
tc_filter_init();
#endif
 
#ifdef CONFIG_PROC_FS
create_proc_read_entry("net/psched", 0, 0, psched_read_proc, NULL);
#endif
 
return 0;
}
/sch_prio.c
0,0 → 1,424
/*
* net/sched/sch_prio.c Simple 3-band priority "scheduler".
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>:
* Init -- EINVAL when opt undefined
*/
 
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
 
 
struct prio_sched_data
{
int bands;
struct tcf_proto *filter_list;
u8 prio2band[TC_PRIO_MAX+1];
struct Qdisc *queues[TCQ_PRIO_BANDS];
};
 
 
static __inline__ unsigned prio_classify(struct sk_buff *skb, struct Qdisc *sch)
{
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
struct tcf_result res;
u32 band;
 
band = skb->priority;
if (TC_H_MAJ(skb->priority) != sch->handle) {
if (!q->filter_list || tc_classify(skb, q->filter_list, &res)) {
if (TC_H_MAJ(band))
band = 0;
return q->prio2band[band&TC_PRIO_MAX];
}
band = res.classid;
}
band = TC_H_MIN(band) - 1;
return band < q->bands ? band : q->prio2band[0];
}
 
static int
prio_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
struct Qdisc *qdisc;
int ret;
 
qdisc = q->queues[prio_classify(skb, sch)];
 
if ((ret = qdisc->enqueue(skb, qdisc)) == 0) {
sch->stats.bytes += skb->len;
sch->stats.packets++;
sch->q.qlen++;
return 0;
}
sch->stats.drops++;
return ret;
}
 
 
static int
prio_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
struct Qdisc *qdisc;
int ret;
 
qdisc = q->queues[prio_classify(skb, sch)];
 
if ((ret = qdisc->ops->requeue(skb, qdisc)) == 0) {
sch->q.qlen++;
return 0;
}
sch->stats.drops++;
return ret;
}
 
 
static struct sk_buff *
prio_dequeue(struct Qdisc* sch)
{
struct sk_buff *skb;
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
int prio;
struct Qdisc *qdisc;
 
for (prio = 0; prio < q->bands; prio++) {
qdisc = q->queues[prio];
skb = qdisc->dequeue(qdisc);
if (skb) {
sch->q.qlen--;
return skb;
}
}
return NULL;
 
}
 
static unsigned int prio_drop(struct Qdisc* sch)
{
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
int prio;
unsigned int len;
struct Qdisc *qdisc;
 
for (prio = q->bands-1; prio >= 0; prio--) {
qdisc = q->queues[prio];
if ((len = qdisc->ops->drop(qdisc)) != 0) {
sch->q.qlen--;
return len;
}
}
return 0;
}
 
 
static void
prio_reset(struct Qdisc* sch)
{
int prio;
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
 
for (prio=0; prio<q->bands; prio++)
qdisc_reset(q->queues[prio]);
sch->q.qlen = 0;
}
 
static void
prio_destroy(struct Qdisc* sch)
{
int prio;
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
struct tcf_proto *tp;
 
while ((tp = q->filter_list) != NULL) {
q->filter_list = tp->next;
tcf_destroy(tp);
}
 
for (prio=0; prio<q->bands; prio++) {
qdisc_destroy(q->queues[prio]);
q->queues[prio] = &noop_qdisc;
}
MOD_DEC_USE_COUNT;
}
 
static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
{
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
struct tc_prio_qopt *qopt = RTA_DATA(opt);
int i;
 
if (opt->rta_len < RTA_LENGTH(sizeof(*qopt)))
return -EINVAL;
if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
return -EINVAL;
 
for (i=0; i<=TC_PRIO_MAX; i++) {
if (qopt->priomap[i] >= qopt->bands)
return -EINVAL;
}
 
sch_tree_lock(sch);
q->bands = qopt->bands;
memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
 
for (i=q->bands; i<TCQ_PRIO_BANDS; i++) {
struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc);
if (child != &noop_qdisc)
qdisc_destroy(child);
}
sch_tree_unlock(sch);
 
for (i=0; i<=TC_PRIO_MAX; i++) {
int band = q->prio2band[i];
if (q->queues[band] == &noop_qdisc) {
struct Qdisc *child;
child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
if (child) {
sch_tree_lock(sch);
child = xchg(&q->queues[band], child);
 
if (child != &noop_qdisc)
qdisc_destroy(child);
sch_tree_unlock(sch);
}
}
}
return 0;
}
 
static int prio_init(struct Qdisc *sch, struct rtattr *opt)
{
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
int i;
 
for (i=0; i<TCQ_PRIO_BANDS; i++)
q->queues[i] = &noop_qdisc;
 
if (opt == NULL) {
return -EINVAL;
} else {
int err;
 
if ((err= prio_tune(sch, opt)) != 0)
return err;
}
MOD_INC_USE_COUNT;
return 0;
}
 
static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
unsigned char *b = skb->tail;
struct tc_prio_qopt opt;
 
opt.bands = q->bands;
memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct Qdisc **old)
{
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
unsigned long band = arg - 1;
 
if (band >= q->bands)
return -EINVAL;
 
if (new == NULL)
new = &noop_qdisc;
 
sch_tree_lock(sch);
*old = q->queues[band];
q->queues[band] = new;
sch->q.qlen -= (*old)->q.qlen;
qdisc_reset(*old);
sch_tree_unlock(sch);
 
return 0;
}
 
static struct Qdisc *
prio_leaf(struct Qdisc *sch, unsigned long arg)
{
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
unsigned long band = arg - 1;
 
if (band >= q->bands)
return NULL;
 
return q->queues[band];
}
 
static unsigned long prio_get(struct Qdisc *sch, u32 classid)
{
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
unsigned long band = TC_H_MIN(classid);
 
if (band - 1 >= q->bands)
return 0;
return band;
}
 
static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid)
{
return prio_get(sch, classid);
}
 
 
static void prio_put(struct Qdisc *q, unsigned long cl)
{
return;
}
 
static int prio_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg)
{
unsigned long cl = *arg;
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
 
if (cl - 1 > q->bands)
return -ENOENT;
return 0;
}
 
static int prio_delete(struct Qdisc *sch, unsigned long cl)
{
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
if (cl - 1 > q->bands)
return -ENOENT;
return 0;
}
 
 
static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb,
struct tcmsg *tcm)
{
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
 
if (cl - 1 > q->bands)
return -ENOENT;
tcm->tcm_handle |= TC_H_MIN(cl);
if (q->queues[cl-1])
tcm->tcm_info = q->queues[cl-1]->handle;
return 0;
}
 
static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
int prio;
 
if (arg->stop)
return;
 
for (prio = 0; prio < q->bands; prio++) {
if (arg->count < arg->skip) {
arg->count++;
continue;
}
if (arg->fn(sch, prio+1, arg) < 0) {
arg->stop = 1;
break;
}
arg->count++;
}
}
 
static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl)
{
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
 
if (cl)
return NULL;
return &q->filter_list;
}
 
static struct Qdisc_class_ops prio_class_ops =
{
prio_graft,
prio_leaf,
 
prio_get,
prio_put,
prio_change,
prio_delete,
prio_walk,
 
prio_find_tcf,
prio_bind,
prio_put,
 
prio_dump_class,
};
 
struct Qdisc_ops prio_qdisc_ops =
{
NULL,
&prio_class_ops,
"prio",
sizeof(struct prio_sched_data),
 
prio_enqueue,
prio_dequeue,
prio_requeue,
prio_drop,
 
prio_init,
prio_reset,
prio_destroy,
prio_tune,
 
prio_dump,
};
 
#ifdef MODULE
 
int init_module(void)
{
return register_qdisc(&prio_qdisc_ops);
}
 
void cleanup_module(void)
{
unregister_qdisc(&prio_qdisc_ops);
}
 
#endif
MODULE_LICENSE("GPL");
/estimator.c
0,0 → 1,197
/*
* net/sched/estimator.c Simple rate estimator.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
 
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
 
/*
This code is NOT intended to be used for statistics collection,
its purpose is to provide a base for statistical multiplexing
for controlled load service.
If you need only statistics, run a user level daemon which
periodically reads byte counters.
 
Unfortunately, rate estimation is not a very easy task.
F.e. I did not find a simple way to estimate the current peak rate
and even failed to formulate the problem 8)8)
 
So I preferred not to build an estimator into the scheduler,
but to run this task separately.
Ideally, it should be kernel thread(s), but for now it runs
from timers, which puts an apparent upper bound on the number of rated
flows, has minimal overhead when that number is small, and is enough
to handle controlled load service and sets of aggregates.
 
We measure rate over A=(1<<interval) seconds and evaluate EWMA:
 
avrate = avrate*(1-W) + rate*W
 
where W is chosen as a negative power of 2: W = 2^(-ewma_log)
 
The resulting time constant is:
 
T = A/(-ln(1-W))
 
 
NOTES.
 
* The stored value for avbps is scaled by 2^5, so that the maximal
rate is ~1Gbit; avpps is scaled by 2^10.
 
* The minimal interval is HZ/4 = 250 msec (it is the greatest common divisor
for HZ=100 and HZ=1024 8)), the maximal interval
is (HZ/4)*2^EST_MAX_INTERVAL = 8 sec. Shorter intervals
are too expensive; longer ones can be implemented
at user level painlessly.
*/
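 
/*
* Editorial sketch (not part of the original file): the EWMA update above,
* written out as a plain helper. With W = 2^(-ewma_log), e.g. ewma_log = 3
* (W = 1/8) and A = 1 sec, the time constant is T = A/(-ln(1-W)) ~= 7.5 sec.
* The helper name is invented for illustration only.
*/
static inline u32 ewma_update_sketch(u32 avrate, u32 rate, int ewma_log)
{
	/* avrate = avrate*(1-W) + rate*W, with W = 2^(-ewma_log) */
	return avrate + (((long)rate - (long)avrate) >> ewma_log);
}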
 
#if (HZ%4) != 0
#error Bad HZ value.
#endif
 
#define EST_MAX_INTERVAL 5
 
struct qdisc_estimator
{
struct qdisc_estimator *next;
struct tc_stats *stats;
unsigned interval;
int ewma_log;
u64 last_bytes;
u32 last_packets;
u32 avpps;
u32 avbps;
};
 
struct qdisc_estimator_head
{
struct timer_list timer;
struct qdisc_estimator *list;
};
 
static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1];
 
/* Estimator array lock */
static rwlock_t est_lock = RW_LOCK_UNLOCKED;
 
static void est_timer(unsigned long arg)
{
int idx = (int)arg;
struct qdisc_estimator *e;
 
read_lock(&est_lock);
for (e = elist[idx].list; e; e = e->next) {
struct tc_stats *st = e->stats;
u64 nbytes;
u32 npackets;
u32 rate;
 
spin_lock(st->lock);
nbytes = st->bytes;
npackets = st->packets;
rate = (nbytes - e->last_bytes)<<(7 - idx);
e->last_bytes = nbytes;
e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
st->bps = (e->avbps+0xF)>>5;
 
rate = (npackets - e->last_packets)<<(12 - idx);
e->last_packets = npackets;
e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
e->stats->pps = (e->avpps+0x1FF)>>10;
spin_unlock(st->lock);
}
 
mod_timer(&elist[idx].timer, jiffies + ((HZ/4)<<idx));
read_unlock(&est_lock);
}
 
int qdisc_new_estimator(struct tc_stats *stats, struct rtattr *opt)
{
struct qdisc_estimator *est;
struct tc_estimator *parm = RTA_DATA(opt);
 
if (RTA_PAYLOAD(opt) < sizeof(*parm))
return -EINVAL;
 
if (parm->interval < -2 || parm->interval > 3)
return -EINVAL;
 
est = kmalloc(sizeof(*est), GFP_KERNEL);
if (est == NULL)
return -ENOBUFS;
 
memset(est, 0, sizeof(*est));
est->interval = parm->interval + 2;
est->stats = stats;
est->ewma_log = parm->ewma_log;
est->last_bytes = stats->bytes;
est->avbps = stats->bps<<5;
est->last_packets = stats->packets;
est->avpps = stats->pps<<10;
 
est->next = elist[est->interval].list;
if (est->next == NULL) {
init_timer(&elist[est->interval].timer);
elist[est->interval].timer.data = est->interval;
elist[est->interval].timer.expires = jiffies + ((HZ/4)<<est->interval);
elist[est->interval].timer.function = est_timer;
add_timer(&elist[est->interval].timer);
}
write_lock_bh(&est_lock);
elist[est->interval].list = est;
write_unlock_bh(&est_lock);
return 0;
}
 
void qdisc_kill_estimator(struct tc_stats *stats)
{
int idx;
struct qdisc_estimator *est, **pest;
 
for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
int killed = 0;
pest = &elist[idx].list;
while ((est=*pest) != NULL) {
if (est->stats != stats) {
pest = &est->next;
continue;
}
 
write_lock_bh(&est_lock);
*pest = est->next;
write_unlock_bh(&est_lock);
 
kfree(est);
killed++;
}
if (killed && elist[idx].list == NULL)
del_timer(&elist[idx].timer);
}
}
 
/sch_red.c
0,0 → 1,481
/*
* net/sched/sch_red.c Random Early Detection queue.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
* J Hadi Salim <hadi@nortel.com> 980914: computation fixes
* Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
* J Hadi Salim <hadi@nortelnetworks.com> 980816: ECN support
*/
 
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
 
 
/* Random Early Detection (RED) algorithm.
=======================================
 
Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
 
This file codes a "divisionless" version of the RED algorithm
as written down in Fig.17 of the paper.
 
Short description.
------------------
 
When a new packet arrives we calculate the average queue length:
 
avg = (1-W)*avg + W*current_queue_len,
 
W is the filter time constant (chosen as 2^(-Wlog)); it controls
the inertia of the algorithm. To allow larger bursts, W should be
decreased.
 
if (avg > th_max) -> packet marked (dropped).
if (avg < th_min) -> packet passes.
if (th_min < avg < th_max) we calculate probability:
 
Pb = max_P * (avg - th_min)/(th_max-th_min)
 
and mark (drop) the packet with this probability.
Pb changes from 0 (at avg==th_min) to max_P (at avg==th_max).
max_P should be small (not 1); usually 0.01..0.02 is a good value.
 
max_P is chosen so that max_P/(th_max-th_min)
is a negative power of two, so that the arithmetic
involves only shifts.
 
 
Parameters, settable by user:
-----------------------------
 
limit - bytes (must be > qth_max + burst)
 
Hard limit on queue length; it should be chosen > qth_max
to allow packet bursts. This parameter does not
affect the algorithm's behaviour and can be chosen
arbitrarily high (well, less than RAM size).
Really, this limit will never be reached
if RED works correctly.
 
qth_min - bytes (should be < qth_max/2)
qth_max - bytes (should be at least 2*qth_min and less than limit)
Wlog - bits (<32) log(1/W).
Plog - bits (<32)
 
Plog is related to max_P by formula:
 
max_P = (qth_max-qth_min)/2^Plog;
 
F.e. if qth_max=128K and qth_min=32K, then Plog=22
corresponds to max_P=0.02
 
Scell_log
Stab
 
Lookup table for log((1-W)^(t/t_ave)).
 
 
NOTES:
 
Upper bound on W.
-----------------
 
If you want to allow bursts of L packets of size S,
you should choose W:
 
L + 1 - th_min/S < (1-(1-W)^L)/W
 
th_min/S = 32 th_min/S = 4
log(W) L
-1 33
-2 35
-3 39
-4 46
-5 57
-6 75
-7 101
-8 135
-9 190
etc.
*/
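 
/*
* Editorial sketch (not part of the original file): the marking decision
* described above, written in plain floating point instead of the
* shift-only fixed point arithmetic used by red_enqueue() below.
* Function and parameter names are invented for illustration only.
*/
static inline int red_would_mark_sketch(double avg, double th_min,
					double th_max, double max_P,
					double rnd /* uniform in [0,1) */)
{
	double Pb;
 
	if (avg < th_min)
		return 0;		/* packet passes */
	if (avg > th_max)
		return 1;		/* packet marked (dropped) */
	Pb = max_P * (avg - th_min) / (th_max - th_min);
	return rnd < Pb;		/* mark with probability Pb */
}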
 
struct red_sched_data
{
/* Parameters */
u32 limit; /* HARD maximal queue length */
u32 qth_min; /* Min average length threshold: A scaled */
u32 qth_max; /* Max average length threshold: A scaled */
u32 Rmask;
u32 Scell_max;
unsigned char flags;
char Wlog; /* log(W) */
char Plog; /* random number bits */
char Scell_log;
u8 Stab[256];
 
/* Variables */
unsigned long qave; /* Average queue length: A scaled */
int qcount; /* Packets since last random number generation */
u32 qR; /* Cached random number */
 
psched_time_t qidlestart; /* Start of idle period */
struct tc_red_xstats st;
};
 
static int red_ecn_mark(struct sk_buff *skb)
{
if (skb->nh.raw + 20 > skb->tail)
return 0;
 
switch (skb->protocol) {
case __constant_htons(ETH_P_IP):
if (!INET_ECN_is_capable(skb->nh.iph->tos))
return 0;
if (INET_ECN_is_not_ce(skb->nh.iph->tos))
IP_ECN_set_ce(skb->nh.iph);
return 1;
case __constant_htons(ETH_P_IPV6):
if (!INET_ECN_is_capable(ip6_get_dsfield(skb->nh.ipv6h)))
return 0;
IP6_ECN_set_ce(skb->nh.ipv6h);
return 1;
default:
return 0;
}
}
 
static int
red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
struct red_sched_data *q = (struct red_sched_data *)sch->data;
 
psched_time_t now;
 
if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
long us_idle;
int shift;
 
PSCHED_GET_TIME(now);
us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max, 0);
PSCHED_SET_PASTPERFECT(q->qidlestart);
 
/*
The problem: ideally, average queue length recalculation should
be done over constant clock intervals. This is too expensive, so
the calculation is driven by outgoing packets.
When the queue is idle we have to model this clock by hand.
 
SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
dummy packets as a burst after idle time, i.e.
 
q->qave *= (1-W)^m
 
This is an apparently overcomplicated solution (f.e. we have to precompute
a table to make this calculation in reasonable time).
I believe that a simpler model may be used here,
but it is a field for experiments.
*/
shift = q->Stab[us_idle>>q->Scell_log];
 
if (shift) {
q->qave >>= shift;
} else {
/* Approximate initial part of exponent
with linear function:
(1-W)^m ~= 1-mW + ...
 
It seems to be the best solution to
the problem of too coarse exponent tabulation.
*/
 
us_idle = (q->qave * us_idle)>>q->Scell_log;
if (us_idle < q->qave/2)
q->qave -= us_idle;
else
q->qave >>= 1;
}
} else {
q->qave += sch->stats.backlog - (q->qave >> q->Wlog);
/* NOTE:
q->qave is a fixed point number with the point at Wlog.
The formula above is equivalent to the floating point
version:
 
qave = qave*(1-W) + sch->stats.backlog*W;
--ANK (980924)
*/
}
 
if (q->qave < q->qth_min) {
q->qcount = -1;
enqueue:
if (sch->stats.backlog + skb->len <= q->limit) {
__skb_queue_tail(&sch->q, skb);
sch->stats.backlog += skb->len;
sch->stats.bytes += skb->len;
sch->stats.packets++;
return NET_XMIT_SUCCESS;
} else {
q->st.pdrop++;
}
kfree_skb(skb);
sch->stats.drops++;
return NET_XMIT_DROP;
}
if (q->qave >= q->qth_max) {
q->qcount = -1;
sch->stats.overlimits++;
mark:
if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) {
q->st.early++;
goto drop;
}
q->st.marked++;
goto enqueue;
}
 
if (++q->qcount) {
/* The formula used below causes questions.
 
OK. qR is a random number in the interval 0..Rmask,
i.e. 0..(2^Plog - 1). If we used floating point
arithmetic, it would be (2^Plog)*rnd_num,
where rnd_num is less than 1.
 
Taking into account that qave has its fixed
point at Wlog, and that Plog is related to max_P by
max_P = (qth_max-qth_min)/2^Plog, the two lines
below have the following floating point equivalent:
max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount
 
Any questions? --ANK (980924)
*/
if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
goto enqueue;
q->qcount = 0;
q->qR = net_random()&q->Rmask;
sch->stats.overlimits++;
goto mark;
}
q->qR = net_random()&q->Rmask;
goto enqueue;
 
drop:
kfree_skb(skb);
sch->stats.drops++;
return NET_XMIT_CN;
}
 
static int
red_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
struct red_sched_data *q = (struct red_sched_data *)sch->data;
 
PSCHED_SET_PASTPERFECT(q->qidlestart);
 
__skb_queue_head(&sch->q, skb);
sch->stats.backlog += skb->len;
return 0;
}
 
static struct sk_buff *
red_dequeue(struct Qdisc* sch)
{
struct sk_buff *skb;
struct red_sched_data *q = (struct red_sched_data *)sch->data;
 
skb = __skb_dequeue(&sch->q);
if (skb) {
sch->stats.backlog -= skb->len;
return skb;
}
PSCHED_GET_TIME(q->qidlestart);
return NULL;
}
 
static unsigned int red_drop(struct Qdisc* sch)
{
struct sk_buff *skb;
struct red_sched_data *q = (struct red_sched_data *)sch->data;
 
skb = __skb_dequeue_tail(&sch->q);
if (skb) {
unsigned int len = skb->len;
sch->stats.backlog -= len;
sch->stats.drops++;
q->st.other++;
kfree_skb(skb);
return len;
}
PSCHED_GET_TIME(q->qidlestart);
return 0;
}
 
static void red_reset(struct Qdisc* sch)
{
struct red_sched_data *q = (struct red_sched_data *)sch->data;
 
__skb_queue_purge(&sch->q);
sch->stats.backlog = 0;
PSCHED_SET_PASTPERFECT(q->qidlestart);
q->qave = 0;
q->qcount = -1;
}
 
static int red_change(struct Qdisc *sch, struct rtattr *opt)
{
struct red_sched_data *q = (struct red_sched_data *)sch->data;
struct rtattr *tb[TCA_RED_STAB];
struct tc_red_qopt *ctl;
 
if (opt == NULL ||
rtattr_parse(tb, TCA_RED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 ||
RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) ||
RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256)
return -EINVAL;
 
ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);
 
sch_tree_lock(sch);
q->flags = ctl->flags;
q->Wlog = ctl->Wlog;
q->Plog = ctl->Plog;
q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
q->Scell_log = ctl->Scell_log;
q->Scell_max = (255<<q->Scell_log);
q->qth_min = ctl->qth_min<<ctl->Wlog;
q->qth_max = ctl->qth_max<<ctl->Wlog;
q->limit = ctl->limit;
memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
 
q->qcount = -1;
if (skb_queue_len(&sch->q) == 0)
PSCHED_SET_PASTPERFECT(q->qidlestart);
sch_tree_unlock(sch);
return 0;
}
 
static int red_init(struct Qdisc* sch, struct rtattr *opt)
{
int err;
 
MOD_INC_USE_COUNT;
 
if ((err = red_change(sch, opt)) != 0) {
MOD_DEC_USE_COUNT;
}
return err;
}
 
 
int red_copy_xstats(struct sk_buff *skb, struct tc_red_xstats *st)
{
RTA_PUT(skb, TCA_XSTATS, sizeof(*st), st);
return 0;
 
rtattr_failure:
return 1;
}
 
static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct red_sched_data *q = (struct red_sched_data *)sch->data;
unsigned char *b = skb->tail;
struct rtattr *rta;
struct tc_red_qopt opt;
 
rta = (struct rtattr*)b;
RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
opt.limit = q->limit;
opt.qth_min = q->qth_min>>q->Wlog;
opt.qth_max = q->qth_max>>q->Wlog;
opt.Wlog = q->Wlog;
opt.Plog = q->Plog;
opt.Scell_log = q->Scell_log;
opt.flags = q->flags;
RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
rta->rta_len = skb->tail - b;
 
if (red_copy_xstats(skb, &q->st))
goto rtattr_failure;
 
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
static void red_destroy(struct Qdisc *sch)
{
MOD_DEC_USE_COUNT;
}
 
struct Qdisc_ops red_qdisc_ops =
{
NULL,
NULL,
"red",
sizeof(struct red_sched_data),
 
red_enqueue,
red_dequeue,
red_requeue,
red_drop,
 
red_init,
red_reset,
red_destroy,
red_change,
 
red_dump,
};
 
 
#ifdef MODULE
int init_module(void)
{
return register_qdisc(&red_qdisc_ops);
}
 
void cleanup_module(void)
{
unregister_qdisc(&red_qdisc_ops);
}
#endif
MODULE_LICENSE("GPL");
/sch_ingress.c
0,0 → 1,386
/* net/sched/sch_ingress.c - Ingress qdisc
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Jamal Hadi Salim 1999
*/
 
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter.h>
#include <linux/smp.h>
#include <net/pkt_sched.h>
#include <asm/byteorder.h>
#include <asm/uaccess.h>
#include <linux/kmod.h>
#include <linux/stat.h>
#include <linux/interrupt.h>
#include <linux/list.h>
 
 
#undef DEBUG_INGRESS
 
#ifdef DEBUG_INGRESS /* control */
#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define DPRINTK(format,args...)
#endif
 
#if 0 /* data */
#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define D2PRINTK(format,args...)
#endif
 
 
#define PRIV(sch) ((struct ingress_qdisc_data *) (sch)->data)
 
 
/* Thanks to Doron Oz for this hack
*/
static int nf_registered = 0;
 
struct ingress_qdisc_data {
struct Qdisc *q;
struct tcf_proto *filter_list;
};
 
 
/* ------------------------- Class/flow operations ------------------------- */
 
 
static int ingress_graft(struct Qdisc *sch,unsigned long arg,
struct Qdisc *new,struct Qdisc **old)
{
#ifdef DEBUG_INGRESS
struct ingress_qdisc_data *p = PRIV(sch);
#endif
 
DPRINTK("ingress_graft(sch %p,[qdisc %p],new %p,old %p)\n",
sch, p, new, old);
DPRINTK("\n ingress_graft: You cannot add qdiscs to classes");
return 1;
}
 
 
static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
{
return NULL;
}
 
 
static unsigned long ingress_get(struct Qdisc *sch,u32 classid)
{
#ifdef DEBUG_INGRESS
struct ingress_qdisc_data *p = PRIV(sch);
#endif
DPRINTK("ingress_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid);
return TC_H_MIN(classid) + 1;
}
 
 
static unsigned long ingress_bind_filter(struct Qdisc *sch,
unsigned long parent, u32 classid)
{
return ingress_get(sch, classid);
}
 
 
static void ingress_put(struct Qdisc *sch, unsigned long cl)
{
}
 
 
static int ingress_change(struct Qdisc *sch, u32 classid, u32 parent,
struct rtattr **tca, unsigned long *arg)
{
#ifdef DEBUG_INGRESS
struct ingress_qdisc_data *p = PRIV(sch);
#endif
DPRINTK("ingress_change(sch %p,[qdisc %p],classid %x,parent %x),"
"arg 0x%lx\n", sch, p, classid, parent, *arg);
DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment");
return 0;
}
 
 
 
static void ingress_walk(struct Qdisc *sch,struct qdisc_walker *walker)
{
#ifdef DEBUG_INGRESS
struct ingress_qdisc_data *p = PRIV(sch);
#endif
DPRINTK("ingress_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment");
}
 
 
static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch,unsigned long cl)
{
struct ingress_qdisc_data *p = PRIV(sch);
 
return &p->filter_list;
}
 
 
/* --------------------------- Qdisc operations ---------------------------- */
 
 
static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch)
{
struct ingress_qdisc_data *p = PRIV(sch);
struct tcf_result res;
int result;
 
D2PRINTK("ingress_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
result = tc_classify(skb, p->filter_list, &res);
D2PRINTK("result %d class 0x%04x\n", result, res.classid);
/*
* Unlike normal "enqueue" functions, ingress_enqueue returns a
* firewall FW_* code.
*/
#ifdef CONFIG_NET_CLS_POLICE
switch (result) {
case TC_POLICE_SHOT:
result = NF_DROP;
sch->stats.drops++;
break;
case TC_POLICE_RECLASSIFY: /* DSCP remarking here ? */
case TC_POLICE_OK:
case TC_POLICE_UNSPEC:
default:
sch->stats.packets++;
sch->stats.bytes += skb->len;
result = NF_ACCEPT;
break;
};
#else
sch->stats.packets++;
sch->stats.bytes += skb->len;
#endif
 
skb->tc_index = TC_H_MIN(res.classid);
return result;
}
 
 
static struct sk_buff *ingress_dequeue(struct Qdisc *sch)
{
/*
struct ingress_qdisc_data *p = PRIV(sch);
D2PRINTK("ingress_dequeue(sch %p,[qdisc %p])\n",sch,PRIV(p));
*/
return NULL;
}
 
 
static int ingress_requeue(struct sk_buff *skb,struct Qdisc *sch)
{
/*
struct ingress_qdisc_data *p = PRIV(sch);
D2PRINTK("ingress_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,PRIV(p));
*/
return 0;
}
 
static unsigned int ingress_drop(struct Qdisc *sch)
{
#ifdef DEBUG_INGRESS
struct ingress_qdisc_data *p = PRIV(sch);
#endif
DPRINTK("ingress_drop(sch %p,[qdisc %p])\n", sch, p);
return 0;
}
 
static unsigned int
ing_hook(unsigned int hook, struct sk_buff **pskb,
const struct net_device *indev,
const struct net_device *outdev,
int (*okfn)(struct sk_buff *))
{
struct Qdisc *q;
struct sk_buff *skb = *pskb;
struct net_device *dev = skb->dev;
int fwres=NF_ACCEPT;
 
DPRINTK("ing_hook: skb %s dev=%s len=%u\n",
skb->sk ? "(owned)" : "(unowned)",
skb->dev ? (*pskb)->dev->name : "(no dev)",
skb->len);
 
/*
revisit later: use a private lock, since dev->queue_lock is also
used on the egress path (might slow things down by an iota)
*/
 
if (dev->qdisc_ingress) {
spin_lock(&dev->queue_lock);
if ((q = dev->qdisc_ingress) != NULL)
fwres = q->enqueue(skb, q);
spin_unlock(&dev->queue_lock);
}
return fwres;
}
 
/* after ipt_filter */
static struct nf_hook_ops ing_ops =
{
{ NULL, NULL},
ing_hook,
PF_INET,
NF_IP_PRE_ROUTING,
NF_IP_PRI_FILTER + 1
};
 
int ingress_init(struct Qdisc *sch,struct rtattr *opt)
{
struct ingress_qdisc_data *p = PRIV(sch);
 
if (!nf_registered) {
if (nf_register_hook(&ing_ops) < 0) {
printk("ingress qdisc registration error \n");
goto error;
}
nf_registered++;
}
 
DPRINTK("ingress_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt);
memset(p, 0, sizeof(*p));
p->filter_list = NULL;
p->q = &noop_qdisc;
MOD_INC_USE_COUNT;
return 0;
error:
return -EINVAL;
}
 
 
static void ingress_reset(struct Qdisc *sch)
{
struct ingress_qdisc_data *p = PRIV(sch);
 
DPRINTK("ingress_reset(sch %p,[qdisc %p])\n", sch, p);
 
/*
#if 0
*/
/* for future use */
qdisc_reset(p->q);
/*
#endif
*/
}
 
/* ------------------------------------------------------------- */
 
 
/* ------------------------------------------------------------- */
 
static void ingress_destroy(struct Qdisc *sch)
{
struct ingress_qdisc_data *p = PRIV(sch);
struct tcf_proto *tp;
 
DPRINTK("ingress_destroy(sch %p,[qdisc %p])\n", sch, p);
while (p->filter_list) {
tp = p->filter_list;
p->filter_list = tp->next;
tcf_destroy(tp);
}
memset(p, 0, sizeof(*p));
p->filter_list = NULL;
 
#if 0
/* for future use */
qdisc_destroy(p->q);
#endif
MOD_DEC_USE_COUNT;
 
}
 
 
static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
{
unsigned char *b = skb->tail;
struct rtattr *rta;
 
rta = (struct rtattr *) b;
RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
rta->rta_len = skb->tail - b;
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
static struct Qdisc_class_ops ingress_class_ops =
{
ingress_graft, /* graft */
ingress_leaf, /* leaf */
ingress_get, /* get */
ingress_put, /* put */
ingress_change, /* change */
NULL, /* delete */
ingress_walk, /* walk */
 
ingress_find_tcf, /* tcf_chain */
ingress_bind_filter, /* bind_tcf */
ingress_put, /* unbind_tcf */
 
NULL, /* dump */
};
 
struct Qdisc_ops ingress_qdisc_ops =
{
NULL, /* next */
&ingress_class_ops, /* cl_ops */
"ingress",
sizeof(struct ingress_qdisc_data),
 
ingress_enqueue, /* enqueue */
ingress_dequeue, /* dequeue */
ingress_requeue, /* requeue */
ingress_drop, /* drop */
 
ingress_init, /* init */
ingress_reset, /* reset */
ingress_destroy, /* destroy */
NULL, /* change */
 
ingress_dump, /* dump */
};
 
 
#ifdef MODULE
int init_module(void)
{
int ret = 0;
 
if ((ret = register_qdisc(&ingress_qdisc_ops)) < 0) {
printk("Unable to register Ingress qdisc\n");
return ret;
}
 
return ret;
}
 
 
void cleanup_module(void)
{
unregister_qdisc(&ingress_qdisc_ops);
if (nf_registered)
nf_unregister_hook(&ing_ops);
}
#endif
MODULE_LICENSE("GPL");
/sch_tbf.c
0,0 → 1,550
/*
* net/sched/sch_tbf.c Token Bucket Filter queue.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
* original idea by Martin Devera
*
*/
 
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
 
 
/* Simple Token Bucket Filter.
=======================================
 
SOURCE.
-------
 
None.
 
Description.
------------
 
A data flow obeys TBF with rate R and depth B, if for any
time interval t_i...t_f the number of transmitted bits
does not exceed B + R*(t_f-t_i).
 
Packetized version of this definition:
The sequence of packets of sizes s_i served at moments t_i
obeys TBF, if for any i<=k:
 
s_i+....+s_k <= B + R*(t_k - t_i)
 
Algorithm.
----------
 
Let N(t_i) be B/R initially and N(t) grow continuously with time as:
 
N(t+delta) = min{B/R, N(t) + delta}
 
If the first packet in queue has length S, it may be
transmitted only at the time t_* when S/R <= N(t_*),
and in this case N(t) jumps:
 
N(t_* + 0) = N(t_* - 0) - S/R.
 
 
 
Actually, QoS requires two TBFs to be applied to a data stream.
One of them controls the steady state burst size; the other
one, with rate P (peak rate) and depth M (equal to link MTU),
limits bursts at a smaller time scale.
 
It is easy to see that P>R, and B>M. If P is infinity, this double
TBF is equivalent to a single one.
 
When TBF works in reshaping mode, latency is estimated as:
 
lat = max ((L-B)/R, (L-M)/P)
 
 
NOTES.
------
 
If TBF throttles, it starts a watchdog timer, which will wake it up
when it is ready to transmit.
Note that the minimal timer resolution is 1/HZ.
If no new packets arrive during this period,
or if the device is not awakened by an EOI for some previous packet,
TBF can stop its activity for 1/HZ.
 
 
This means that with depth B, the maximal rate is
 
R_crit = B*HZ
 
F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
 
Note that the peak rate TBF is much tougher: with MTU 1500,
P_crit = 150 Kbytes/sec. So, if you need greater peak
rates, use an Alpha with HZ=1000 :-)
 
With classful TBF, limit is just kept for backwards compatibility.
It is passed to the default bfifo qdisc - if the inner qdisc is
changed, the limit is no longer effective.
*/
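 
/*
* Editorial sketch (not part of the original file): the N(t) bookkeeping
* from the description above, with tokens measured in seconds of
* transmission time, so a packet of size S conforms once N(t) >= S/R.
* Names are invented for illustration only.
*/
static inline int tbf_conforms_sketch(double *tokens, double delta,
				      double B, double R, double S)
{
	*tokens += delta;		/* N(t+delta) = N(t) + delta ...   */
	if (*tokens > B / R)
		*tokens = B / R;	/* ... capped at B/R               */
	if (*tokens < S / R)
		return 0;		/* not enough credit yet           */
	*tokens -= S / R;		/* N jumps down by S/R on transmit */
	return 1;
}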
 
struct tbf_sched_data
{
/* Parameters */
u32 limit; /* Maximal length of backlog: bytes */
u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */
u32 mtu;
u32 max_size;
struct qdisc_rate_table *R_tab;
struct qdisc_rate_table *P_tab;
 
/* Variables */
long tokens; /* Current number of B tokens */
long ptokens; /* Current number of P tokens */
psched_time_t t_c; /* Time check-point */
struct timer_list wd_timer; /* Watchdog timer */
struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */
};
 
#define L2T(q,L) ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log])
#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log])
 
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
int ret;
 
if (skb->len > q->max_size) {
sch->stats.drops++;
#ifdef CONFIG_NET_CLS_POLICE
if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch))
#endif
kfree_skb(skb);
 
return NET_XMIT_DROP;
}
 
if ((ret = q->qdisc->enqueue(skb, q->qdisc)) != 0) {
sch->stats.drops++;
return ret;
}
 
sch->q.qlen++;
sch->stats.bytes += skb->len;
sch->stats.packets++;
return 0;
}
 
static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
int ret;
 
if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0)
sch->q.qlen++;
 
return ret;
}
 
static unsigned int tbf_drop(struct Qdisc* sch)
{
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
unsigned int len;
 
if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
sch->q.qlen--;
sch->stats.drops++;
}
return len;
}
 
static void tbf_watchdog(unsigned long arg)
{
struct Qdisc *sch = (struct Qdisc*)arg;
 
sch->flags &= ~TCQ_F_THROTTLED;
netif_schedule(sch->dev);
}
 
static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
{
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
struct sk_buff *skb;
 
skb = q->qdisc->dequeue(q->qdisc);
 
if (skb) {
psched_time_t now;
long toks;
long ptoks = 0;
unsigned int len = skb->len;
 
PSCHED_GET_TIME(now);
 
toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer, 0);
 
if (q->P_tab) {
ptoks = toks + q->ptokens;
if (ptoks > (long)q->mtu)
ptoks = q->mtu;
ptoks -= L2T_P(q, len);
}
toks += q->tokens;
if (toks > (long)q->buffer)
toks = q->buffer;
toks -= L2T(q, len);
 
if ((toks|ptoks) >= 0) {
q->t_c = now;
q->tokens = toks;
q->ptokens = ptoks;
sch->q.qlen--;
sch->flags &= ~TCQ_F_THROTTLED;
return skb;
}
 
if (!netif_queue_stopped(sch->dev)) {
long delay = PSCHED_US2JIFFIE(max_t(long, -toks, -ptoks));
 
if (delay == 0)
delay = 1;
 
mod_timer(&q->wd_timer, jiffies+delay);
}
 
/* Maybe we have a shorter packet in the queue,
which can be sent now. It sounds cool,
but this is wrong in principle.
We MUST NOT reorder packets under these circumstances.
 
Really, if we split the flow into independent
subflows, it would be a very good solution.
This is the main idea of all FQ algorithms
(cf. CSZ, HPFQ, HFSC)
*/
 
if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
/* When requeue fails skb is dropped */
sch->q.qlen--;
sch->stats.drops++;
}
 
sch->flags |= TCQ_F_THROTTLED;
sch->stats.overlimits++;
}
return NULL;
}
 
static void tbf_reset(struct Qdisc* sch)
{
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
 
qdisc_reset(q->qdisc);
sch->q.qlen = 0;
PSCHED_GET_TIME(q->t_c);
q->tokens = q->buffer;
q->ptokens = q->mtu;
sch->flags &= ~TCQ_F_THROTTLED;
del_timer(&q->wd_timer);
}
 
static struct Qdisc *tbf_create_dflt_qdisc(struct net_device *dev, u32 limit)
{
struct Qdisc *q = qdisc_create_dflt(dev, &bfifo_qdisc_ops);
struct rtattr *rta;
int ret;
 
if (q) {
rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
if (rta) {
rta->rta_type = RTM_NEWQDISC;
rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;
 
ret = q->ops->change(q, rta);
kfree(rta);
 
if (ret == 0)
return q;
}
qdisc_destroy(q);
}
 
return NULL;
}
 
static int tbf_change(struct Qdisc* sch, struct rtattr *opt)
{
int err = -EINVAL;
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
struct rtattr *tb[TCA_TBF_PTAB];
struct tc_tbf_qopt *qopt;
struct qdisc_rate_table *rtab = NULL;
struct qdisc_rate_table *ptab = NULL;
struct Qdisc *child = NULL;
int max_size,n;
 
if (rtattr_parse(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
tb[TCA_TBF_PARMS-1] == NULL ||
RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
goto done;
 
qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
if (rtab == NULL)
goto done;
 
if (qopt->peakrate.rate) {
if (qopt->peakrate.rate > qopt->rate.rate)
ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]);
if (ptab == NULL)
goto done;
}
 
for (n = 0; n < 256; n++)
if (rtab->data[n] > qopt->buffer) break;
max_size = (n << qopt->rate.cell_log)-1;
if (ptab) {
int size;
 
for (n = 0; n < 256; n++)
if (ptab->data[n] > qopt->mtu) break;
size = (n << qopt->peakrate.cell_log)-1;
if (size < max_size) max_size = size;
}
if (max_size < 0)
goto done;
 
if (q->qdisc == &noop_qdisc) {
if ((child = tbf_create_dflt_qdisc(sch->dev, qopt->limit)) == NULL)
goto done;
}
 
sch_tree_lock(sch);
if (child) q->qdisc = child;
q->limit = qopt->limit;
q->mtu = qopt->mtu;
q->max_size = max_size;
q->buffer = qopt->buffer;
q->tokens = q->buffer;
q->ptokens = q->mtu;
rtab = xchg(&q->R_tab, rtab);
ptab = xchg(&q->P_tab, ptab);
sch_tree_unlock(sch);
err = 0;
done:
if (rtab)
qdisc_put_rtab(rtab);
if (ptab)
qdisc_put_rtab(ptab);
return err;
}
 
static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
{
int err;
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
 
if (opt == NULL)
return -EINVAL;
 
MOD_INC_USE_COUNT;
 
PSCHED_GET_TIME(q->t_c);
init_timer(&q->wd_timer);
q->wd_timer.function = tbf_watchdog;
q->wd_timer.data = (unsigned long)sch;
 
q->qdisc = &noop_qdisc;
 
if ((err = tbf_change(sch, opt)) != 0) {
MOD_DEC_USE_COUNT;
}
return err;
}
 
static void tbf_destroy(struct Qdisc *sch)
{
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
 
del_timer(&q->wd_timer);
 
if (q->P_tab)
qdisc_put_rtab(q->P_tab);
if (q->R_tab)
qdisc_put_rtab(q->R_tab);
 
qdisc_destroy(q->qdisc);
q->qdisc = &noop_qdisc;
 
MOD_DEC_USE_COUNT;
}
 
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
unsigned char *b = skb->tail;
struct rtattr *rta;
struct tc_tbf_qopt opt;
 
rta = (struct rtattr*)b;
RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
 
opt.limit = q->limit;
opt.rate = q->R_tab->rate;
if (q->P_tab)
opt.peakrate = q->P_tab->rate;
else
memset(&opt.peakrate, 0, sizeof(opt.peakrate));
opt.mtu = q->mtu;
opt.buffer = q->buffer;
RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
rta->rta_len = skb->tail - b;
 
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct tbf_sched_data *q = (struct tbf_sched_data*)sch->data;
 
if (cl != 1) /* only one class */
return -ENOENT;
 
tcm->tcm_handle |= TC_H_MIN(1);
tcm->tcm_info = q->qdisc->handle;
 
return 0;
}
 
static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct Qdisc **old)
{
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
 
if (new == NULL)
new = &noop_qdisc;
 
sch_tree_lock(sch);
*old = xchg(&q->qdisc, new);
qdisc_reset(*old);
sch->q.qlen = 0;
sch_tree_unlock(sch);
 
return 0;
}
 
static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
return q->qdisc;
}
 
static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
return 1;
}
 
static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}
 
static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
struct rtattr **tca, unsigned long *arg)
{
return -ENOSYS;
}
 
static int tbf_delete(struct Qdisc *sch, unsigned long arg)
{
return -ENOSYS;
}
 
static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
if (!walker->stop) {
if (walker->count >= walker->skip)
if (walker->fn(sch, 1, walker) < 0) {
walker->stop = 1;
return;
}
walker->count++;
}
}
 
static struct Qdisc_class_ops tbf_class_ops =
{
.graft = tbf_graft,
.leaf = tbf_leaf,
.get = tbf_get,
.put = tbf_put,
.change = tbf_change_class,
.delete = tbf_delete,
.walk = tbf_walk,
.dump = tbf_dump_class,
};
 
struct Qdisc_ops tbf_qdisc_ops =
{
NULL,
&tbf_class_ops,
"tbf",
sizeof(struct tbf_sched_data),
 
tbf_enqueue,
tbf_dequeue,
tbf_requeue,
tbf_drop,
 
tbf_init,
tbf_reset,
tbf_destroy,
tbf_change,
 
tbf_dump,
};
 
 
#ifdef MODULE
int init_module(void)
{
return register_qdisc(&tbf_qdisc_ops);
}
 
void cleanup_module(void)
{
unregister_qdisc(&tbf_qdisc_ops);
}
#endif
MODULE_LICENSE("GPL");
/sch_generic.c
0,0 → 1,543
/*
* net/sched/sch_generic.c Generic packet scheduler routines.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Jamal Hadi Salim, <hadi@cyberus.ca> 990601
* - Ingress support
*/
 
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/config.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
 
/* Main transmission queue. */
 
/* Main qdisc structure lock.
 
However, modifications
to data participating in scheduling must additionally be
protected with the dev->queue_lock spinlock.
 
The idea is the following:
- enqueue and dequeue are serialized via the top level device
spinlock dev->queue_lock.
- tree walking is protected by read_lock(qdisc_tree_lock)
and this lock is used only in process context.
- updates to the tree are made only under the rtnl semaphore,
hence taking this lock may be done without disabling local BH.
 
qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
*/
rwlock_t qdisc_tree_lock = RW_LOCK_UNLOCKED;
 
/*
dev->queue_lock serializes queue accesses for this device
AND dev->qdisc pointer itself.
 
dev->xmit_lock serializes accesses to device driver.
 
dev->queue_lock and dev->xmit_lock are mutually exclusive,
if one is grabbed, another must be free.
*/
 
 
/* Kick device.
Note that this procedure can be called by a watchdog timer, so
we do not check the dev->tbusy flag here.
 
Returns: 0 - queue is empty.
>0 - queue is not empty, but throttled.
<0 - queue is not empty. Device is throttled, if dev->tbusy != 0.
 
NOTE: Called under dev->queue_lock with locally disabled BH.
*/
 
int qdisc_restart(struct net_device *dev)
{
struct Qdisc *q = dev->qdisc;
struct sk_buff *skb;
 
/* Dequeue packet */
if ((skb = q->dequeue(q)) != NULL) {
if (spin_trylock(&dev->xmit_lock)) {
/* Remember that the driver is grabbed by us. */
dev->xmit_lock_owner = smp_processor_id();
 
/* And release queue */
spin_unlock(&dev->queue_lock);
 
if (!netif_queue_stopped(dev)) {
if (netdev_nit)
dev_queue_xmit_nit(skb, dev);
 
if (dev->hard_start_xmit(skb, dev) == 0) {
dev->xmit_lock_owner = -1;
spin_unlock(&dev->xmit_lock);
 
spin_lock(&dev->queue_lock);
return -1;
}
}
 
/* Release the driver */
dev->xmit_lock_owner = -1;
spin_unlock(&dev->xmit_lock);
spin_lock(&dev->queue_lock);
q = dev->qdisc;
} else {
/* So, someone grabbed the driver. */
 
/* It may be a transient configuration error,
when hard_start_xmit() recurses. We detect
it by checking the xmit owner and drop the
packet when a dead loop is detected.
*/
if (dev->xmit_lock_owner == smp_processor_id()) {
kfree_skb(skb);
if (net_ratelimit())
printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
return -1;
}
netdev_rx_stat[smp_processor_id()].cpu_collision++;
}
 
/* Device kicked us out :(
This is possible in the following cases:
 
0. driver is locked
1. fastroute is enabled
2. device cannot determine busy state
before start of transmission (f.e. dialout)
3. device is buggy (ppp)
*/
 
q->ops->requeue(skb, q);
netif_schedule(dev);
return 1;
}
return q->q.qlen;
}
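/* Illustrative sketch, not part of the original file (the helper name is
   hypothetical; the real caller lives in include/net/pkt_sched.h): how a
   caller drives qdisc_restart() while holding dev->queue_lock with BHs
   disabled. Keep calling while the return value is negative (a packet was
   processed and more may be pending); stop on >= 0 (queue empty,
   throttled, or the packet was requeued). */
static inline void qdisc_restart_loop_example(struct net_device *dev)
{
	while (!netif_queue_stopped(dev) && qdisc_restart(dev) < 0)
		/* NOTHING: keep feeding packets to the driver */;
}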
 
static void dev_watchdog(unsigned long arg)
{
struct net_device *dev = (struct net_device *)arg;
 
spin_lock(&dev->xmit_lock);
if (dev->qdisc != &noop_qdisc) {
if (netif_device_present(dev) &&
netif_running(dev) &&
netif_carrier_ok(dev)) {
if (netif_queue_stopped(dev) &&
(jiffies - dev->trans_start) > dev->watchdog_timeo) {
printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n", dev->name);
dev->tx_timeout(dev);
}
if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
dev_hold(dev);
}
}
spin_unlock(&dev->xmit_lock);
 
dev_put(dev);
}
 
static void dev_watchdog_init(struct net_device *dev)
{
init_timer(&dev->watchdog_timer);
dev->watchdog_timer.data = (unsigned long)dev;
dev->watchdog_timer.function = dev_watchdog;
}
 
void __netdev_watchdog_up(struct net_device *dev)
{
if (dev->tx_timeout) {
if (dev->watchdog_timeo <= 0)
dev->watchdog_timeo = 5*HZ;
if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
dev_hold(dev);
}
}
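/* Illustrative sketch, not part of the original file (the helper name is
   hypothetical): how a driver opts in to the watchdog machinery above.
   Any tx_timeout handler enables it; a watchdog_timeo left at 0 is bumped
   to 5*HZ by __netdev_watchdog_up(). */
static inline void example_watchdog_setup(struct net_device *dev,
					  void (*tx_timeout)(struct net_device *))
{
	dev->tx_timeout = tx_timeout;	/* handler run by dev_watchdog() */
	dev->watchdog_timeo = 2*HZ;	/* how long a stopped queue may stall */
}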
 
static void dev_watchdog_up(struct net_device *dev)
{
spin_lock_bh(&dev->xmit_lock);
__netdev_watchdog_up(dev);
spin_unlock_bh(&dev->xmit_lock);
}
 
static void dev_watchdog_down(struct net_device *dev)
{
spin_lock_bh(&dev->xmit_lock);
if (del_timer(&dev->watchdog_timer))
__dev_put(dev);
spin_unlock_bh(&dev->xmit_lock);
}
 
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
under all circumstances. It is difficult to invent anything faster or
cheaper.
*/
 
static int
noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
kfree_skb(skb);
return NET_XMIT_CN;
}
 
static struct sk_buff *
noop_dequeue(struct Qdisc * qdisc)
{
return NULL;
}
 
static int
noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
if (net_ratelimit())
printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name);
kfree_skb(skb);
return NET_XMIT_CN;
}
 
struct Qdisc_ops noop_qdisc_ops =
{
NULL,
NULL,
"noop",
0,
 
noop_enqueue,
noop_dequeue,
noop_requeue,
};
 
struct Qdisc noop_qdisc =
{
noop_enqueue,
noop_dequeue,
TCQ_F_BUILTIN,
&noop_qdisc_ops,
};
 
 
struct Qdisc_ops noqueue_qdisc_ops =
{
NULL,
NULL,
"noqueue",
0,
 
noop_enqueue,
noop_dequeue,
noop_requeue,
 
};
 
struct Qdisc noqueue_qdisc =
{
NULL,
noop_dequeue,
TCQ_F_BUILTIN,
&noqueue_qdisc_ops,
};
 
 
static const u8 prio2band[TC_PRIO_MAX+1] =
{ 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
 
/* 3-band FIFO queue: old style, but should be a bit faster than
generic prio+fifo combination.
*/
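/* Illustrative sketch, not part of the original file (the helper name is
   hypothetical): how a band is picked. With the prio2band table above,
   e.g. TC_PRIO_INTERACTIVE maps to band 0, TC_PRIO_BESTEFFORT to band 1
   and TC_PRIO_BULK to band 2; band 0 is served first by
   pfifo_fast_dequeue() below. */
static inline int pfifo_fast_band_example(struct sk_buff *skb)
{
	return prio2band[skb->priority & TC_PRIO_MAX];
}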
 
static int
pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
struct sk_buff_head *list;
 
list = ((struct sk_buff_head*)qdisc->data) +
prio2band[skb->priority&TC_PRIO_MAX];
 
if (list->qlen < qdisc->dev->tx_queue_len) {
__skb_queue_tail(list, skb);
qdisc->q.qlen++;
qdisc->stats.bytes += skb->len;
qdisc->stats.packets++;
return 0;
}
qdisc->stats.drops++;
kfree_skb(skb);
return NET_XMIT_DROP;
}
 
static struct sk_buff *
pfifo_fast_dequeue(struct Qdisc* qdisc)
{
int prio;
struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);
struct sk_buff *skb;
 
for (prio = 0; prio < 3; prio++, list++) {
skb = __skb_dequeue(list);
if (skb) {
qdisc->q.qlen--;
return skb;
}
}
return NULL;
}
 
static int
pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
struct sk_buff_head *list;
 
list = ((struct sk_buff_head*)qdisc->data) +
prio2band[skb->priority&TC_PRIO_MAX];
 
__skb_queue_head(list, skb);
qdisc->q.qlen++;
return 0;
}
 
static void
pfifo_fast_reset(struct Qdisc* qdisc)
{
int prio;
struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);
 
for (prio=0; prio < 3; prio++)
skb_queue_purge(list+prio);
qdisc->q.qlen = 0;
}
 
static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
unsigned char *b = skb->tail;
struct tc_prio_qopt opt;
 
opt.bands = 3;
memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
int i;
struct sk_buff_head *list;
 
list = ((struct sk_buff_head*)qdisc->data);
 
for (i=0; i<3; i++)
skb_queue_head_init(list+i);
 
return 0;
}
 
static struct Qdisc_ops pfifo_fast_ops =
{
NULL,
NULL,
"pfifo_fast",
3 * sizeof(struct sk_buff_head),
 
pfifo_fast_enqueue,
pfifo_fast_dequeue,
pfifo_fast_requeue,
NULL,
 
pfifo_fast_init,
pfifo_fast_reset,
NULL,
NULL,
pfifo_fast_dump,
 
};
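/* Note added for readability (not in the original; assumes the struct
   Qdisc_ops layout from net/pkt_sched.h): the positional initializers
   above fill next, cl_ops, id, priv_size, enqueue, dequeue, requeue,
   drop, init, reset, destroy, change and dump in that order, so
   pfifo_fast provides no drop, destroy or change hook. */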
 
struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
{
struct Qdisc *sch;
int size = sizeof(*sch) + ops->priv_size;
 
sch = kmalloc(size, GFP_KERNEL);
if (!sch)
return NULL;
memset(sch, 0, size);
 
skb_queue_head_init(&sch->q);
sch->ops = ops;
sch->enqueue = ops->enqueue;
sch->dequeue = ops->dequeue;
sch->dev = dev;
sch->stats.lock = &dev->queue_lock;
atomic_set(&sch->refcnt, 1);
if (!ops->init || ops->init(sch, NULL) == 0)
return sch;
 
kfree(sch);
return NULL;
}
 
/* Under dev->queue_lock and BH! */
 
void qdisc_reset(struct Qdisc *qdisc)
{
struct Qdisc_ops *ops = qdisc->ops;
 
if (ops->reset)
ops->reset(qdisc);
}
 
/* Under dev->queue_lock and BH! */
 
void qdisc_destroy(struct Qdisc *qdisc)
{
struct Qdisc_ops *ops = qdisc->ops;
struct net_device *dev;
 
if (!atomic_dec_and_test(&qdisc->refcnt))
return;
 
dev = qdisc->dev;
 
if (dev) {
struct Qdisc *q, **qp;
for (qp = &qdisc->dev->qdisc_list; (q=*qp) != NULL; qp = &q->next) {
if (q == qdisc) {
*qp = q->next;
break;
}
}
}
#ifdef CONFIG_NET_ESTIMATOR
qdisc_kill_estimator(&qdisc->stats);
#endif
if (ops->reset)
ops->reset(qdisc);
if (ops->destroy)
ops->destroy(qdisc);
if (!(qdisc->flags&TCQ_F_BUILTIN))
kfree(qdisc);
}
 
 
void dev_activate(struct net_device *dev)
{
/* No queueing discipline is attached to the device;
create a default one, i.e. pfifo_fast for devices
that need queueing and noqueue_qdisc for
virtual interfaces.
*/
 
if (dev->qdisc_sleeping == &noop_qdisc) {
struct Qdisc *qdisc;
if (dev->tx_queue_len) {
qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
if (qdisc == NULL) {
printk(KERN_INFO "%s: activation failed\n", dev->name);
return;
}
 
write_lock(&qdisc_tree_lock);
qdisc->next = dev->qdisc_list;
dev->qdisc_list = qdisc;
write_unlock(&qdisc_tree_lock);
 
} else {
qdisc = &noqueue_qdisc;
}
write_lock(&qdisc_tree_lock);
dev->qdisc_sleeping = qdisc;
write_unlock(&qdisc_tree_lock);
}
 
spin_lock_bh(&dev->queue_lock);
if ((dev->qdisc = dev->qdisc_sleeping) != &noqueue_qdisc) {
dev->trans_start = jiffies;
dev_watchdog_up(dev);
}
spin_unlock_bh(&dev->queue_lock);
}
 
void dev_deactivate(struct net_device *dev)
{
struct Qdisc *qdisc;
 
spin_lock_bh(&dev->queue_lock);
qdisc = dev->qdisc;
dev->qdisc = &noop_qdisc;
 
qdisc_reset(qdisc);
 
spin_unlock_bh(&dev->queue_lock);
 
dev_watchdog_down(dev);
 
while (test_bit(__LINK_STATE_SCHED, &dev->state))
yield();
 
spin_unlock_wait(&dev->xmit_lock);
}
 
void dev_init_scheduler(struct net_device *dev)
{
write_lock(&qdisc_tree_lock);
spin_lock_bh(&dev->queue_lock);
dev->qdisc = &noop_qdisc;
spin_unlock_bh(&dev->queue_lock);
dev->qdisc_sleeping = &noop_qdisc;
dev->qdisc_list = NULL;
write_unlock(&qdisc_tree_lock);
 
dev_watchdog_init(dev);
}
 
void dev_shutdown(struct net_device *dev)
{
struct Qdisc *qdisc;
 
write_lock(&qdisc_tree_lock);
spin_lock_bh(&dev->queue_lock);
qdisc = dev->qdisc_sleeping;
dev->qdisc = &noop_qdisc;
dev->qdisc_sleeping = &noop_qdisc;
qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
if ((qdisc = dev->qdisc_ingress) != NULL) {
dev->qdisc_ingress = NULL;
qdisc_destroy(qdisc);
}
#endif
BUG_TRAP(dev->qdisc_list == NULL);
BUG_TRAP(!timer_pending(&dev->watchdog_timer));
dev->qdisc_list = NULL;
spin_unlock_bh(&dev->queue_lock);
write_unlock(&qdisc_tree_lock);
}
/cls_u32.c
0,0 → 1,718
/*
* net/sched/cls_u32.c Ugly (or Universal) 32bit key Packet Classifier.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* The filters are packed to hash tables of key nodes
* with a set of 32bit key/mask pairs at every node.
* Nodes reference next level hash tables etc.
*
* This scheme is the best universal classifier I managed to
* invent; it is not super-fast, but it is not slow (provided you
* program it correctly), and general enough. And its relative
* speed grows as the number of rules becomes larger.
*
* It seems that it represents the best middle point between
* speed and manageability both by human and by machine.
*
* It is especially useful for link sharing combined with QoS;
* pure RSVP doesn't need such a general approach and can use
* much simpler (and faster) schemes, sort of cls_rsvp.c.
*/
 
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <linux/rtnetlink.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
 
 
struct tc_u_knode
{
struct tc_u_knode *next;
u32 handle;
struct tc_u_hnode *ht_up;
#ifdef CONFIG_NET_CLS_POLICE
struct tcf_police *police;
#endif
struct tcf_result res;
struct tc_u_hnode *ht_down;
struct tc_u32_sel sel;
};
 
struct tc_u_hnode
{
struct tc_u_hnode *next;
u32 handle;
struct tc_u_common *tp_c;
int refcnt;
unsigned divisor;
u32 hgenerator;
struct tc_u_knode *ht[1];
};
 
struct tc_u_common
{
struct tc_u_common *next;
struct tc_u_hnode *hlist;
struct Qdisc *q;
int refcnt;
u32 hgenerator;
};
 
static struct tc_u_common *u32_list;
 
static __inline__ unsigned u32_hash_fold(u32 key, struct tc_u32_sel *sel)
{
unsigned h = key & sel->hmask;
 
h ^= h>>16;
h ^= h>>8;
return h;
}
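/* Illustrative sketch, not part of the original file (the helper name is
   hypothetical): the per-key test that u32_classify() below applies at
   every node, ignoring the variable offset (off2/offmask) handling. A key
   matches when the masked 32-bit word at the configured offset equals the
   masked value. */
static __inline__ int u32_key_match_example(const u8 *ptr,
					    const struct tc_u32_key *key)
{
	return ((*(const u32*)(ptr + key->off) ^ key->val) & key->mask) == 0;
}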
 
static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res)
{
struct {
struct tc_u_knode *knode;
u8 *ptr;
} stack[TC_U32_MAXDEPTH];
 
struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root;
u8 *ptr = skb->nh.raw;
struct tc_u_knode *n;
int sdepth = 0;
int off2 = 0;
int sel = 0;
int i;
 
next_ht:
n = ht->ht[sel];
 
next_knode:
if (n) {
struct tc_u32_key *key = n->sel.keys;
 
for (i = n->sel.nkeys; i>0; i--, key++) {
if ((*(u32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) {
n = n->next;
goto next_knode;
}
}
if (n->ht_down == NULL) {
check_terminal:
if (n->sel.flags&TC_U32_TERMINAL) {
*res = n->res;
#ifdef CONFIG_NET_CLS_POLICE
if (n->police) {
int pol_res = tcf_police(skb, n->police);
if (pol_res >= 0)
return pol_res;
} else
#endif
return 0;
}
n = n->next;
goto next_knode;
}
 
/* PUSH */
if (sdepth >= TC_U32_MAXDEPTH)
goto deadloop;
stack[sdepth].knode = n;
stack[sdepth].ptr = ptr;
sdepth++;
 
ht = n->ht_down;
sel = 0;
if (ht->divisor)
sel = ht->divisor&u32_hash_fold(*(u32*)(ptr+n->sel.hoff), &n->sel);
 
if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT)))
goto next_ht;
 
if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) {
off2 = n->sel.off + 3;
if (n->sel.flags&TC_U32_VAROFFSET)
off2 += ntohs(n->sel.offmask & *(u16*)(ptr+n->sel.offoff)) >>n->sel.offshift;
off2 &= ~3;
}
if (n->sel.flags&TC_U32_EAT) {
ptr += off2;
off2 = 0;
}
 
if (ptr < skb->tail)
goto next_ht;
}
 
/* POP */
if (sdepth--) {
n = stack[sdepth].knode;
ht = n->ht_up;
ptr = stack[sdepth].ptr;
goto check_terminal;
}
return -1;
 
deadloop:
if (net_ratelimit())
printk("cls_u32: dead loop\n");
return -1;
}
 
static __inline__ struct tc_u_hnode *
u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
{
struct tc_u_hnode *ht;
 
for (ht = tp_c->hlist; ht; ht = ht->next)
if (ht->handle == handle)
break;
 
return ht;
}
 
static __inline__ struct tc_u_knode *
u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
{
unsigned sel;
struct tc_u_knode *n;
 
sel = TC_U32_HASH(handle);
if (sel > ht->divisor)
return 0;
 
for (n = ht->ht[sel]; n; n = n->next)
if (n->handle == handle)
return n;
 
return NULL;
}
 
 
static unsigned long u32_get(struct tcf_proto *tp, u32 handle)
{
struct tc_u_hnode *ht;
struct tc_u_common *tp_c = tp->data;
 
if (TC_U32_HTID(handle) == TC_U32_ROOT)
ht = tp->root;
else
ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle));
 
if (!ht)
return 0;
 
if (TC_U32_KEY(handle) == 0)
return (unsigned long)ht;
 
return (unsigned long)u32_lookup_key(ht, handle);
}
 
static void u32_put(struct tcf_proto *tp, unsigned long f)
{
}
 
static u32 gen_new_htid(struct tc_u_common *tp_c)
{
int i = 0x800;
 
do {
if (++tp_c->hgenerator == 0x7FF)
tp_c->hgenerator = 1;
} while (--i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
 
return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0;
}
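/* Illustrative sketch, not part of the original file (the helper name is
   hypothetical): how a u32 handle decomposes, using the TC_U32_* macros
   this file already relies on. gen_new_htid() above returns handles with
   only the top 12 bits set (the 0x800 bit forced on) and bucket/node zero. */
static __inline__ void u32_handle_layout_example(u32 handle, u32 *htid,
						 unsigned *bucket, u32 *node)
{
	*htid = TC_U32_HTID(handle);	/* hash table id, top 12 bits    */
	*bucket = TC_U32_HASH(handle);	/* bucket within that hash table */
	*node = TC_U32_NODE(handle);	/* key node id, low 12 bits      */
}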
 
static int u32_init(struct tcf_proto *tp)
{
struct tc_u_hnode *root_ht;
struct tc_u_common *tp_c;
 
MOD_INC_USE_COUNT;
 
for (tp_c = u32_list; tp_c; tp_c = tp_c->next)
if (tp_c->q == tp->q)
break;
 
root_ht = kmalloc(sizeof(*root_ht), GFP_KERNEL);
if (root_ht == NULL) {
MOD_DEC_USE_COUNT;
return -ENOBUFS;
}
memset(root_ht, 0, sizeof(*root_ht));
root_ht->divisor = 0;
root_ht->refcnt++;
root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000;
 
if (tp_c == NULL) {
tp_c = kmalloc(sizeof(*tp_c), GFP_KERNEL);
if (tp_c == NULL) {
kfree(root_ht);
MOD_DEC_USE_COUNT;
return -ENOBUFS;
}
memset(tp_c, 0, sizeof(*tp_c));
tp_c->q = tp->q;
tp_c->next = u32_list;
u32_list = tp_c;
}
 
tp_c->refcnt++;
root_ht->next = tp_c->hlist;
tp_c->hlist = root_ht;
root_ht->tp_c = tp_c;
 
tp->root = root_ht;
tp->data = tp_c;
return 0;
}
 
static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n)
{
unsigned long cl;
 
if ((cl = __cls_set_class(&n->res.class, 0)) != 0)
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
#ifdef CONFIG_NET_CLS_POLICE
tcf_police_release(n->police);
#endif
if (n->ht_down)
n->ht_down->refcnt--;
kfree(n);
return 0;
}
 
static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key)
{
struct tc_u_knode **kp;
struct tc_u_hnode *ht = key->ht_up;
 
if (ht) {
for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) {
if (*kp == key) {
tcf_tree_lock(tp);
*kp = key->next;
tcf_tree_unlock(tp);
 
u32_destroy_key(tp, key);
return 0;
}
}
}
BUG_TRAP(0);
return 0;
}
 
static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
{
struct tc_u_knode *n;
unsigned h;
 
for (h=0; h<=ht->divisor; h++) {
while ((n = ht->ht[h]) != NULL) {
ht->ht[h] = n->next;
 
u32_destroy_key(tp, n);
}
}
}
 
static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
{
struct tc_u_common *tp_c = tp->data;
struct tc_u_hnode **hn;
 
BUG_TRAP(!ht->refcnt);
 
u32_clear_hnode(tp, ht);
 
for (hn = &tp_c->hlist; *hn; hn = &(*hn)->next) {
if (*hn == ht) {
*hn = ht->next;
kfree(ht);
return 0;
}
}
 
BUG_TRAP(0);
return -ENOENT;
}
 
static void u32_destroy(struct tcf_proto *tp)
{
struct tc_u_common *tp_c = tp->data;
struct tc_u_hnode *root_ht = xchg(&tp->root, NULL);
 
BUG_TRAP(root_ht != NULL);
 
if (root_ht && --root_ht->refcnt == 0)
u32_destroy_hnode(tp, root_ht);
 
if (--tp_c->refcnt == 0) {
struct tc_u_hnode *ht;
struct tc_u_common **tp_cp;
 
for (tp_cp = &u32_list; *tp_cp; tp_cp = &(*tp_cp)->next) {
if (*tp_cp == tp_c) {
*tp_cp = tp_c->next;
break;
}
}
 
for (ht=tp_c->hlist; ht; ht = ht->next)
u32_clear_hnode(tp, ht);
 
while ((ht = tp_c->hlist) != NULL) {
tp_c->hlist = ht->next;
 
BUG_TRAP(ht->refcnt == 0);
 
kfree(ht);
};
 
kfree(tp_c);
}
 
MOD_DEC_USE_COUNT;
tp->data = NULL;
}
 
static int u32_delete(struct tcf_proto *tp, unsigned long arg)
{
struct tc_u_hnode *ht = (struct tc_u_hnode*)arg;
 
if (ht == NULL)
return 0;
 
if (TC_U32_KEY(ht->handle))
return u32_delete_key(tp, (struct tc_u_knode*)ht);
 
if (tp->root == ht)
return -EINVAL;
 
if (--ht->refcnt == 0)
u32_destroy_hnode(tp, ht);
 
return 0;
}
 
static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
{
struct tc_u_knode *n;
unsigned i = 0x7FF;
 
for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next)
if (i < TC_U32_NODE(n->handle))
i = TC_U32_NODE(n->handle);
i++;
 
return handle|(i>0xFFF ? 0xFFF : i);
}
 
static int u32_set_parms(struct Qdisc *q, unsigned long base,
struct tc_u_hnode *ht,
struct tc_u_knode *n, struct rtattr **tb,
struct rtattr *est)
{
if (tb[TCA_U32_LINK-1]) {
u32 handle = *(u32*)RTA_DATA(tb[TCA_U32_LINK-1]);
struct tc_u_hnode *ht_down = NULL;
 
if (TC_U32_KEY(handle))
return -EINVAL;
 
if (handle) {
ht_down = u32_lookup_ht(ht->tp_c, handle);
 
if (ht_down == NULL)
return -EINVAL;
ht_down->refcnt++;
}
 
sch_tree_lock(q);
ht_down = xchg(&n->ht_down, ht_down);
sch_tree_unlock(q);
 
if (ht_down)
ht_down->refcnt--;
}
if (tb[TCA_U32_CLASSID-1]) {
unsigned long cl;
 
n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]);
sch_tree_lock(q);
cl = __cls_set_class(&n->res.class, q->ops->cl_ops->bind_tcf(q, base, n->res.classid));
sch_tree_unlock(q);
if (cl)
q->ops->cl_ops->unbind_tcf(q, cl);
}
#ifdef CONFIG_NET_CLS_POLICE
if (tb[TCA_U32_POLICE-1]) {
struct tcf_police *police = tcf_police_locate(tb[TCA_U32_POLICE-1], est);
 
sch_tree_lock(q);
police = xchg(&n->police, police);
sch_tree_unlock(q);
 
tcf_police_release(police);
}
#endif
return 0;
}
 
static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
struct rtattr **tca,
unsigned long *arg)
{
struct tc_u_common *tp_c = tp->data;
struct tc_u_hnode *ht;
struct tc_u_knode *n;
struct tc_u32_sel *s;
struct rtattr *opt = tca[TCA_OPTIONS-1];
struct rtattr *tb[TCA_U32_MAX];
u32 htid;
int err;
 
if (opt == NULL)
return handle ? -EINVAL : 0;
 
if (rtattr_parse(tb, TCA_U32_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0)
return -EINVAL;
 
if ((n = (struct tc_u_knode*)*arg) != NULL) {
if (TC_U32_KEY(n->handle) == 0)
return -EINVAL;
 
return u32_set_parms(tp->q, base, n->ht_up, n, tb, tca[TCA_RATE-1]);
}
 
if (tb[TCA_U32_DIVISOR-1]) {
unsigned divisor = *(unsigned*)RTA_DATA(tb[TCA_U32_DIVISOR-1]);
 
if (--divisor > 0x100)
return -EINVAL;
if (TC_U32_KEY(handle))
return -EINVAL;
if (handle == 0) {
handle = gen_new_htid(tp->data);
if (handle == 0)
return -ENOMEM;
}
ht = kmalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL);
if (ht == NULL)
return -ENOBUFS;
memset(ht, 0, sizeof(*ht) + divisor*sizeof(void*));
ht->tp_c = tp_c;
ht->refcnt = 0;
ht->divisor = divisor;
ht->handle = handle;
ht->next = tp_c->hlist;
tp_c->hlist = ht;
*arg = (unsigned long)ht;
return 0;
}
 
if (tb[TCA_U32_HASH-1]) {
htid = *(unsigned*)RTA_DATA(tb[TCA_U32_HASH-1]);
if (TC_U32_HTID(htid) == TC_U32_ROOT) {
ht = tp->root;
htid = ht->handle;
} else {
ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid));
if (ht == NULL)
return -EINVAL;
}
} else {
ht = tp->root;
htid = ht->handle;
}
 
if (ht->divisor < TC_U32_HASH(htid))
return -EINVAL;
 
if (handle) {
if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
return -EINVAL;
handle = htid | TC_U32_NODE(handle);
} else
handle = gen_new_kid(ht, htid);
 
if (tb[TCA_U32_SEL-1] == 0 ||
RTA_PAYLOAD(tb[TCA_U32_SEL-1]) < sizeof(struct tc_u32_sel))
return -EINVAL;
 
s = RTA_DATA(tb[TCA_U32_SEL-1]);
n = kmalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL);
if (n == NULL)
return -ENOBUFS;
memset(n, 0, sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key));
memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
n->ht_up = ht;
n->handle = handle;
err = u32_set_parms(tp->q, base, ht, n, tb, tca[TCA_RATE-1]);
if (err == 0) {
struct tc_u_knode **ins;
for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next)
if (TC_U32_NODE(handle) < TC_U32_NODE((*ins)->handle))
break;
 
n->next = *ins;
wmb();
*ins = n;
 
*arg = (unsigned long)n;
return 0;
}
kfree(n);
return err;
}
 
static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct tc_u_common *tp_c = tp->data;
struct tc_u_hnode *ht;
struct tc_u_knode *n;
unsigned h;
 
if (arg->stop)
return;
 
for (ht = tp_c->hlist; ht; ht = ht->next) {
if (arg->count >= arg->skip) {
if (arg->fn(tp, (unsigned long)ht, arg) < 0) {
arg->stop = 1;
return;
}
}
arg->count++;
for (h = 0; h <= ht->divisor; h++) {
for (n = ht->ht[h]; n; n = n->next) {
if (arg->count < arg->skip) {
arg->count++;
continue;
}
if (arg->fn(tp, (unsigned long)n, arg) < 0) {
arg->stop = 1;
return;
}
arg->count++;
}
}
}
}
 
static int u32_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct tc_u_knode *n = (struct tc_u_knode*)fh;
unsigned char *b = skb->tail;
struct rtattr *rta;
 
if (n == NULL)
return skb->len;
 
t->tcm_handle = n->handle;
 
rta = (struct rtattr*)b;
RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
 
if (TC_U32_KEY(n->handle) == 0) {
struct tc_u_hnode *ht = (struct tc_u_hnode*)fh;
u32 divisor = ht->divisor+1;
RTA_PUT(skb, TCA_U32_DIVISOR, 4, &divisor);
} else {
RTA_PUT(skb, TCA_U32_SEL,
sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
&n->sel);
if (n->ht_up) {
u32 htid = n->handle & 0xFFFFF000;
RTA_PUT(skb, TCA_U32_HASH, 4, &htid);
}
if (n->res.classid)
RTA_PUT(skb, TCA_U32_CLASSID, 4, &n->res.classid);
if (n->ht_down)
RTA_PUT(skb, TCA_U32_LINK, 4, &n->ht_down->handle);
#ifdef CONFIG_NET_CLS_POLICE
if (n->police) {
struct rtattr * p_rta = (struct rtattr*)skb->tail;
 
RTA_PUT(skb, TCA_U32_POLICE, 0, NULL);
 
if (tcf_police_dump(skb, n->police) < 0)
goto rtattr_failure;
 
p_rta->rta_len = skb->tail - (u8*)p_rta;
}
#endif
}
 
rta->rta_len = skb->tail - b;
#ifdef CONFIG_NET_CLS_POLICE
if (TC_U32_KEY(n->handle) && n->police) {
if (qdisc_copy_stats(skb, &n->police->stats))
goto rtattr_failure;
}
#endif
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
struct tcf_proto_ops cls_u32_ops = {
NULL,
"u32",
u32_classify,
u32_init,
u32_destroy,
 
u32_get,
u32_put,
u32_change,
u32_delete,
u32_walk,
u32_dump
};
 
#ifdef MODULE
int init_module(void)
{
return register_tcf_proto_ops(&cls_u32_ops);
}
 
void cleanup_module(void)
{
unregister_tcf_proto_ops(&cls_u32_ops);
}
#endif
MODULE_LICENSE("GPL");
/cls_api.c
0,0 → 1,468
/*
* net/sched/cls_api.c Packet classifier API.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
*
* Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
*/
 
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/config.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
 
/* The list of all installed classifier types */
 
static struct tcf_proto_ops *tcf_proto_base;
 
/* Protects the list of registered TC modules. It is a pure SMP lock. */
static rwlock_t cls_mod_lock = RW_LOCK_UNLOCKED;
 
/* Find classifier type by string name */
 
struct tcf_proto_ops * tcf_proto_lookup_ops(struct rtattr *kind)
{
struct tcf_proto_ops *t = NULL;
 
if (kind) {
read_lock(&cls_mod_lock);
for (t = tcf_proto_base; t; t = t->next) {
if (rtattr_strcmp(kind, t->kind) == 0)
break;
}
read_unlock(&cls_mod_lock);
}
return t;
}
 
/* Register(unregister) new classifier type */
 
int register_tcf_proto_ops(struct tcf_proto_ops *ops)
{
struct tcf_proto_ops *t, **tp;
 
write_lock(&cls_mod_lock);
for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) {
if (strcmp(ops->kind, t->kind) == 0) {
write_unlock(&cls_mod_lock);
return -EEXIST;
}
}
 
ops->next = NULL;
*tp = ops;
write_unlock(&cls_mod_lock);
return 0;
}
 
int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
{
struct tcf_proto_ops *t, **tp;
 
write_lock(&cls_mod_lock);
for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next)
if (t == ops)
break;
 
if (!t) {
write_unlock(&cls_mod_lock);
return -ENOENT;
}
*tp = t->next;
write_unlock(&cls_mod_lock);
return 0;
}
 
static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n,
struct tcf_proto *tp, unsigned long fh, int event);
 
 
/* Select a new prio value from the range managed by the kernel. */
 
static __inline__ u32 tcf_auto_prio(struct tcf_proto *tp)
{
u32 first = TC_H_MAKE(0xC0000000U,0U);
 
if (tp)
first = tp->prio-1;
 
return first;
}
 
/* Add/change/delete/get a filter node */
 
static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
struct rtattr **tca = arg;
struct tcmsg *t = NLMSG_DATA(n);
u32 protocol = TC_H_MIN(t->tcm_info);
u32 prio = TC_H_MAJ(t->tcm_info);
u32 nprio = prio;
u32 parent = t->tcm_parent;
struct net_device *dev;
struct Qdisc *q;
struct tcf_proto **back, **chain;
struct tcf_proto *tp = NULL;
struct tcf_proto_ops *tp_ops;
struct Qdisc_class_ops *cops;
unsigned long cl = 0;
unsigned long fh;
int err;
 
if (prio == 0) {
/* If no priority is given, the user wants us to allocate it. */
if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
return -ENOENT;
prio = TC_H_MAKE(0x80000000U,0U);
}
 
/* Find head of filter chain. */
 
/* Find link */
if ((dev = __dev_get_by_index(t->tcm_ifindex)) == NULL)
return -ENODEV;
 
/* Find qdisc */
if (!parent) {
q = dev->qdisc_sleeping;
parent = q->handle;
} else if ((q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent))) == NULL)
return -EINVAL;
 
/* Is it classful? */
if ((cops = q->ops->cl_ops) == NULL)
return -EINVAL;
 
/* Are we searching for a filter attached to a class? */
if (TC_H_MIN(parent)) {
cl = cops->get(q, parent);
if (cl == 0)
return -ENOENT;
}
 
/* And the last stroke */
chain = cops->tcf_chain(q, cl);
err = -EINVAL;
if (chain == NULL)
goto errout;
 
/* Check the chain for existence of proto-tcf with this priority */
for (back = chain; (tp=*back) != NULL; back = &tp->next) {
if (tp->prio >= prio) {
if (tp->prio == prio) {
if (!nprio || (tp->protocol != protocol && protocol))
goto errout;
} else
tp = NULL;
break;
}
}
 
if (tp == NULL) {
/* Proto-tcf does not exist, create new one */
 
if (tca[TCA_KIND-1] == NULL || !protocol)
goto errout;
 
err = -ENOENT;
if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
goto errout;
 
 
/* Create new proto tcf */
 
err = -ENOBUFS;
if ((tp = kmalloc(sizeof(*tp), GFP_KERNEL)) == NULL)
goto errout;
tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND-1]);
#ifdef CONFIG_KMOD
if (tp_ops==NULL && tca[TCA_KIND-1] != NULL) {
struct rtattr *kind = tca[TCA_KIND-1];
char module_name[4 + IFNAMSIZ + 1];
 
if (RTA_PAYLOAD(kind) <= IFNAMSIZ) {
sprintf(module_name, "cls_%s", (char*)RTA_DATA(kind));
request_module (module_name);
tp_ops = tcf_proto_lookup_ops(kind);
}
}
#endif
if (tp_ops == NULL) {
err = -EINVAL;
kfree(tp);
goto errout;
}
memset(tp, 0, sizeof(*tp));
tp->ops = tp_ops;
tp->protocol = protocol;
tp->prio = nprio ? : tcf_auto_prio(*back);
tp->q = q;
tp->classify = tp_ops->classify;
tp->classid = parent;
err = tp_ops->init(tp);
if (err) {
kfree(tp);
goto errout;
}
write_lock(&qdisc_tree_lock);
spin_lock_bh(&dev->queue_lock);
tp->next = *back;
*back = tp;
spin_unlock_bh(&dev->queue_lock);
write_unlock(&qdisc_tree_lock);
} else if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], tp->ops->kind))
goto errout;
 
fh = tp->ops->get(tp, t->tcm_handle);
 
if (fh == 0) {
if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
write_lock(&qdisc_tree_lock);
spin_lock_bh(&dev->queue_lock);
*back = tp->next;
spin_unlock_bh(&dev->queue_lock);
write_unlock(&qdisc_tree_lock);
tcf_destroy(tp);
err = 0;
goto errout;
}
 
err = -ENOENT;
if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
goto errout;
} else {
switch (n->nlmsg_type) {
case RTM_NEWTFILTER:
err = -EEXIST;
if (n->nlmsg_flags&NLM_F_EXCL)
goto errout;
break;
case RTM_DELTFILTER:
err = tp->ops->delete(tp, fh);
goto errout;
case RTM_GETTFILTER:
err = tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER);
goto errout;
default:
err = -EINVAL;
goto errout;
}
}
 
err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh);
if (err == 0)
tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER);
 
errout:
if (cl)
cops->put(q, cl);
return err;
}
 
static int
tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh,
u32 pid, u32 seq, unsigned flags, int event)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
unsigned char *b = skb->tail;
 
nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
nlh->nlmsg_flags = flags;
tcm = NLMSG_DATA(nlh);
tcm->tcm_family = AF_UNSPEC;
tcm->tcm_ifindex = tp->q->dev->ifindex;
tcm->tcm_parent = tp->classid;
tcm->tcm_handle = 0;
tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
RTA_PUT(skb, TCA_KIND, IFNAMSIZ, tp->ops->kind);
if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0)
goto rtattr_failure;
nlh->nlmsg_len = skb->tail - b;
return skb->len;
 
nlmsg_failure:
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n,
struct tcf_proto *tp, unsigned long fh, int event)
{
struct sk_buff *skb;
u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
 
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
 
if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) {
kfree_skb(skb);
return -EINVAL;
}
 
return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}
 
struct tcf_dump_args
{
struct tcf_walker w;
struct sk_buff *skb;
struct netlink_callback *cb;
};
 
static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct tcf_walker *arg)
{
struct tcf_dump_args *a = (void*)arg;
 
return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid,
a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER);
}
 
static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
{
int t;
int s_t;
struct net_device *dev;
struct Qdisc *q;
struct tcf_proto *tp, **chain;
struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
unsigned long cl = 0;
struct Qdisc_class_ops *cops;
struct tcf_dump_args arg;
 
if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
return skb->len;
if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
return skb->len;
 
read_lock(&qdisc_tree_lock);
if (!tcm->tcm_parent)
q = dev->qdisc_sleeping;
else
q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
if (q == NULL) {
read_unlock(&qdisc_tree_lock);
dev_put(dev);
return skb->len;
}
if ((cops = q->ops->cl_ops) == NULL)
goto errout;
if (TC_H_MIN(tcm->tcm_parent)) {
cl = cops->get(q, tcm->tcm_parent);
if (cl == 0)
goto errout;
}
chain = cops->tcf_chain(q, cl);
if (chain == NULL)
goto errout;
 
s_t = cb->args[0];
 
for (tp=*chain, t=0; tp; tp = tp->next, t++) {
if (t < s_t) continue;
if (TC_H_MAJ(tcm->tcm_info) &&
TC_H_MAJ(tcm->tcm_info) != tp->prio)
continue;
if (TC_H_MIN(tcm->tcm_info) &&
TC_H_MIN(tcm->tcm_info) != tp->protocol)
continue;
if (t > s_t)
memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
if (cb->args[1] == 0) {
if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) {
break;
}
cb->args[1] = 1;
}
if (tp->ops->walk == NULL)
continue;
arg.w.fn = tcf_node_dump;
arg.skb = skb;
arg.cb = cb;
arg.w.stop = 0;
arg.w.skip = cb->args[1]-1;
arg.w.count = 0;
tp->ops->walk(tp, &arg.w);
cb->args[1] = arg.w.count+1;
if (arg.w.stop)
break;
}
 
cb->args[0] = t;
 
errout:
if (cl)
cops->put(q, cl);
 
read_unlock(&qdisc_tree_lock);
dev_put(dev);
return skb->len;
}
 
 
int __init tc_filter_init(void)
{
struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC];
 
/* Set up rtnetlink links. This is done here to avoid
exporting a large number of public symbols.
*/
 
if (link_p) {
link_p[RTM_NEWTFILTER-RTM_BASE].doit = tc_ctl_tfilter;
link_p[RTM_DELTFILTER-RTM_BASE].doit = tc_ctl_tfilter;
link_p[RTM_GETTFILTER-RTM_BASE].doit = tc_ctl_tfilter;
link_p[RTM_GETTFILTER-RTM_BASE].dumpit = tc_dump_tfilter;
}
#define INIT_TC_FILTER(name) { \
extern struct tcf_proto_ops cls_##name##_ops; \
register_tcf_proto_ops(&cls_##name##_ops); \
}
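/* Note added for readability (not in the original): each use below expands
   in place, e.g. INIT_TC_FILTER(u32) becomes
   { extern struct tcf_proto_ops cls_u32_ops;
   register_tcf_proto_ops(&cls_u32_ops); }
   so the classifier objects only have to be visible at link time. */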
 
#ifdef CONFIG_NET_CLS_U32
INIT_TC_FILTER(u32);
#endif
#ifdef CONFIG_NET_CLS_ROUTE4
INIT_TC_FILTER(route4);
#endif
#ifdef CONFIG_NET_CLS_FW
INIT_TC_FILTER(fw);
#endif
#ifdef CONFIG_NET_CLS_RSVP
INIT_TC_FILTER(rsvp);
#endif
#ifdef CONFIG_NET_CLS_TCINDEX
INIT_TC_FILTER(tcindex);
#endif
#ifdef CONFIG_NET_CLS_RSVP6
INIT_TC_FILTER(rsvp6);
#endif
return 0;
}
/sch_htb.c
0,0 → 1,1698
/* vim: ts=8 sw=8
* net/sched/sch_htb.c Hierarchical token bucket, feed tree version
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Martin Devera, <devik@cdi.cz>
*
* Credits (in time order) for older HTB versions:
* Stef Coene <stef.coene@docum.org>
* HTB support at LARTC mailing list
* Ondrej Kraus, <krauso@barr.cz>
* found missing INIT_QDISC(htb)
* Vladimir Smelhaus, Aamer Akhter, Bert Hubert
* helped a lot to locate nasty class stall bug
* Andi Kleen, Jamal Hadi, Bert Hubert
* code review and helpful comments on shaping
* Tomasz Wrona, <tw@eter.tym.pl>
* created test case so that I was able to fix nasty bug
* Wilfried Weissmann
* spotted bug in dequeue code and helped with fix
* Jiri Fojtasek
* fixed requeue routine
* and many others. thanks.
*
* $Id: sch_htb.c,v 1.1.1.1 2004-04-15 01:16:18 phoenix Exp $
*/
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/version.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/compiler.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <linux/rbtree.h>
 
/* HTB algorithm.
Author: devik@cdi.cz
========================================================================
HTB is like TBF with multiple classes. It is also similar to CBQ because
it allows to assign priority to each class in hierarchy.
In fact it is another implementation of Floyd's formal sharing.
 
Levels:
Each class is assigned a level. A leaf ALWAYS has level 0 and root
classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
one less than their parent.
*/
 
#define HTB_HSIZE 16 /* classid hash size */
#define HTB_EWMAC 2 /* rate average over HTB_EWMAC*HTB_HSIZE sec */
#define HTB_DEBUG 1 /* compile debugging support (activated by tc tool) */
#define HTB_RATECM 1 /* whether to use rate computer */
#define HTB_HYSTERESIS 1 /* whether to use mode hysteresis for speedup */
#define HTB_QLOCK(S) spin_lock_bh(&(S)->dev->queue_lock)
#define HTB_QUNLOCK(S) spin_unlock_bh(&(S)->dev->queue_lock)
#define HTB_VER 0x30010 /* major must be matched with the number supplied by TC as version */
 
#if HTB_VER >> 16 != TC_HTB_PROTOVER
#error "Mismatched sch_htb.c and pkt_sch.h"
#endif
 
/* debugging support; S is subsystem, these are defined:
0 - netlink messages
1 - enqueue
2 - drop & requeue
3 - dequeue main
4 - dequeue one prio DRR part
5 - dequeue class accounting
6 - class overlimit status computation
7 - hint tree
8 - event queue
10 - rate estimator
11 - classifier
12 - fast dequeue cache
 
L is level; 0 = none, 1 = basic info, 2 = detailed, 3 = full
q->debug is a u32 containing 16 2-bit fields, one per subsystem, starting
from the LSB
*/
#ifdef HTB_DEBUG
#define HTB_DBG_COND(S,L) (((q->debug>>(2*S))&3) >= L)
#define HTB_DBG(S,L,FMT,ARG...) if (HTB_DBG_COND(S,L)) \
printk(KERN_DEBUG FMT,##ARG)
#define HTB_CHCL(cl) BUG_TRAP((cl)->magic == HTB_CMAGIC)
#define HTB_PASSQ q,
#define HTB_ARGQ struct htb_sched *q,
#define static
#undef __inline__
#define __inline__
#undef inline
#define inline
#define HTB_CMAGIC 0xFEFAFEF1
#define htb_safe_rb_erase(N,R) do { BUG_TRAP((N)->rb_color != -1); \
if ((N)->rb_color == -1) break; \
rb_erase(N,R); \
(N)->rb_color = -1; } while (0)
#else
#define HTB_DBG_COND(S,L) (0)
#define HTB_DBG(S,L,FMT,ARG...)
#define HTB_PASSQ
#define HTB_ARGQ
#define HTB_CHCL(cl)
#define htb_safe_rb_erase(N,R) rb_erase(N,R)
#endif
 
 
/* used internally to keep the status of a single class */
enum htb_cmode {
HTB_CANT_SEND, /* class can't send and can't borrow */
HTB_MAY_BORROW, /* class can't send but may borrow */
HTB_CAN_SEND /* class can send */
};
 
/* interior & leaf nodes; props specific to leaves are marked L: */
struct htb_class
{
#ifdef HTB_DEBUG
unsigned magic;
#endif
/* general class parameters */
u32 classid;
struct tc_stats stats; /* generic stats */
struct tc_htb_xstats xstats;/* our special stats */
int refcnt; /* usage count of this class */
 
#ifdef HTB_RATECM
/* rate measurement counters */
unsigned long rate_bytes,sum_bytes;
unsigned long rate_packets,sum_packets;
#endif
 
/* topology */
int level; /* our level (see above) */
struct htb_class *parent; /* parent class */
struct list_head hlist; /* classid hash list item */
struct list_head sibling; /* sibling list item */
struct list_head children; /* children list */
 
union {
struct htb_class_leaf {
struct Qdisc *q;
int prio;
int aprio;
int quantum;
int deficit[TC_HTB_MAXDEPTH];
struct list_head drop_list;
} leaf;
struct htb_class_inner {
rb_root_t feed[TC_HTB_NUMPRIO]; /* feed trees */
rb_node_t *ptr[TC_HTB_NUMPRIO]; /* current class ptr */
} inner;
} un;
rb_node_t node[TC_HTB_NUMPRIO]; /* node for self or feed tree */
rb_node_t pq_node; /* node for event queue */
unsigned long pq_key; /* the same type as jiffies global */
int prio_activity; /* for which prios are we active */
enum htb_cmode cmode; /* current mode of the class */
 
/* class attached filters */
struct tcf_proto *filter_list;
int filter_cnt;
 
int warned; /* only one warning about non work conserving .. */
 
/* token bucket parameters */
struct qdisc_rate_table *rate; /* rate table of the class itself */
struct qdisc_rate_table *ceil; /* ceiling rate (limits borrows too) */
long buffer,cbuffer; /* token bucket depth/rate */
long mbuffer; /* max wait time */
long tokens,ctokens; /* current number of tokens */
psched_time_t t_c; /* checkpoint time */
};
 
/* TODO: maybe compute rate when size is too large .. or drop ? */
static __inline__ long L2T(struct htb_class *cl,struct qdisc_rate_table *rate,
int size)
{
int slot = size >> rate->rate.cell_log;
if (slot > 255) {
cl->xstats.giants++;
slot = 255;
}
return rate->data[slot];
}
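/* Worked example added for illustration (not in the original), assuming
   rate->rate.cell_log == 3: a 1500 byte packet falls into slot
   1500 >> 3 == 187 and rate->data[187] gives its transmission time in
   psched ticks; any size whose slot exceeds 255 is clamped and counted
   in xstats.giants. */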
 
struct htb_sched
{
struct list_head root; /* root classes list */
struct list_head hash[HTB_HSIZE]; /* hashed by classid */
struct list_head drops[TC_HTB_NUMPRIO]; /* active leaves (for drops) */
/* self list - roots of self generating tree */
rb_root_t row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
int row_mask[TC_HTB_MAXDEPTH];
rb_node_t *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
 
/* self wait list - roots of wait PQs per row */
rb_root_t wait_pq[TC_HTB_MAXDEPTH];
 
/* time of nearest event per level (row) */
unsigned long near_ev_cache[TC_HTB_MAXDEPTH];
 
/* cached value of jiffies in dequeue */
unsigned long jiffies;
 
/* whether we hit non-work conserving class during this dequeue; we use */
int nwc_hit; /* this to disable mindelay complaint in dequeue */
 
int defcls; /* class where unclassified flows go to */
u32 debug; /* subsystem debug levels */
 
/* filters for qdisc itself */
struct tcf_proto *filter_list;
int filter_cnt;
 
int rate2quantum; /* quant = rate / rate2quantum */
psched_time_t now; /* cached dequeue time */
struct timer_list timer; /* send delay timer */
#ifdef HTB_RATECM
struct timer_list rttim; /* rate computer timer */
int recmp_bucket; /* which hash bucket to recompute next */
#endif
/* non shaped skbs; let them go directly thru */
struct sk_buff_head direct_queue;
int direct_qlen; /* max qlen of above */
 
long direct_pkts;
};
 
/* compute hash of size HTB_HSIZE for given handle */
static __inline__ int htb_hash(u32 h)
{
#if HTB_HSIZE != 16
#error "Declare new hash for your HTB_HSIZE"
#endif
h ^= h>>8; /* stolen from cbq_hash */
h ^= h>>4;
return h & 0xf;
}
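/* Worked example added for illustration (not in the original): for class
   1:12 the handle is 0x10012; 0x10012 ^ (0x10012 >> 8) = 0x10112,
   0x10112 ^ (0x10112 >> 4) = 0x11103 and 0x11103 & 0xf = 3, so the
   class lands in hash bucket 3. */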
 
/* find class in global hash table using given handle */
static __inline__ struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
{
struct htb_sched *q = (struct htb_sched *)sch->data;
struct list_head *p;
if (TC_H_MAJ(handle) != sch->handle)
return NULL;
list_for_each (p,q->hash+htb_hash(handle)) {
struct htb_class *cl = list_entry(p,struct htb_class,hlist);
if (cl->classid == handle)
return cl;
}
return NULL;
}
 
/**
* htb_classify - classify a packet into class
*
* It returns NULL if the packet should be dropped or -1 if the packet
* should be passed directly thru. In all other cases a leaf class is returned.
* We allow direct class selection by classid in priority. Then we examine
* filters in the qdisc and in inner nodes (if a higher filter points to the
* inner node). If we end up with classid MAJOR:0 we enqueue the skb into the
* special internal fifo (direct). These packets then go directly thru. If we
* still have no valid leaf we try to use the MAJOR:default leaf. If that is
* still unsuccessful we finish and return the direct queue.
*/
#define HTB_DIRECT (struct htb_class*)-1
static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch)
{
struct htb_sched *q = (struct htb_sched *)sch->data;
struct htb_class *cl;
struct tcf_result res;
struct tcf_proto *tcf;
int result;
 
/* allow to select class by setting skb->priority to valid classid;
note that nfmark can be used too by attaching filter fw with no
rules in it */
if (skb->priority == sch->handle)
return HTB_DIRECT; /* X:0 (direct flow) selected */
if ((cl = htb_find(skb->priority,sch)) != NULL && cl->level == 0)
return cl;
 
tcf = q->filter_list;
while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
#ifdef CONFIG_NET_CLS_POLICE
if (result == TC_POLICE_SHOT)
return NULL;
#endif
if ((cl = (void*)res.class) == NULL) {
if (res.classid == sch->handle)
return HTB_DIRECT; /* X:0 (direct flow) */
if ((cl = htb_find(res.classid,sch)) == NULL)
break; /* filter selected invalid classid */
}
if (!cl->level)
return cl; /* we hit leaf; return it */
 
/* we have got inner class; apply inner filter chain */
tcf = cl->filter_list;
}
/* classification failed; try to use default class */
cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle),q->defcls),sch);
if (!cl || cl->level)
return HTB_DIRECT; /* bad default .. this is safe bet */
return cl;
}
 
#ifdef HTB_DEBUG
static void htb_next_rb_node(rb_node_t **n);
#define HTB_DUMTREE(root,memb) if(root) { \
rb_node_t *n = (root)->rb_node; \
while (n->rb_left) n = n->rb_left; \
while (n) { \
struct htb_class *cl = rb_entry(n, struct htb_class, memb); \
printk(" %x",cl->classid); htb_next_rb_node (&n); \
} }
 
static void htb_debug_dump (struct htb_sched *q)
{
int i,p;
printk(KERN_DEBUG "htb*g j=%lu lj=%lu\n",jiffies,q->jiffies);
/* rows */
for (i=TC_HTB_MAXDEPTH-1;i>=0;i--) {
printk(KERN_DEBUG "htb*r%d m=%x",i,q->row_mask[i]);
for (p=0;p<TC_HTB_NUMPRIO;p++) {
if (!q->row[i][p].rb_node) continue;
printk(" p%d:",p);
HTB_DUMTREE(q->row[i]+p,node[p]);
}
printk("\n");
}
/* classes */
for (i = 0; i < HTB_HSIZE; i++) {
struct list_head *l;
list_for_each (l,q->hash+i) {
struct htb_class *cl = list_entry(l,struct htb_class,hlist);
long diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer, 0);
printk(KERN_DEBUG "htb*c%x m=%d t=%ld c=%ld pq=%lu df=%ld ql=%d "
"pa=%x f:",
cl->classid,cl->cmode,cl->tokens,cl->ctokens,
cl->pq_node.rb_color==-1?0:cl->pq_key,diff,
cl->level?0:cl->un.leaf.q->q.qlen,cl->prio_activity);
if (cl->level)
for (p=0;p<TC_HTB_NUMPRIO;p++) {
if (!cl->un.inner.feed[p].rb_node) continue;
printk(" p%d a=%x:",p,cl->un.inner.ptr[p]?rb_entry(cl->un.inner.ptr[p], struct htb_class,node[p])->classid:0);
HTB_DUMTREE(cl->un.inner.feed+p,node[p]);
}
printk("\n");
}
}
}
#endif
/**
* htb_add_to_id_tree - adds class to the round robin list
*
* The routine adds the class to the list (actually a tree) sorted by classid.
* Make sure that the class is not already on such a list for the given prio.
*/
static void htb_add_to_id_tree (HTB_ARGQ rb_root_t *root,
struct htb_class *cl,int prio)
{
rb_node_t **p = &root->rb_node, *parent = NULL;
HTB_DBG(7,3,"htb_add_id_tree cl=%X prio=%d\n",cl->classid,prio);
#ifdef HTB_DEBUG
if (cl->node[prio].rb_color != -1) { BUG_TRAP(0); return; }
HTB_CHCL(cl);
if (*p) {
struct htb_class *x = rb_entry(*p,struct htb_class,node[prio]);
HTB_CHCL(x);
}
#endif
while (*p) {
struct htb_class *c; parent = *p;
c = rb_entry(parent, struct htb_class, node[prio]);
HTB_CHCL(c);
if (cl->classid > c->classid)
p = &parent->rb_right;
else
p = &parent->rb_left;
}
rb_link_node(&cl->node[prio], parent, p);
rb_insert_color(&cl->node[prio], root);
}
 
/**
* htb_add_to_wait_tree - adds class to the event queue with delay
*
* The class is added to the priority event queue to indicate that the class
* will change its mode in cl->pq_key microseconds. Make sure that the class
* is not already in the queue.
*/
static void htb_add_to_wait_tree (struct htb_sched *q,
struct htb_class *cl,long delay,int debug_hint)
{
rb_node_t **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
HTB_DBG(7,3,"htb_add_wt cl=%X key=%lu\n",cl->classid,cl->pq_key);
#ifdef HTB_DEBUG
if (cl->pq_node.rb_color != -1) { BUG_TRAP(0); return; }
HTB_CHCL(cl);
if ((delay <= 0 || delay > cl->mbuffer) && net_ratelimit())
printk(KERN_ERR "HTB: suspicious delay in wait_tree d=%ld cl=%X h=%d\n",delay,cl->classid,debug_hint);
#endif
cl->pq_key = q->jiffies + PSCHED_US2JIFFIE(delay);
if (cl->pq_key == q->jiffies)
cl->pq_key++;
 
/* update the nearest event cache */
if (time_after(q->near_ev_cache[cl->level], cl->pq_key))
q->near_ev_cache[cl->level] = cl->pq_key;
while (*p) {
struct htb_class *c; parent = *p;
c = rb_entry(parent, struct htb_class, pq_node);
if (time_after_eq(cl->pq_key, c->pq_key))
p = &parent->rb_right;
else
p = &parent->rb_left;
}
rb_link_node(&cl->pq_node, parent, p);
rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
}
 
/**
* htb_next_rb_node - finds next node in binary tree
*
* When we are past last key we return NULL.
* Average complexity is 2 steps per call.
*/
static void htb_next_rb_node(rb_node_t **n)
{
rb_node_t *p;
if ((*n)->rb_right) {
/* child at right; use it or its leftmost descendant */
*n = (*n)->rb_right;
while ((*n)->rb_left)
*n = (*n)->rb_left;
return;
}
while ((p = (*n)->rb_parent) != NULL) {
/* if we've arrived from left child then we have next node */
if (p->rb_left == *n) break;
*n = p;
}
*n = p;
}
 
/**
* htb_add_class_to_row - add class to its row
*
* The class is added to row at priorities marked in mask.
* It does nothing if mask == 0.
*/
static inline void htb_add_class_to_row(struct htb_sched *q,
struct htb_class *cl,int mask)
{
HTB_DBG(7,2,"htb_addrow cl=%X mask=%X rmask=%X\n",
cl->classid,mask,q->row_mask[cl->level]);
HTB_CHCL(cl);
q->row_mask[cl->level] |= mask;
while (mask) {
int prio = ffz(~mask);
mask &= ~(1 << prio);
htb_add_to_id_tree(HTB_PASSQ q->row[cl->level]+prio,cl,prio);
}
}
 
/**
* htb_remove_class_from_row - removes class from its row
*
* The class is removed from row at priorities marked in mask.
* It does nothing if mask == 0.
*/
static __inline__ void htb_remove_class_from_row(struct htb_sched *q,
struct htb_class *cl,int mask)
{
int m = 0;
HTB_CHCL(cl);
while (mask) {
int prio = ffz(~mask);
mask &= ~(1 << prio);
if (q->ptr[cl->level][prio] == cl->node+prio)
htb_next_rb_node(q->ptr[cl->level]+prio);
htb_safe_rb_erase(cl->node + prio,q->row[cl->level]+prio);
if (!q->row[cl->level][prio].rb_node)
m |= 1 << prio;
}
HTB_DBG(7,2,"htb_delrow cl=%X mask=%X rmask=%X maskdel=%X\n",
cl->classid,mask,q->row_mask[cl->level],m);
q->row_mask[cl->level] &= ~m;
}
 
/**
* htb_activate_prios - creates the active class's feed chain
*
* The class is connected to its ancestors and/or the appropriate rows
* for the priorities it participates in. cl->cmode must be the new
* (activated) mode. It does nothing if cl->prio_activity == 0.
*/
static void htb_activate_prios(struct htb_sched *q,struct htb_class *cl)
{
struct htb_class *p = cl->parent;
long m,mask = cl->prio_activity;
HTB_DBG(7,2,"htb_act_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode);
HTB_CHCL(cl);
 
while (cl->cmode == HTB_MAY_BORROW && p && mask) {
HTB_CHCL(p);
m = mask; while (m) {
int prio = ffz(~m);
m &= ~(1 << prio);
if (p->un.inner.feed[prio].rb_node)
/* parent already has its feed in use, so
clear the bit in mask; the parent is already ok */
mask &= ~(1 << prio);
htb_add_to_id_tree(HTB_PASSQ p->un.inner.feed+prio,cl,prio);
}
HTB_DBG(7,3,"htb_act_pr_aft p=%X pact=%X mask=%lX pmode=%d\n",
p->classid,p->prio_activity,mask,p->cmode);
p->prio_activity |= mask;
cl = p; p = cl->parent;
HTB_CHCL(cl);
}
if (cl->cmode == HTB_CAN_SEND && mask)
htb_add_class_to_row(q,cl,mask);
}
 
/**
* htb_deactivate_prios - remove class from feed chain
*
* cl->cmode must represent old mode (before deactivation). It does
* nothing if cl->prio_activity == 0. Class is removed from all feed
* chains and rows.
*/
static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
{
struct htb_class *p = cl->parent;
long m,mask = cl->prio_activity;
HTB_DBG(7,2,"htb_deact_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode);
HTB_CHCL(cl);
 
while (cl->cmode == HTB_MAY_BORROW && p && mask) {
m = mask; mask = 0;
while (m) {
int prio = ffz(~m);
m &= ~(1 << prio);
if (p->un.inner.ptr[prio] == cl->node+prio)
htb_next_rb_node(p->un.inner.ptr + prio);
htb_safe_rb_erase(cl->node + prio,p->un.inner.feed + prio);
if (!p->un.inner.feed[prio].rb_node)
mask |= 1 << prio;
}
HTB_DBG(7,3,"htb_deact_pr_aft p=%X pact=%X mask=%lX pmode=%d\n",
p->classid,p->prio_activity,mask,p->cmode);
p->prio_activity &= ~mask;
cl = p; p = cl->parent;
HTB_CHCL(cl);
}
if (cl->cmode == HTB_CAN_SEND && mask)
htb_remove_class_from_row(q,cl,mask);
}
 
/**
* htb_class_mode - computes and returns current class mode
*
* It computes cl's mode at time cl->t_c+diff and returns it. If the mode
* is not HTB_CAN_SEND then *diff is updated to the time difference from
* now to the time when cl will change its state.
* Also it is worth noting that the class mode doesn't change simply
* at cl->{c,}tokens == 0 but rather with a hysteresis over the
* 0 .. -cl->{c,}buffer range. This is meant to limit the number of
* mode transitions per time unit. The speed gain is about 1/6.
*/
static __inline__ enum htb_cmode
htb_class_mode(struct htb_class *cl,long *diff)
{
long toks;
 
if ((toks = (cl->ctokens + *diff)) < (
#if HTB_HYSTERESIS
cl->cmode != HTB_CANT_SEND ? -cl->cbuffer :
#endif
0)) {
*diff = -toks;
return HTB_CANT_SEND;
}
if ((toks = (cl->tokens + *diff)) >= (
#if HTB_HYSTERESIS
cl->cmode == HTB_CAN_SEND ? -cl->buffer :
#endif
0))
return HTB_CAN_SEND;
 
*diff = -toks;
return HTB_MAY_BORROW;
}
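/* Worked example added for illustration (not in the original): take a class
   with buffer == 10000 that is currently HTB_CAN_SEND. With HTB_HYSTERESIS
   it stays in HTB_CAN_SEND as long as tokens + *diff >= -10000, while a
   class coming from HTB_MAY_BORROW is only promoted once tokens + *diff
   >= 0. This asymmetry is the hysteresis that keeps a class from flapping
   between modes on every packet. */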
 
/**
* htb_change_class_mode - changes a class's mode
*
* This should be the only way to change a class's mode under normal
* circumstances. The routine will update the feed list linkage, change
* the mode and add the class to the wait event queue if appropriate. The
* new mode should differ from the old one and cl->pq_key has to be valid
* if changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
*/
static void
htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
{
enum htb_cmode new_mode = htb_class_mode(cl,diff);
HTB_CHCL(cl);
HTB_DBG(7,1,"htb_chging_clmode %d->%d cl=%X\n",cl->cmode,new_mode,cl->classid);
 
if (new_mode == cl->cmode)
return;
if (cl->prio_activity) { /* not necessary: speed optimization */
if (cl->cmode != HTB_CANT_SEND)
htb_deactivate_prios(q,cl);
cl->cmode = new_mode;
if (new_mode != HTB_CANT_SEND)
htb_activate_prios(q,cl);
} else
cl->cmode = new_mode;
}
 
/**
* htb_activate - inserts leaf cl into appropriate active feeds
*
* The routine learns the (new) priority of the leaf and activates the feed
* chain for that prio. It can safely be called on an already active leaf.
* It also adds the leaf to the drop list.
*/
static __inline__ void htb_activate(struct htb_sched *q,struct htb_class *cl)
{
BUG_TRAP(!cl->level && cl->un.leaf.q && cl->un.leaf.q->q.qlen);
HTB_CHCL(cl);
if (!cl->prio_activity) {
cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio);
htb_activate_prios(q,cl);
list_add_tail(&cl->un.leaf.drop_list,q->drops+cl->un.leaf.aprio);
}
}
 
/**
* htb_deactivate - remove leaf cl from active feeds
*
* Make sure that the leaf is active. In other words, it can't be called
* with a non-active leaf. It also removes the class from the drop list.
*/
static __inline__ void
htb_deactivate(struct htb_sched *q,struct htb_class *cl)
{
BUG_TRAP(cl->prio_activity);
HTB_CHCL(cl);
htb_deactivate_prios(q,cl);
cl->prio_activity = 0;
list_del_init(&cl->un.leaf.drop_list);
}
 
static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct htb_sched *q = (struct htb_sched *)sch->data;
struct htb_class *cl = htb_classify(skb,sch);
 
if (cl == HTB_DIRECT || !cl) {
/* enqueue to helper queue */
if (q->direct_queue.qlen < q->direct_qlen && cl) {
__skb_queue_tail(&q->direct_queue, skb);
q->direct_pkts++;
} else {
kfree_skb (skb);
sch->stats.drops++;
return NET_XMIT_DROP;
}
} else if (cl->un.leaf.q->enqueue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) {
sch->stats.drops++;
cl->stats.drops++;
return NET_XMIT_DROP;
} else {
cl->stats.packets++; cl->stats.bytes += skb->len;
htb_activate (q,cl);
}
 
sch->q.qlen++;
sch->stats.packets++; sch->stats.bytes += skb->len;
HTB_DBG(1,1,"htb_enq_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb);
return NET_XMIT_SUCCESS;
}
 
/* TODO: requeuing packet charges it to policers again !! */
static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
struct htb_sched *q = (struct htb_sched *)sch->data;
struct htb_class *cl = htb_classify(skb,sch);
struct sk_buff *tskb;
 
if (cl == HTB_DIRECT || !cl) {
/* enqueue to helper queue */
if (q->direct_queue.qlen < q->direct_qlen && cl) {
__skb_queue_head(&q->direct_queue, skb);
} else {
__skb_queue_head(&q->direct_queue, skb);
tskb = __skb_dequeue_tail(&q->direct_queue);
kfree_skb (tskb);
sch->stats.drops++;
return NET_XMIT_CN;
}
} else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) {
sch->stats.drops++;
cl->stats.drops++;
return NET_XMIT_DROP;
} else
htb_activate (q,cl);
 
sch->q.qlen++;
HTB_DBG(1,1,"htb_req_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb);
return NET_XMIT_SUCCESS;
}
 
static void htb_timer(unsigned long arg)
{
struct Qdisc *sch = (struct Qdisc*)arg;
sch->flags &= ~TCQ_F_THROTTLED;
wmb();
netif_schedule(sch->dev);
}
 
#ifdef HTB_RATECM
#define RT_GEN(D,R) R+=D-(R/HTB_EWMAC);D=0
static void htb_rate_timer(unsigned long arg)
{
struct Qdisc *sch = (struct Qdisc*)arg;
struct htb_sched *q = (struct htb_sched *)sch->data;
struct list_head *p;
 
/* lock queue so that we can muck with it */
HTB_QLOCK(sch);
HTB_DBG(10,1,"htb_rttmr j=%ld\n",jiffies);
 
q->rttim.expires = jiffies + HZ;
add_timer(&q->rttim);
 
/* scan and recompute one bucket at a time */
if (++q->recmp_bucket >= HTB_HSIZE)
q->recmp_bucket = 0;
list_for_each (p,q->hash+q->recmp_bucket) {
struct htb_class *cl = list_entry(p,struct htb_class,hlist);
HTB_DBG(10,2,"htb_rttmr_cl cl=%X sbyte=%lu spkt=%lu\n",
cl->classid,cl->sum_bytes,cl->sum_packets);
RT_GEN (cl->sum_bytes,cl->rate_bytes);
RT_GEN (cl->sum_packets,cl->rate_packets);
}
HTB_QUNLOCK(sch);
}
#endif
 
/**
* htb_charge_class - charges the amount "bytes" to the leaf and its ancestors
*
* The routine assumes that a packet "bytes" long was dequeued from leaf cl,
* borrowing from "level". It accounts the bytes to the ceil leaky bucket for
* the leaf and all ancestors, and to the rate bucket for ancestors at levels
* "level" and higher. It also handles a possible change of mode resulting
* from the update. Note that the mode can also increase here (MAY_BORROW to
* CAN_SEND) because we can use a more precise clock than the event queue here.
* In such a case we remove the class from the event queue first.
*/
static void htb_charge_class(struct htb_sched *q,struct htb_class *cl,
int level,int bytes)
{
long toks,diff;
enum htb_cmode old_mode;
HTB_DBG(5,1,"htb_chrg_cl cl=%X lev=%d len=%d\n",cl->classid,level,bytes);
 
#define HTB_ACCNT(T,B,R) toks = diff + cl->T; \
if (toks > cl->B) toks = cl->B; \
toks -= L2T(cl, cl->R, bytes); \
if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \
cl->T = toks
 
while (cl) {
HTB_CHCL(cl);
diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer, 0);
#ifdef HTB_DEBUG
if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) {
if (net_ratelimit())
printk(KERN_ERR "HTB: bad diff in charge, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n",
cl->classid, diff,
(unsigned long long) q->now,
(unsigned long long) cl->t_c,
q->jiffies);
diff = 1000;
}
#endif
if (cl->level >= level) {
if (cl->level == level) cl->xstats.lends++;
HTB_ACCNT (tokens,buffer,rate);
} else {
cl->xstats.borrows++;
cl->tokens += diff; /* we moved t_c; update tokens */
}
HTB_ACCNT (ctokens,cbuffer,ceil);
cl->t_c = q->now;
HTB_DBG(5,2,"htb_chrg_clp cl=%X diff=%ld tok=%ld ctok=%ld\n",cl->classid,diff,cl->tokens,cl->ctokens);
 
old_mode = cl->cmode; diff = 0;
htb_change_class_mode(q,cl,&diff);
if (old_mode != cl->cmode) {
if (old_mode != HTB_CAN_SEND)
htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level);
if (cl->cmode != HTB_CAN_SEND)
htb_add_to_wait_tree (q,cl,diff,1);
}
#ifdef HTB_RATECM
/* update rate counters */
cl->sum_bytes += bytes; cl->sum_packets++;
#endif
 
/* update byte stats except for leaves which are already updated */
if (cl->level) {
cl->stats.bytes += bytes;
cl->stats.packets++;
}
cl = cl->parent;
}
}
 
/**
* htb_do_events - make mode changes to classes at the level
*
* Scans the event queue for pending events and applies them. Returns the
* number of jiffies to the next pending event (0 if there is no event in
* the pq). Note: only events with cl->pq_key <= jiffies are applied.
*/
static long htb_do_events(struct htb_sched *q,int level)
{
int i;
HTB_DBG(8,1,"htb_do_events l=%d root=%p rmask=%X\n",
level,q->wait_pq[level].rb_node,q->row_mask[level]);
for (i = 0; i < 500; i++) {
struct htb_class *cl;
long diff;
rb_node_t *p = q->wait_pq[level].rb_node;
if (!p) return 0;
while (p->rb_left) p = p->rb_left;
 
cl = rb_entry(p, struct htb_class, pq_node);
if (time_after(cl->pq_key, q->jiffies)) {
HTB_DBG(8,3,"htb_do_ev_ret delay=%ld\n",cl->pq_key - q->jiffies);
return cl->pq_key - q->jiffies;
}
htb_safe_rb_erase(p,q->wait_pq+level);
diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer, 0);
#ifdef HTB_DEBUG
if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) {
if (net_ratelimit())
printk(KERN_ERR "HTB: bad diff in events, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n",
cl->classid, diff,
(unsigned long long) q->now,
(unsigned long long) cl->t_c,
q->jiffies);
diff = 1000;
}
#endif
htb_change_class_mode(q,cl,&diff);
if (cl->cmode != HTB_CAN_SEND)
htb_add_to_wait_tree (q,cl,diff,2);
}
if (net_ratelimit())
printk(KERN_WARNING "htb: too many events !\n");
return HZ/10;
}
 
/**
* htb_lookup_leaf - returns next leaf class in DRR order
*
* Find the leaf the current feed pointer points to.
*/
static struct htb_class *
htb_lookup_leaf(rb_root_t *tree,int prio,rb_node_t **pptr)
{
int i;
struct {
rb_node_t *root;
rb_node_t **pptr;
} stk[TC_HTB_MAXDEPTH],*sp = stk;
BUG_TRAP(tree->rb_node);
sp->root = tree->rb_node;
sp->pptr = pptr;
 
for (i = 0; i < 65535; i++) {
if (!*sp->pptr) { /* we are at right end; rewind & go up */
*sp->pptr = sp->root;
while ((*sp->pptr)->rb_left)
*sp->pptr = (*sp->pptr)->rb_left;
if (sp > stk) {
sp--;
BUG_TRAP(*sp->pptr); if(!*sp->pptr) return NULL;
htb_next_rb_node (sp->pptr);
}
} else {
struct htb_class *cl;
cl = rb_entry(*sp->pptr,struct htb_class,node[prio]);
HTB_CHCL(cl);
if (!cl->level)
return cl;
(++sp)->root = cl->un.inner.feed[prio].rb_node;
sp->pptr = cl->un.inner.ptr+prio;
}
}
BUG_TRAP(0);
return NULL;
}
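 
/* Note on the walk above: stk[] holds one entry per inner-class level.
 * Starting from the row's rb-tree, the loop follows the remembered feed
 * pointer at each level (rewinding to the leftmost node when it runs off
 * the right end) and descends through cl->un.inner.feed until it reaches
 * a class with level 0, i.e. a leaf. */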
 
/* dequeues a packet at the given priority and level; call only if
you are sure that there is an active class at prio/level */
static struct sk_buff *
htb_dequeue_tree(struct htb_sched *q,int prio,int level)
{
struct sk_buff *skb = NULL;
struct htb_class *cl,*start;
/* look initial class up in the row */
start = cl = htb_lookup_leaf (q->row[level]+prio,prio,q->ptr[level]+prio);
do {
next:
BUG_TRAP(cl);
if (!cl) return NULL;
HTB_DBG(4,1,"htb_deq_tr prio=%d lev=%d cl=%X defic=%d\n",
prio,level,cl->classid,cl->un.leaf.deficit[level]);
 
/* class can be empty - it is unlikely but can happen if the leaf
qdisc drops packets in its enqueue routine or if someone used
the graft operation on the leaf since the last dequeue;
simply deactivate and skip such a class */
if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
struct htb_class *next;
htb_deactivate(q,cl);
 
/* row/level might become empty */
if ((q->row_mask[level] & (1 << prio)) == 0)
return NULL;
next = htb_lookup_leaf (q->row[level]+prio,
prio,q->ptr[level]+prio);
if (cl == start) /* fix start if we just deleted it */
start = next;
cl = next;
goto next;
}
if (likely((skb = cl->un.leaf.q->dequeue(cl->un.leaf.q)) != NULL))
break;
if (!cl->warned) {
printk(KERN_WARNING "htb: class %X isn't work conserving ?!\n",cl->classid);
cl->warned = 1;
}
q->nwc_hit++;
htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio);
cl = htb_lookup_leaf (q->row[level]+prio,prio,q->ptr[level]+prio);
} while (cl != start);
 
if (likely(skb != NULL)) {
if ((cl->un.leaf.deficit[level] -= skb->len) < 0) {
HTB_DBG(4,2,"htb_next_cl oldptr=%p quant_add=%d\n",
level?cl->parent->un.inner.ptr[prio]:q->ptr[0][prio],cl->un.leaf.quantum);
cl->un.leaf.deficit[level] += cl->un.leaf.quantum;
htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio);
}
/* this used to be after charge_class but this constellation
gives us slightly better performance */
if (!cl->un.leaf.q->q.qlen)
htb_deactivate (q,cl);
htb_charge_class (q,cl,level,skb->len);
}
return skb;
}
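 
/* Illustrative deficit round-robin example (numbers are made up): assume a
 * leaf with quantum 1500 currently has deficit[level] = 1500.  Dequeuing a
 * 1000 byte packet leaves 500, so the same leaf keeps being served; the
 * next 1000 byte packet drives the deficit to -500, the quantum is added
 * back (giving 1000) and htb_next_rb_node() advances the round-robin
 * pointer to the next class at this prio/level. */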
 
static void htb_delay_by(struct Qdisc *sch,long delay)
{
struct htb_sched *q = (struct htb_sched *)sch->data;
if (netif_queue_stopped(sch->dev)) return;
if (delay <= 0) delay = 1;
if (unlikely(delay > 5*HZ)) {
if (net_ratelimit())
printk(KERN_INFO "HTB delay %ld > 5sec\n", delay);
delay = 5*HZ;
}
/* why not use jiffies here? Because expires can be in the past. */
mod_timer(&q->timer, q->jiffies + delay);
sch->flags |= TCQ_F_THROTTLED;
sch->stats.overlimits++;
HTB_DBG(3,1,"htb_deq t_delay=%ld\n",delay);
}
 
static struct sk_buff *htb_dequeue(struct Qdisc *sch)
{
struct sk_buff *skb = NULL;
struct htb_sched *q = (struct htb_sched *)sch->data;
int level;
long min_delay;
#ifdef HTB_DEBUG
int evs_used = 0;
#endif
 
q->jiffies = jiffies;
HTB_DBG(3,1,"htb_deq dircnt=%d qlen=%d\n",skb_queue_len(&q->direct_queue),
sch->q.qlen);
 
/* try to dequeue direct packets as high prio (!) to minimize cpu work */
if ((skb = __skb_dequeue(&q->direct_queue)) != NULL) {
sch->flags &= ~TCQ_F_THROTTLED;
sch->q.qlen--;
return skb;
}
 
if (!sch->q.qlen) goto fin;
PSCHED_GET_TIME(q->now);
 
min_delay = LONG_MAX;
q->nwc_hit = 0;
for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
/* common case optimization - skip event handler quickly */
int m;
long delay;
if (time_after_eq(q->jiffies, q->near_ev_cache[level])) {
delay = htb_do_events(q,level);
q->near_ev_cache[level] = q->jiffies + (delay ? delay : HZ);
#ifdef HTB_DEBUG
evs_used++;
#endif
} else
delay = q->near_ev_cache[level] - q->jiffies;
if (delay && min_delay > delay)
min_delay = delay;
m = ~q->row_mask[level];
while (m != (int)(-1)) {
int prio = ffz (m);
m |= 1 << prio;
skb = htb_dequeue_tree(q,prio,level);
if (likely(skb != NULL)) {
sch->q.qlen--;
sch->flags &= ~TCQ_F_THROTTLED;
goto fin;
}
}
}
#ifdef HTB_DEBUG
if (!q->nwc_hit && min_delay >= 10*HZ && net_ratelimit()) {
if (min_delay == LONG_MAX) {
printk(KERN_ERR "HTB: dequeue bug (%d,%lu,%lu), report it please !\n",
evs_used,q->jiffies,jiffies);
htb_debug_dump(q);
} else
printk(KERN_WARNING "HTB: mindelay=%ld, some class has "
"too small rate\n",min_delay);
}
#endif
htb_delay_by (sch,min_delay > 5*HZ ? 5*HZ : min_delay);
fin:
HTB_DBG(3,1,"htb_deq_end %s j=%lu skb=%p\n",sch->dev->name,q->jiffies,skb);
return skb;
}
 
/* try to drop from each class (by prio) until one succeeds */
static unsigned int htb_drop(struct Qdisc* sch)
{
struct htb_sched *q = (struct htb_sched *)sch->data;
int prio;
 
for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
struct list_head *p;
list_for_each (p,q->drops+prio) {
struct htb_class *cl = list_entry(p, struct htb_class,
un.leaf.drop_list);
unsigned int len;
if (cl->un.leaf.q->ops->drop &&
(len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
sch->q.qlen--;
if (!cl->un.leaf.q->q.qlen)
htb_deactivate (q,cl);
return len;
}
}
}
return 0;
}
 
/* reset all classes */
/* always called under BH & queue lock */
static void htb_reset(struct Qdisc* sch)
{
struct htb_sched *q = (struct htb_sched *)sch->data;
int i;
HTB_DBG(0,1,"htb_reset sch=%p, handle=%X\n",sch,sch->handle);
 
for (i = 0; i < HTB_HSIZE; i++) {
struct list_head *p;
list_for_each (p,q->hash+i) {
struct htb_class *cl = list_entry(p,struct htb_class,hlist);
if (cl->level)
memset(&cl->un.inner,0,sizeof(cl->un.inner));
else {
if (cl->un.leaf.q)
qdisc_reset(cl->un.leaf.q);
INIT_LIST_HEAD(&cl->un.leaf.drop_list);
}
cl->prio_activity = 0;
cl->cmode = HTB_CAN_SEND;
#ifdef HTB_DEBUG
cl->pq_node.rb_color = -1;
memset(cl->node,255,sizeof(cl->node));
#endif
 
}
}
sch->flags &= ~TCQ_F_THROTTLED;
del_timer(&q->timer);
__skb_queue_purge(&q->direct_queue);
sch->q.qlen = 0;
memset(q->row,0,sizeof(q->row));
memset(q->row_mask,0,sizeof(q->row_mask));
memset(q->wait_pq,0,sizeof(q->wait_pq));
memset(q->ptr,0,sizeof(q->ptr));
for (i = 0; i < TC_HTB_NUMPRIO; i++)
INIT_LIST_HEAD(q->drops+i);
}
 
static int htb_init(struct Qdisc *sch, struct rtattr *opt)
{
struct htb_sched *q = (struct htb_sched*)sch->data;
struct rtattr *tb[TCA_HTB_INIT];
struct tc_htb_glob *gopt;
int i;
#ifdef HTB_DEBUG
printk(KERN_INFO "HTB init, kernel part version %d.%d\n",
HTB_VER >> 16,HTB_VER & 0xffff);
#endif
if (!opt || rtattr_parse(tb, TCA_HTB_INIT, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
tb[TCA_HTB_INIT-1] == NULL ||
RTA_PAYLOAD(tb[TCA_HTB_INIT-1]) < sizeof(*gopt)) {
printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");
return -EINVAL;
}
gopt = RTA_DATA(tb[TCA_HTB_INIT-1]);
if (gopt->version != HTB_VER >> 16) {
printk(KERN_ERR "HTB: need tc/htb version %d (minor is %d), you have %d\n",
HTB_VER >> 16,HTB_VER & 0xffff,gopt->version);
return -EINVAL;
}
memset(q,0,sizeof(*q));
q->debug = gopt->debug;
HTB_DBG(0,1,"htb_init sch=%p handle=%X r2q=%d\n",sch,sch->handle,gopt->rate2quantum);
 
INIT_LIST_HEAD(&q->root);
for (i = 0; i < HTB_HSIZE; i++)
INIT_LIST_HEAD(q->hash+i);
for (i = 0; i < TC_HTB_NUMPRIO; i++)
INIT_LIST_HEAD(q->drops+i);
 
init_timer(&q->timer);
skb_queue_head_init(&q->direct_queue);
 
q->direct_qlen = sch->dev->tx_queue_len;
if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */
q->direct_qlen = 2;
q->timer.function = htb_timer;
q->timer.data = (unsigned long)sch;
 
#ifdef HTB_RATECM
init_timer(&q->rttim);
q->rttim.function = htb_rate_timer;
q->rttim.data = (unsigned long)sch;
q->rttim.expires = jiffies + HZ;
add_timer(&q->rttim);
#endif
if ((q->rate2quantum = gopt->rate2quantum) < 1)
q->rate2quantum = 1;
q->defcls = gopt->defcls;
 
MOD_INC_USE_COUNT;
return 0;
}
 
static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct htb_sched *q = (struct htb_sched*)sch->data;
unsigned char *b = skb->tail;
struct rtattr *rta;
struct tc_htb_glob gopt;
HTB_DBG(0,1,"htb_dump sch=%p, handle=%X\n",sch,sch->handle);
/* stats */
HTB_QLOCK(sch);
gopt.direct_pkts = q->direct_pkts;
 
#ifdef HTB_DEBUG
if (HTB_DBG_COND(0,2))
htb_debug_dump(q);
#endif
gopt.version = HTB_VER;
gopt.rate2quantum = q->rate2quantum;
gopt.defcls = q->defcls;
gopt.debug = q->debug;
rta = (struct rtattr*)b;
RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
RTA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
rta->rta_len = skb->tail - b;
sch->stats.qlen = sch->q.qlen;
RTA_PUT(skb, TCA_STATS, sizeof(sch->stats), &sch->stats);
HTB_QUNLOCK(sch);
return skb->len;
rtattr_failure:
HTB_QUNLOCK(sch);
skb_trim(skb, skb->tail - skb->data);
return -1;
}
 
static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
struct sk_buff *skb, struct tcmsg *tcm)
{
#ifdef HTB_DEBUG
struct htb_sched *q = (struct htb_sched*)sch->data;
#endif
struct htb_class *cl = (struct htb_class*)arg;
unsigned char *b = skb->tail;
struct rtattr *rta;
struct tc_htb_opt opt;
 
HTB_DBG(0,1,"htb_dump_class handle=%X clid=%X\n",sch->handle,cl->classid);
 
HTB_QLOCK(sch);
tcm->tcm_parent = cl->parent ? cl->parent->classid : TC_H_ROOT;
tcm->tcm_handle = cl->classid;
if (!cl->level && cl->un.leaf.q) {
tcm->tcm_info = cl->un.leaf.q->handle;
cl->stats.qlen = cl->un.leaf.q->q.qlen;
}
 
rta = (struct rtattr*)b;
RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
 
memset (&opt,0,sizeof(opt));
 
opt.rate = cl->rate->rate; opt.buffer = cl->buffer;
opt.ceil = cl->ceil->rate; opt.cbuffer = cl->cbuffer;
opt.quantum = cl->un.leaf.quantum; opt.prio = cl->un.leaf.prio;
opt.level = cl->level;
RTA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);
rta->rta_len = skb->tail - b;
 
#ifdef HTB_RATECM
cl->stats.bps = cl->rate_bytes/(HTB_EWMAC*HTB_HSIZE);
cl->stats.pps = cl->rate_packets/(HTB_EWMAC*HTB_HSIZE);
#endif
 
cl->xstats.tokens = cl->tokens;
cl->xstats.ctokens = cl->ctokens;
RTA_PUT(skb, TCA_STATS, sizeof(cl->stats), &cl->stats);
RTA_PUT(skb, TCA_XSTATS, sizeof(cl->xstats), &cl->xstats);
HTB_QUNLOCK(sch);
return skb->len;
rtattr_failure:
HTB_QUNLOCK(sch);
skb_trim(skb, b - skb->data);
return -1;
}
 
static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct Qdisc **old)
{
struct htb_class *cl = (struct htb_class*)arg;
 
if (cl && !cl->level) {
if (new == NULL && (new = qdisc_create_dflt(sch->dev,
&pfifo_qdisc_ops)) == NULL)
return -ENOBUFS;
sch_tree_lock(sch);
if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) {
if (cl->prio_activity)
htb_deactivate ((struct htb_sched*)sch->data,cl);
 
/* TODO: is it correct ? Why CBQ doesn't do it ? */
sch->q.qlen -= (*old)->q.qlen;
qdisc_reset(*old);
}
sch_tree_unlock(sch);
return 0;
}
return -ENOENT;
}
 
static struct Qdisc * htb_leaf(struct Qdisc *sch, unsigned long arg)
{
struct htb_class *cl = (struct htb_class*)arg;
return (cl && !cl->level) ? cl->un.leaf.q : NULL;
}
 
static unsigned long htb_get(struct Qdisc *sch, u32 classid)
{
#ifdef HTB_DEBUG
struct htb_sched *q = (struct htb_sched *)sch->data;
#endif
struct htb_class *cl = htb_find(classid,sch);
HTB_DBG(0,1,"htb_get clid=%X q=%p cl=%p ref=%d\n",classid,q,cl,cl?cl->refcnt:0);
if (cl)
cl->refcnt++;
return (unsigned long)cl;
}
 
static void htb_destroy_filters(struct tcf_proto **fl)
{
struct tcf_proto *tp;
 
while ((tp = *fl) != NULL) {
*fl = tp->next;
tcf_destroy(tp);
}
}
 
static void htb_destroy_class(struct Qdisc* sch,struct htb_class *cl)
{
struct htb_sched *q = (struct htb_sched *)sch->data;
HTB_DBG(0,1,"htb_destrycls clid=%X ref=%d\n", cl?cl->classid:0,cl?cl->refcnt:0);
if (!cl->level) {
BUG_TRAP(cl->un.leaf.q);
sch->q.qlen -= cl->un.leaf.q->q.qlen;
qdisc_destroy(cl->un.leaf.q);
}
qdisc_put_rtab(cl->rate);
qdisc_put_rtab(cl->ceil);
#ifdef CONFIG_NET_ESTIMATOR
qdisc_kill_estimator(&cl->stats);
#endif
htb_destroy_filters (&cl->filter_list);
while (!list_empty(&cl->children))
htb_destroy_class (sch,list_entry(cl->children.next,
struct htb_class,sibling));
 
/* note: this delete may happen twice (see htb_delete) */
list_del(&cl->hlist);
list_del(&cl->sibling);
if (cl->prio_activity)
htb_deactivate (q,cl);
if (cl->cmode != HTB_CAN_SEND)
htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level);
kfree(cl);
}
 
/* always called under BH & queue lock */
static void htb_destroy(struct Qdisc* sch)
{
struct htb_sched *q = (struct htb_sched *)sch->data;
HTB_DBG(0,1,"htb_destroy q=%p\n",q);
 
del_timer_sync (&q->timer);
#ifdef HTB_RATECM
del_timer_sync (&q->rttim);
#endif
/* This line used to be after the htb_destroy_class call below,
and surprisingly it worked in 2.4. But it must precede it
because filters need their target class alive to be able to call
unbind_filter on it (without an Oops). */
htb_destroy_filters(&q->filter_list);
while (!list_empty(&q->root))
htb_destroy_class (sch,list_entry(q->root.next,
struct htb_class,sibling));
 
__skb_queue_purge(&q->direct_queue);
MOD_DEC_USE_COUNT;
}
 
static int htb_delete(struct Qdisc *sch, unsigned long arg)
{
struct htb_sched *q = (struct htb_sched *)sch->data;
struct htb_class *cl = (struct htb_class*)arg;
HTB_DBG(0,1,"htb_delete q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0);
 
// TODO: why not allow deleting a subtree? References? Does the
// tc subsystem guarantee us that in htb_destroy it holds no class
// refs so that we can remove children safely there?
if (!list_empty(&cl->children) || cl->filter_cnt)
return -EBUSY;
sch_tree_lock(sch);
/* delete from hash and active; remainder in destroy_class */
list_del_init(&cl->hlist);
if (cl->prio_activity)
htb_deactivate (q,cl);
 
if (--cl->refcnt == 0)
htb_destroy_class(sch,cl);
 
sch_tree_unlock(sch);
return 0;
}
 
static void htb_put(struct Qdisc *sch, unsigned long arg)
{
#ifdef HTB_DEBUG
struct htb_sched *q = (struct htb_sched *)sch->data;
#endif
struct htb_class *cl = (struct htb_class*)arg;
HTB_DBG(0,1,"htb_put q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0);
 
if (--cl->refcnt == 0)
htb_destroy_class(sch,cl);
}
 
static int htb_change_class(struct Qdisc *sch, u32 classid,
u32 parentid, struct rtattr **tca, unsigned long *arg)
{
int err = -EINVAL;
struct htb_sched *q = (struct htb_sched *)sch->data;
struct htb_class *cl = (struct htb_class*)*arg,*parent;
struct rtattr *opt = tca[TCA_OPTIONS-1];
struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
struct rtattr *tb[TCA_HTB_RTAB];
struct tc_htb_opt *hopt;
 
/* extract all subattrs from opt attr */
if (!opt || rtattr_parse(tb, TCA_HTB_RTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
tb[TCA_HTB_PARMS-1] == NULL ||
RTA_PAYLOAD(tb[TCA_HTB_PARMS-1]) < sizeof(*hopt))
goto failure;
parent = parentid == TC_H_ROOT ? NULL : htb_find (parentid,sch);
 
hopt = RTA_DATA(tb[TCA_HTB_PARMS-1]);
HTB_DBG(0,1,"htb_chg cl=%p(%X), clid=%X, parid=%X, opt/prio=%d, rate=%u, buff=%d, quant=%d\n", cl,cl?cl->classid:0,classid,parentid,(int)hopt->prio,hopt->rate.rate,hopt->buffer,hopt->quantum);
rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB-1]);
ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB-1]);
if (!rtab || !ctab) goto failure;
 
if (!cl) { /* new class */
struct Qdisc *new_q;
/* check for valid classid */
if (!classid || TC_H_MAJ(classid^sch->handle) || htb_find(classid,sch))
goto failure;
 
/* check maximal depth */
if (parent && parent->parent && parent->parent->level < 2) {
printk(KERN_ERR "htb: tree is too deep\n");
goto failure;
}
err = -ENOBUFS;
if ((cl = kmalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
goto failure;
memset(cl, 0, sizeof(*cl));
cl->refcnt = 1;
INIT_LIST_HEAD(&cl->sibling);
INIT_LIST_HEAD(&cl->hlist);
INIT_LIST_HEAD(&cl->children);
INIT_LIST_HEAD(&cl->un.leaf.drop_list);
#ifdef HTB_DEBUG
cl->magic = HTB_CMAGIC;
#endif
 
/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL),
which can't be used inside sch_tree_lock
-- thanks to Karlis Peisenieks */
new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
sch_tree_lock(sch);
if (parent && !parent->level) {
/* turn parent into inner node */
sch->q.qlen -= parent->un.leaf.q->q.qlen;
qdisc_destroy (parent->un.leaf.q);
if (parent->prio_activity)
htb_deactivate (q,parent);
 
/* remove from evt list because of level change */
if (parent->cmode != HTB_CAN_SEND) {
htb_safe_rb_erase(&parent->pq_node,q->wait_pq /*+0*/);
parent->cmode = HTB_CAN_SEND;
}
parent->level = (parent->parent ? parent->parent->level
: TC_HTB_MAXDEPTH) - 1;
memset (&parent->un.inner,0,sizeof(parent->un.inner));
}
/* leaf (we) needs elementary qdisc */
cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
 
cl->classid = classid; cl->parent = parent;
 
/* set class to be in HTB_CAN_SEND state */
cl->tokens = hopt->buffer;
cl->ctokens = hopt->cbuffer;
cl->mbuffer = 60000000; /* 1min */
PSCHED_GET_TIME(cl->t_c);
cl->cmode = HTB_CAN_SEND;
 
/* attach to the hash list and parent's family */
list_add_tail(&cl->hlist, q->hash+htb_hash(classid));
list_add_tail(&cl->sibling, parent ? &parent->children : &q->root);
#ifdef HTB_DEBUG
{
int i;
for (i = 0; i < TC_HTB_NUMPRIO; i++) cl->node[i].rb_color = -1;
cl->pq_node.rb_color = -1;
}
#endif
} else sch_tree_lock(sch);
 
/* there used to be a nasty bug here: we have to check that the node
is really a leaf before changing cl->un.leaf ! */
if (!cl->level) {
cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum;
if (!hopt->quantum && cl->un.leaf.quantum < 1000) {
printk(KERN_WARNING "HTB: quantum of class %X is small. Consider r2q change.\n", cl->classid);
cl->un.leaf.quantum = 1000;
}
if (!hopt->quantum && cl->un.leaf.quantum > 200000) {
printk(KERN_WARNING "HTB: quantum of class %X is big. Consider r2q change.\n", cl->classid);
cl->un.leaf.quantum = 200000;
}
if (hopt->quantum)
cl->un.leaf.quantum = hopt->quantum;
if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO)
cl->un.leaf.prio = TC_HTB_NUMPRIO - 1;
}
 
cl->buffer = hopt->buffer;
cl->cbuffer = hopt->cbuffer;
if (cl->rate)
qdisc_put_rtab(cl->rate);
cl->rate = rtab;
if (cl->ceil)
qdisc_put_rtab(cl->ceil);
cl->ceil = ctab;
sch_tree_unlock(sch);
 
*arg = (unsigned long)cl;
return 0;
 
failure:
if (rtab) qdisc_put_rtab(rtab);
if (ctab) qdisc_put_rtab(ctab);
return err;
}
 
static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
{
struct htb_sched *q = (struct htb_sched *)sch->data;
struct htb_class *cl = (struct htb_class *)arg;
struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;
HTB_DBG(0,2,"htb_tcf q=%p clid=%X fref=%d fl=%p\n",q,cl?cl->classid:0,cl?cl->filter_cnt:q->filter_cnt,*fl);
return fl;
}
 
static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
struct htb_sched *q = (struct htb_sched *)sch->data;
struct htb_class *cl = htb_find (classid,sch);
HTB_DBG(0,2,"htb_bind q=%p clid=%X cl=%p fref=%d\n",q,classid,cl,cl?cl->filter_cnt:q->filter_cnt);
/*if (cl && !cl->level) return 0;
The line above used to be there to prevent attaching filters to
leaves. But at least the tc_index filter uses this just to get the
class for other reasons, so we have to allow for it.
----
19.6.2002 As Werner explained it is ok - bind_filter is just
another way to "lock" the class - unlike "get", this lock can
be broken by the class during destroy, IIUC.
*/
if (cl)
cl->filter_cnt++;
else
q->filter_cnt++;
return (unsigned long)cl;
}
 
static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
struct htb_sched *q = (struct htb_sched *)sch->data;
struct htb_class *cl = (struct htb_class *)arg;
HTB_DBG(0,2,"htb_unbind q=%p cl=%p fref=%d\n",q,cl,cl?cl->filter_cnt:q->filter_cnt);
if (cl)
cl->filter_cnt--;
else
q->filter_cnt--;
}
 
static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct htb_sched *q = (struct htb_sched *)sch->data;
int i;
 
if (arg->stop)
return;
 
for (i = 0; i < HTB_HSIZE; i++) {
struct list_head *p;
list_for_each (p,q->hash+i) {
struct htb_class *cl = list_entry(p,struct htb_class,hlist);
if (arg->count < arg->skip) {
arg->count++;
continue;
}
if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
arg->stop = 1;
return;
}
arg->count++;
}
}
}
 
static struct Qdisc_class_ops htb_class_ops =
{
htb_graft,
htb_leaf,
htb_get,
htb_put,
htb_change_class,
htb_delete,
htb_walk,
 
htb_find_tcf,
htb_bind_filter,
htb_unbind_filter,
 
htb_dump_class,
};
 
struct Qdisc_ops htb_qdisc_ops =
{
NULL,
&htb_class_ops,
"htb",
sizeof(struct htb_sched),
 
htb_enqueue,
htb_dequeue,
htb_requeue,
htb_drop,
 
htb_init,
htb_reset,
htb_destroy,
NULL /* htb_change */,
 
htb_dump,
};
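 
/* Configuration sketch (illustrative only; device name, handles and rates
 * are arbitrary examples) as it would be entered with an HTB-aware tc:
 *
 *	tc qdisc add dev eth0 root handle 1: htb default 20
 *	tc class add dev eth0 parent 1: classid 1:1 htb rate 1mbit ceil 1mbit
 *	tc class add dev eth0 parent 1:1 classid 1:10 htb rate 600kbit ceil 1mbit
 *	tc class add dev eth0 parent 1:1 classid 1:20 htb rate 400kbit ceil 1mbit
 *
 * Unclassified traffic falls into 1:20 via "default"; both leaves may
 * borrow up to the 1 mbit ceiling of their parent. */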
 
#ifdef MODULE
int init_module(void)
{
return register_qdisc(&htb_qdisc_ops);
}
 
void cleanup_module(void)
{
unregister_qdisc(&htb_qdisc_ops);
}
MODULE_LICENSE("GPL");
#endif
/police.c
0,0 → 1,251
/*
* net/sched/police.c Input police filter.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
 
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/config.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
 
#define L2T(p,L) ((p)->R_tab->data[(L)>>(p)->R_tab->rate.cell_log])
#define L2T_P(p,L) ((p)->P_tab->data[(L)>>(p)->P_tab->rate.cell_log])
 
static u32 idx_gen;
static struct tcf_police *tcf_police_ht[16];
/* Policer hash table lock */
static rwlock_t police_lock = RW_LOCK_UNLOCKED;
 
/* Each policer is serialized by its individual spinlock */
 
static __inline__ unsigned tcf_police_hash(u32 index)
{
return index&0xF;
}
 
static __inline__ struct tcf_police * tcf_police_lookup(u32 index)
{
struct tcf_police *p;
 
read_lock(&police_lock);
for (p = tcf_police_ht[tcf_police_hash(index)]; p; p = p->next) {
if (p->index == index)
break;
}
read_unlock(&police_lock);
return p;
}
 
static __inline__ u32 tcf_police_new_index(void)
{
do {
if (++idx_gen == 0)
idx_gen = 1;
} while (tcf_police_lookup(idx_gen));
 
return idx_gen;
}
 
 
void tcf_police_destroy(struct tcf_police *p)
{
unsigned h = tcf_police_hash(p->index);
struct tcf_police **p1p;
for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->next) {
if (*p1p == p) {
write_lock_bh(&police_lock);
*p1p = p->next;
write_unlock_bh(&police_lock);
#ifdef CONFIG_NET_ESTIMATOR
qdisc_kill_estimator(&p->stats);
#endif
if (p->R_tab)
qdisc_put_rtab(p->R_tab);
if (p->P_tab)
qdisc_put_rtab(p->P_tab);
kfree(p);
return;
}
}
BUG_TRAP(0);
}
 
struct tcf_police * tcf_police_locate(struct rtattr *rta, struct rtattr *est)
{
unsigned h;
struct tcf_police *p;
struct rtattr *tb[TCA_POLICE_MAX];
struct tc_police *parm;
 
if (rtattr_parse(tb, TCA_POLICE_MAX, RTA_DATA(rta), RTA_PAYLOAD(rta)) < 0)
return NULL;
 
if (tb[TCA_POLICE_TBF-1] == NULL)
return NULL;
 
parm = RTA_DATA(tb[TCA_POLICE_TBF-1]);
 
if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) {
p->refcnt++;
return p;
}
 
p = kmalloc(sizeof(*p), GFP_KERNEL);
if (p == NULL)
return NULL;
 
memset(p, 0, sizeof(*p));
p->refcnt = 1;
spin_lock_init(&p->lock);
p->stats.lock = &p->lock;
if (parm->rate.rate) {
if ((p->R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1])) == NULL)
goto failure;
if (parm->peakrate.rate &&
(p->P_tab = qdisc_get_rtab(&parm->peakrate, tb[TCA_POLICE_PEAKRATE-1])) == NULL)
goto failure;
}
if (tb[TCA_POLICE_RESULT-1])
p->result = *(int*)RTA_DATA(tb[TCA_POLICE_RESULT-1]);
#ifdef CONFIG_NET_ESTIMATOR
if (tb[TCA_POLICE_AVRATE-1])
p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]);
#endif
p->toks = p->burst = parm->burst;
p->mtu = parm->mtu;
if (p->mtu == 0) {
p->mtu = ~0;
if (p->R_tab)
p->mtu = 255<<p->R_tab->rate.cell_log;
}
if (p->P_tab)
p->ptoks = L2T_P(p, p->mtu);
PSCHED_GET_TIME(p->t_c);
p->index = parm->index ? : tcf_police_new_index();
p->action = parm->action;
#ifdef CONFIG_NET_ESTIMATOR
if (est)
qdisc_new_estimator(&p->stats, est);
#endif
h = tcf_police_hash(p->index);
write_lock_bh(&police_lock);
p->next = tcf_police_ht[h];
tcf_police_ht[h] = p;
write_unlock_bh(&police_lock);
return p;
 
failure:
if (p->R_tab)
qdisc_put_rtab(p->R_tab);
kfree(p);
return NULL;
}
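 
/* Note: policers are shared objects.  If parm->index names an existing
 * policer, tcf_police_locate() above only bumps its refcount and returns
 * it; if no index is given, a fresh one is allocated with
 * tcf_police_new_index(), and the new policer is hashed into
 * tcf_police_ht[] under police_lock. */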
 
int tcf_police(struct sk_buff *skb, struct tcf_police *p)
{
psched_time_t now;
long toks;
long ptoks = 0;
 
spin_lock(&p->lock);
 
p->stats.bytes += skb->len;
p->stats.packets++;
 
#ifdef CONFIG_NET_ESTIMATOR
if (p->ewma_rate && p->stats.bps >= p->ewma_rate) {
p->stats.overlimits++;
spin_unlock(&p->lock);
return p->action;
}
#endif
 
if (skb->len <= p->mtu) {
if (p->R_tab == NULL) {
spin_unlock(&p->lock);
return p->result;
}
 
PSCHED_GET_TIME(now);
 
toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst, 0);
 
if (p->P_tab) {
ptoks = toks + p->ptoks;
if (ptoks > (long)L2T_P(p, p->mtu))
ptoks = (long)L2T_P(p, p->mtu);
ptoks -= L2T_P(p, skb->len);
}
toks += p->toks;
if (toks > (long)p->burst)
toks = p->burst;
toks -= L2T(p, skb->len);
 
if ((toks|ptoks) >= 0) {
p->t_c = now;
p->toks = toks;
p->ptoks = ptoks;
spin_unlock(&p->lock);
return p->result;
}
}
 
p->stats.overlimits++;
spin_unlock(&p->lock);
return p->action;
}
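 
/* Note on the conformance test in tcf_police() above: (toks|ptoks) >= 0 is
 * a branch-free way of testing "toks >= 0 && ptoks >= 0", since the bitwise
 * OR has its sign bit set iff at least one operand is negative.  A packet
 * therefore conforms only when both the rate bucket and, if configured, the
 * peak-rate bucket hold enough tokens; otherwise it counts as an overlimit
 * and p->action is returned. */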
 
int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p)
{
unsigned char *b = skb->tail;
struct tc_police opt;
 
opt.index = p->index;
opt.action = p->action;
opt.mtu = p->mtu;
opt.burst = p->burst;
if (p->R_tab)
opt.rate = p->R_tab->rate;
else
memset(&opt.rate, 0, sizeof(opt.rate));
if (p->P_tab)
opt.peakrate = p->P_tab->rate;
else
memset(&opt.peakrate, 0, sizeof(opt.peakrate));
RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt);
if (p->result)
RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result);
#ifdef CONFIG_NET_ESTIMATOR
if (p->ewma_rate)
RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate);
#endif
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
/cls_fw.c
0,0 → 1,379
/*
* net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel).
*/
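 
/*
 * Usage sketch (illustrative only; mark value, device and classid are
 * arbitrary examples): packets are marked by the firewall and the mark is
 * then matched against the filter handle, e.g.
 *
 *	iptables -t mangle -A PREROUTING -p tcp --dport 80 -j MARK --set-mark 6
 *	tc filter add dev eth0 parent 1: protocol ip handle 6 fw classid 1:30
 */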
 
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <linux/netfilter.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
 
struct fw_head
{
struct fw_filter *ht[256];
};
 
struct fw_filter
{
struct fw_filter *next;
u32 id;
struct tcf_result res;
#ifdef CONFIG_NET_CLS_POLICE
struct tcf_police *police;
#endif
};
 
static __inline__ int fw_hash(u32 handle)
{
return handle&0xFF;
}
 
static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res)
{
struct fw_head *head = (struct fw_head*)tp->root;
struct fw_filter *f;
#ifdef CONFIG_NETFILTER
u32 id = skb->nfmark;
#else
u32 id = 0;
#endif
 
if (head == NULL)
goto old_method;
 
for (f=head->ht[fw_hash(id)]; f; f=f->next) {
if (f->id == id) {
*res = f->res;
#ifdef CONFIG_NET_CLS_POLICE
if (f->police)
return tcf_police(skb, f->police);
#endif
return 0;
}
}
return -1;
 
old_method:
if (id && (TC_H_MAJ(id) == 0 ||
!(TC_H_MAJ(id^tp->q->handle)))) {
res->classid = id;
res->class = 0;
return 0;
}
return -1;
}
 
static unsigned long fw_get(struct tcf_proto *tp, u32 handle)
{
struct fw_head *head = (struct fw_head*)tp->root;
struct fw_filter *f;
 
if (head == NULL)
return 0;
 
for (f=head->ht[fw_hash(handle)]; f; f=f->next) {
if (f->id == handle)
return (unsigned long)f;
}
return 0;
}
 
static void fw_put(struct tcf_proto *tp, unsigned long f)
{
}
 
static int fw_init(struct tcf_proto *tp)
{
MOD_INC_USE_COUNT;
return 0;
}
 
static void fw_destroy(struct tcf_proto *tp)
{
struct fw_head *head = (struct fw_head*)xchg(&tp->root, NULL);
struct fw_filter *f;
int h;
 
if (head == NULL) {
MOD_DEC_USE_COUNT;
return;
}
 
for (h=0; h<256; h++) {
while ((f=head->ht[h]) != NULL) {
unsigned long cl;
head->ht[h] = f->next;
 
if ((cl = __cls_set_class(&f->res.class, 0)) != 0)
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
#ifdef CONFIG_NET_CLS_POLICE
tcf_police_release(f->police);
#endif
kfree(f);
}
}
kfree(head);
MOD_DEC_USE_COUNT;
}
 
static int fw_delete(struct tcf_proto *tp, unsigned long arg)
{
struct fw_head *head = (struct fw_head*)tp->root;
struct fw_filter *f = (struct fw_filter*)arg;
struct fw_filter **fp;
 
if (head == NULL || f == NULL)
return -EINVAL;
 
for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) {
if (*fp == f) {
unsigned long cl;
 
tcf_tree_lock(tp);
*fp = f->next;
tcf_tree_unlock(tp);
 
if ((cl = cls_set_class(tp, &f->res.class, 0)) != 0)
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
#ifdef CONFIG_NET_CLS_POLICE
tcf_police_release(f->police);
#endif
kfree(f);
return 0;
}
}
return -EINVAL;
}
 
static int fw_change(struct tcf_proto *tp, unsigned long base,
u32 handle,
struct rtattr **tca,
unsigned long *arg)
{
struct fw_head *head = (struct fw_head*)tp->root;
struct fw_filter *f;
struct rtattr *opt = tca[TCA_OPTIONS-1];
struct rtattr *tb[TCA_FW_MAX];
int err;
 
if (!opt)
return handle ? -EINVAL : 0;
 
if (rtattr_parse(tb, TCA_FW_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0)
return -EINVAL;
 
if ((f = (struct fw_filter*)*arg) != NULL) {
/* Node exists: adjust only classid */
 
if (f->id != handle && handle)
return -EINVAL;
if (tb[TCA_FW_CLASSID-1]) {
unsigned long cl;
 
f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]);
cl = tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid);
cl = cls_set_class(tp, &f->res.class, cl);
if (cl)
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
}
#ifdef CONFIG_NET_CLS_POLICE
if (tb[TCA_FW_POLICE-1]) {
struct tcf_police *police = tcf_police_locate(tb[TCA_FW_POLICE-1], tca[TCA_RATE-1]);
 
tcf_tree_lock(tp);
police = xchg(&f->police, police);
tcf_tree_unlock(tp);
 
tcf_police_release(police);
}
#endif
return 0;
}
 
if (!handle)
return -EINVAL;
 
if (head == NULL) {
head = kmalloc(sizeof(struct fw_head), GFP_KERNEL);
if (head == NULL)
return -ENOBUFS;
memset(head, 0, sizeof(*head));
 
tcf_tree_lock(tp);
tp->root = head;
tcf_tree_unlock(tp);
}
 
f = kmalloc(sizeof(struct fw_filter), GFP_KERNEL);
if (f == NULL)
return -ENOBUFS;
memset(f, 0, sizeof(*f));
 
f->id = handle;
 
if (tb[TCA_FW_CLASSID-1]) {
err = -EINVAL;
if (RTA_PAYLOAD(tb[TCA_FW_CLASSID-1]) != 4)
goto errout;
f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]);
cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid));
}
 
#ifdef CONFIG_NET_CLS_POLICE
if (tb[TCA_FW_POLICE-1])
f->police = tcf_police_locate(tb[TCA_FW_POLICE-1], tca[TCA_RATE-1]);
#endif
 
f->next = head->ht[fw_hash(handle)];
tcf_tree_lock(tp);
head->ht[fw_hash(handle)] = f;
tcf_tree_unlock(tp);
 
*arg = (unsigned long)f;
return 0;
 
errout:
if (f)
kfree(f);
return err;
}
 
static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct fw_head *head = (struct fw_head*)tp->root;
int h;
 
if (head == NULL)
arg->stop = 1;
 
if (arg->stop)
return;
 
for (h = 0; h < 256; h++) {
struct fw_filter *f;
 
for (f = head->ht[h]; f; f = f->next) {
if (arg->count < arg->skip) {
arg->count++;
continue;
}
if (arg->fn(tp, (unsigned long)f, arg) < 0) {
arg->stop = 1;
break;
}
arg->count++;
}
}
}
 
static int fw_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct fw_filter *f = (struct fw_filter*)fh;
unsigned char *b = skb->tail;
struct rtattr *rta;
 
if (f == NULL)
return skb->len;
 
t->tcm_handle = f->id;
 
if (!f->res.classid
#ifdef CONFIG_NET_CLS_POLICE
&& !f->police
#endif
)
return skb->len;
 
rta = (struct rtattr*)b;
RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
 
if (f->res.classid)
RTA_PUT(skb, TCA_FW_CLASSID, 4, &f->res.classid);
#ifdef CONFIG_NET_CLS_POLICE
if (f->police) {
struct rtattr * p_rta = (struct rtattr*)skb->tail;
 
RTA_PUT(skb, TCA_FW_POLICE, 0, NULL);
 
if (tcf_police_dump(skb, f->police) < 0)
goto rtattr_failure;
 
p_rta->rta_len = skb->tail - (u8*)p_rta;
}
#endif
 
rta->rta_len = skb->tail - b;
#ifdef CONFIG_NET_CLS_POLICE
if (f->police) {
if (qdisc_copy_stats(skb, &f->police->stats))
goto rtattr_failure;
}
#endif
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
struct tcf_proto_ops cls_fw_ops = {
NULL,
"fw",
fw_classify,
fw_init,
fw_destroy,
 
fw_get,
fw_put,
fw_change,
fw_delete,
fw_walk,
fw_dump
};
 
#ifdef MODULE
int init_module(void)
{
return register_tcf_proto_ops(&cls_fw_ops);
}
 
void cleanup_module(void)
{
unregister_tcf_proto_ops(&cls_fw_ops);
}
#endif
MODULE_LICENSE("GPL");
/sch_atm.c
0,0 → 1,718
/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */
 
/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
 
 
#include <linux/config.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/interrupt.h>
#include <linux/atmdev.h>
#include <linux/atmclip.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/file.h> /* for fput */
#include <net/pkt_sched.h>
#include <net/sock.h>
 
 
extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */
#define sockfd_put(sock) fput((sock)->file) /* @@@ copied because it's
__inline__ in socket.c */
 
 
#if 0 /* control */
#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define DPRINTK(format,args...)
#endif
 
#if 0 /* data */
#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define D2PRINTK(format,args...)
#endif
 
 
/*
* The ATM queuing discipline provides a framework for invoking classifiers
* (aka "filters"), which in turn select classes of this queuing discipline.
* Each class maps the flow(s) it is handling to a given VC. Multiple classes
* may share the same VC.
*
* When creating a class, VCs are specified by passing the number of the open
* socket descriptor by which the calling process references the VC. The kernel
* keeps the VC open at least until all classes using it are removed.
*
* In this file, most functions are named atm_tc_* to avoid confusion with all
* the atm_* in net/atm. This naming convention differs from what's used in the
* rest of net/sched.
*
* Known bugs:
* - sometimes messes up the IP stack
* - any manipulations besides the few operations described in the README are
* untested and likely to crash the system
* - should lock the flow while there is data in the queue (?)
*/
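 
/*
 * Rough data path, as implemented below: atm_tc_enqueue() classifies the
 * skb to a flow and queues it on that flow's inner qdisc; sch_atm_dequeue(),
 * run from a tasklet, then pulls packets from each flow's qdisc, prepends
 * the configured header and hands them to the VC via vcc->send().  Only
 * packets on the unclassified "link" flow are returned through the regular
 * atm_tc_dequeue() path.
 */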
 
 
#define PRIV(sch) ((struct atm_qdisc_data *) (sch)->data)
#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back))
 
 
struct atm_flow_data {
struct Qdisc *q; /* FIFO, TBF, etc. */
struct tcf_proto *filter_list;
struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */
void (*old_pop)(struct atm_vcc *vcc,struct sk_buff *skb); /* chaining */
struct atm_qdisc_data *parent; /* parent qdisc */
struct socket *sock; /* for closing */
u32 classid; /* x:y type ID */
int ref; /* reference count */
struct tc_stats stats;
struct atm_flow_data *next;
struct atm_flow_data *excess; /* flow for excess traffic;
NULL to set CLP instead */
int hdr_len;
unsigned char hdr[0]; /* header data; MUST BE LAST */
};
 
struct atm_qdisc_data {
struct atm_flow_data link; /* unclassified skbs go here */
struct atm_flow_data *flows; /* NB: "link" is also on this
list */
struct tasklet_struct task; /* requeue tasklet */
};
 
 
/* ------------------------- Class/flow operations ------------------------- */
 
 
static int find_flow(struct atm_qdisc_data *qdisc,struct atm_flow_data *flow)
{
struct atm_flow_data *walk;
 
DPRINTK("find_flow(qdisc %p,flow %p)\n",qdisc,flow);
for (walk = qdisc->flows; walk; walk = walk->next)
if (walk == flow) return 1;
DPRINTK("find_flow: not found\n");
return 0;
}
 
 
static __inline__ struct atm_flow_data *lookup_flow(struct Qdisc *sch,
u32 classid)
{
struct atm_flow_data *flow;
 
for (flow = PRIV(sch)->flows; flow; flow = flow->next)
if (flow->classid == classid) break;
return flow;
}
 
 
static int atm_tc_graft(struct Qdisc *sch,unsigned long arg,
struct Qdisc *new,struct Qdisc **old)
{
struct atm_qdisc_data *p = PRIV(sch);
struct atm_flow_data *flow = (struct atm_flow_data *) arg;
 
DPRINTK("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",sch,
p,flow,new,old);
if (!find_flow(p,flow)) return -EINVAL;
if (!new) new = &noop_qdisc;
*old = xchg(&flow->q,new);
if (*old) qdisc_reset(*old);
return 0;
}
 
 
static struct Qdisc *atm_tc_leaf(struct Qdisc *sch,unsigned long cl)
{
struct atm_flow_data *flow = (struct atm_flow_data *) cl;
 
DPRINTK("atm_tc_leaf(sch %p,flow %p)\n",sch,flow);
return flow ? flow->q : NULL;
}
 
 
static unsigned long atm_tc_get(struct Qdisc *sch,u32 classid)
{
struct atm_qdisc_data *p __attribute__((unused)) = PRIV(sch);
struct atm_flow_data *flow;
 
DPRINTK("atm_tc_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid);
flow = lookup_flow(sch,classid);
if (flow) flow->ref++;
DPRINTK("atm_tc_get: flow %p\n",flow);
return (unsigned long) flow;
}
 
 
static unsigned long atm_tc_bind_filter(struct Qdisc *sch,
unsigned long parent, u32 classid)
{
return atm_tc_get(sch,classid);
}
 
 
static void destroy_filters(struct atm_flow_data *flow)
{
struct tcf_proto *filter;
 
while ((filter = flow->filter_list)) {
DPRINTK("destroy_filters: destroying filter %p\n",filter);
flow->filter_list = filter->next;
tcf_destroy(filter);
}
}
 
 
/*
* atm_tc_put handles all destructions, including the ones that are explicitly
* requested (atm_tc_destroy, etc.). The assumption here is that we never drop
* anything that still seems to be in use.
*/
 
static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
{
struct atm_qdisc_data *p = PRIV(sch);
struct atm_flow_data *flow = (struct atm_flow_data *) cl;
struct atm_flow_data **prev;
 
DPRINTK("atm_tc_put(sch %p,[qdisc %p],flow %p)\n",sch,p,flow);
if (--flow->ref) return;
DPRINTK("atm_tc_put: destroying\n");
for (prev = &p->flows; *prev; prev = &(*prev)->next)
if (*prev == flow) break;
if (!*prev) {
printk(KERN_CRIT "atm_tc_put: class %p not found\n",flow);
return;
}
*prev = flow->next;
DPRINTK("atm_tc_put: qdisc %p\n",flow->q);
qdisc_destroy(flow->q);
destroy_filters(flow);
if (flow->sock) {
DPRINTK("atm_tc_put: f_count %d\n",
file_count(flow->sock->file));
flow->vcc->pop = flow->old_pop;
sockfd_put(flow->sock);
}
if (flow->excess) atm_tc_put(sch,(unsigned long) flow->excess);
if (flow != &p->link) kfree(flow);
/*
* If flow == &p->link, the qdisc no longer works at this point and
* needs to be removed. (By the caller of atm_tc_put.)
*/
}
 
 
static void sch_atm_pop(struct atm_vcc *vcc,struct sk_buff *skb)
{
struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent;
 
D2PRINTK("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n",vcc,skb,p);
VCC2FLOW(vcc)->old_pop(vcc,skb);
tasklet_schedule(&p->task);
}
 
 
static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
struct rtattr **tca, unsigned long *arg)
{
struct atm_qdisc_data *p = PRIV(sch);
struct atm_flow_data *flow = (struct atm_flow_data *) *arg;
struct atm_flow_data *excess = NULL;
struct rtattr *opt = tca[TCA_OPTIONS-1];
struct rtattr *tb[TCA_ATM_MAX];
struct socket *sock;
int fd,error,hdr_len;
void *hdr;
 
DPRINTK("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x,"
"flow %p,opt %p)\n",sch,p,classid,parent,flow,opt);
/*
* The concept of parents doesn't apply for this qdisc.
*/
if (parent && parent != TC_H_ROOT && parent != sch->handle)
return -EINVAL;
/*
* ATM classes cannot be changed. In order to change properties of the
* ATM connection, that socket needs to be modified directly (via the
* native ATM API). In order to send a flow to a different VC, the old
* class needs to be removed and a new one added. (This may be changed
* later.)
*/
if (flow) return -EBUSY;
if (opt == NULL || rtattr_parse(tb,TCA_ATM_MAX,RTA_DATA(opt),
RTA_PAYLOAD(opt))) return -EINVAL;
if (!tb[TCA_ATM_FD-1] || RTA_PAYLOAD(tb[TCA_ATM_FD-1]) < sizeof(fd))
return -EINVAL;
fd = *(int *) RTA_DATA(tb[TCA_ATM_FD-1]);
DPRINTK("atm_tc_change: fd %d\n",fd);
if (tb[TCA_ATM_HDR-1]) {
hdr_len = RTA_PAYLOAD(tb[TCA_ATM_HDR-1]);
hdr = RTA_DATA(tb[TCA_ATM_HDR-1]);
}
else {
hdr_len = RFC1483LLC_LEN;
hdr = NULL; /* default LLC/SNAP for IP */
}
if (!tb[TCA_ATM_EXCESS-1]) excess = NULL;
else {
if (RTA_PAYLOAD(tb[TCA_ATM_EXCESS-1]) != sizeof(u32))
return -EINVAL;
excess = (struct atm_flow_data *) atm_tc_get(sch,
*(u32 *) RTA_DATA(tb[TCA_ATM_EXCESS-1]));
if (!excess) return -ENOENT;
}
DPRINTK("atm_tc_change: type %d, payload %d, hdr_len %d\n",
opt->rta_type,RTA_PAYLOAD(opt),hdr_len);
if (!(sock = sockfd_lookup(fd,&error))) return error; /* f_count++ */
DPRINTK("atm_tc_change: f_count %d\n",file_count(sock->file));
if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) {
error = -EPROTOTYPE;
goto err_out;
}
/* @@@ should check if the socket is really operational or we'll crash
on vcc->send */
if (classid) {
if (TC_H_MAJ(classid ^ sch->handle)) {
DPRINTK("atm_tc_change: classid mismatch\n");
error = -EINVAL;
goto err_out;
}
if (find_flow(p,flow)) {
error = -EEXIST;
goto err_out;
}
}
else {
int i;
unsigned long cl;
 
for (i = 1; i < 0x8000; i++) {
classid = TC_H_MAKE(sch->handle,0x8000 | i);
if (!(cl = atm_tc_get(sch,classid))) break;
atm_tc_put(sch,cl);
}
}
DPRINTK("atm_tc_change: new id %x\n",classid);
flow = kmalloc(sizeof(struct atm_flow_data)+hdr_len,GFP_KERNEL);
DPRINTK("atm_tc_change: flow %p\n",flow);
if (!flow) {
error = -ENOBUFS;
goto err_out;
}
memset(flow,0,sizeof(*flow));
flow->filter_list = NULL;
if (!(flow->q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops)))
flow->q = &noop_qdisc;
DPRINTK("atm_tc_change: qdisc %p\n",flow->q);
flow->sock = sock;
flow->vcc = ATM_SD(sock); /* speedup */
flow->vcc->user_back = flow;
DPRINTK("atm_tc_change: vcc %p\n",flow->vcc);
flow->old_pop = flow->vcc->pop;
flow->parent = p;
flow->vcc->pop = sch_atm_pop;
flow->classid = classid;
flow->ref = 1;
flow->excess = excess;
flow->next = p->link.next;
p->link.next = flow;
flow->hdr_len = hdr_len;
if (hdr) memcpy(flow->hdr,hdr,hdr_len);
else {
memcpy(flow->hdr,llc_oui,sizeof(llc_oui));
((u16 *) flow->hdr)[3] = htons(ETH_P_IP);
}
*arg = (unsigned long) flow;
return 0;
err_out:
if (excess) atm_tc_put(sch,(unsigned long) excess);
sockfd_put(sock);
return error;
}
 
 
static int atm_tc_delete(struct Qdisc *sch,unsigned long arg)
{
struct atm_qdisc_data *p = PRIV(sch);
struct atm_flow_data *flow = (struct atm_flow_data *) arg;
 
DPRINTK("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n",sch,p,flow);
if (!find_flow(PRIV(sch),flow)) return -EINVAL;
if (flow->filter_list || flow == &p->link) return -EBUSY;
/*
* Reference count must be 2: one for "keepalive" (set at class
* creation), and one for the reference held when calling delete.
*/
if (flow->ref < 2) {
printk(KERN_ERR "atm_tc_delete: flow->ref == %d\n",flow->ref);
return -EINVAL;
}
if (flow->ref > 2) return -EBUSY; /* catch references via excess, etc.*/
atm_tc_put(sch,arg);
return 0;
}
 
 
static void atm_tc_walk(struct Qdisc *sch,struct qdisc_walker *walker)
{
struct atm_qdisc_data *p = PRIV(sch);
struct atm_flow_data *flow;
 
DPRINTK("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker);
if (walker->stop) return;
for (flow = p->flows; flow; flow = flow->next) {
if (walker->count >= walker->skip)
if (walker->fn(sch,(unsigned long) flow,walker) < 0) {
walker->stop = 1;
break;
}
walker->count++;
}
}
 
 
static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch,unsigned long cl)
{
struct atm_qdisc_data *p = PRIV(sch);
struct atm_flow_data *flow = (struct atm_flow_data *) cl;
 
DPRINTK("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n",sch,p,flow);
return flow ? &flow->filter_list : &p->link.filter_list;
}
 
 
/* --------------------------- Qdisc operations ---------------------------- */
 
 
static int atm_tc_enqueue(struct sk_buff *skb,struct Qdisc *sch)
{
struct atm_qdisc_data *p = PRIV(sch);
struct atm_flow_data *flow = NULL ; /* @@@ */
struct tcf_result res;
int result;
int ret = NET_XMIT_POLICED;
 
D2PRINTK("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p);
result = TC_POLICE_OK; /* be nice to gcc */
if (TC_H_MAJ(skb->priority) != sch->handle ||
!(flow = (struct atm_flow_data *) atm_tc_get(sch,skb->priority)))
for (flow = p->flows; flow; flow = flow->next)
if (flow->filter_list) {
result = tc_classify(skb,flow->filter_list,
&res);
if (result < 0) continue;
flow = (struct atm_flow_data *) res.class;
if (!flow) flow = lookup_flow(sch,res.classid);
break;
}
if (!flow) flow = &p->link;
else {
if (flow->vcc)
ATM_SKB(skb)->atm_options = flow->vcc->atm_options;
/*@@@ looks good ... but it's not supposed to work :-)*/
#ifdef CONFIG_NET_CLS_POLICE
switch (result) {
case TC_POLICE_SHOT:
kfree_skb(skb);
break;
case TC_POLICE_RECLASSIFY:
if (flow->excess) flow = flow->excess;
else {
ATM_SKB(skb)->atm_options |=
ATM_ATMOPT_CLP;
break;
}
/* fall through */
case TC_POLICE_OK:
/* fall through */
default:
break;
}
#endif
}
if (
#ifdef CONFIG_NET_CLS_POLICE
result == TC_POLICE_SHOT ||
#endif
(ret = flow->q->enqueue(skb,flow->q)) != 0) {
sch->stats.drops++;
if (flow) flow->stats.drops++;
return ret;
}
sch->stats.bytes += skb->len;
sch->stats.packets++;
flow->stats.bytes += skb->len;
flow->stats.packets++;
/*
* Okay, this may seem weird. We pretend we've dropped the packet if
* it goes via ATM. The reason for this is that the outer qdisc
* expects to be able to q->dequeue the packet later on if we return
* success at this place. Also, sch->q.qlen needs to reflect whether
* there is a packet eligible for dequeuing or not. Note that the
* statistics of the outer qdisc are necessarily wrong because of all
* this. There's currently no correct solution for this.
*/
if (flow == &p->link) {
sch->q.qlen++;
return 0;
}
tasklet_schedule(&p->task);
return NET_XMIT_BYPASS;
}
 
 
/*
* Dequeue packets and send them over ATM. Note that we quite deliberately
* avoid checking net_device's flow control here, simply because sch_atm
* uses its own channels, which have nothing to do with any CLIP/LANE/or
* non-ATM interfaces.
*/
 
 
static void sch_atm_dequeue(unsigned long data)
{
struct Qdisc *sch = (struct Qdisc *) data;
struct atm_qdisc_data *p = PRIV(sch);
struct atm_flow_data *flow;
struct sk_buff *skb;
 
D2PRINTK("sch_atm_dequeue(sch %p,[qdisc %p])\n",sch,p);
for (flow = p->link.next; flow; flow = flow->next)
/*
* If traffic is properly shaped, this won't generate nasty
* little bursts. Otherwise, it may ... (but that's okay)
*/
while ((skb = flow->q->dequeue(flow->q))) {
if (!atm_may_send(flow->vcc,skb->truesize)) {
(void) flow->q->ops->requeue(skb,flow->q);
break;
}
D2PRINTK("atm_tc_deqeueue: sending on class %p\n",flow);
/* remove any LL header somebody else has attached */
skb_pull(skb,(char *) skb->nh.iph-(char *) skb->data);
if (skb_headroom(skb) < flow->hdr_len) {
struct sk_buff *new;
 
new = skb_realloc_headroom(skb,flow->hdr_len);
dev_kfree_skb(skb);
if (!new) continue;
skb = new;
}
D2PRINTK("sch_atm_dequeue: ip %p, data %p\n",
skb->nh.iph,skb->data);
ATM_SKB(skb)->vcc = flow->vcc;
memcpy(skb_push(skb,flow->hdr_len),flow->hdr,
flow->hdr_len);
atomic_add(skb->truesize,&flow->vcc->sk->wmem_alloc);
/* atm.atm_options are already set by atm_tc_enqueue */
(void) flow->vcc->send(flow->vcc,skb);
}
}
 
 
static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch)
{
struct atm_qdisc_data *p = PRIV(sch);
struct sk_buff *skb;
 
D2PRINTK("atm_tc_dequeue(sch %p,[qdisc %p])\n",sch,p);
tasklet_schedule(&p->task);
skb = p->link.q->dequeue(p->link.q);
if (skb) sch->q.qlen--;
return skb;
}
 
 
static int atm_tc_requeue(struct sk_buff *skb,struct Qdisc *sch)
{
struct atm_qdisc_data *p = PRIV(sch);
int ret;
 
D2PRINTK("atm_tc_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p);
ret = p->link.q->ops->requeue(skb,p->link.q);
if (!ret) sch->q.qlen++;
else {
sch->stats.drops++;
p->link.stats.drops++;
}
return ret;
}
 
 
static unsigned int atm_tc_drop(struct Qdisc *sch)
{
struct atm_qdisc_data *p = PRIV(sch);
struct atm_flow_data *flow;
unsigned int len;
 
DPRINTK("atm_tc_drop(sch %p,[qdisc %p])\n",sch,p);
for (flow = p->flows; flow; flow = flow->next)
if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q)))
return len;
return 0;
}
 
 
static int atm_tc_init(struct Qdisc *sch,struct rtattr *opt)
{
struct atm_qdisc_data *p = PRIV(sch);
 
DPRINTK("atm_tc_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt);
memset(p,0,sizeof(*p));
p->flows = &p->link;
if(!(p->link.q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops)))
p->link.q = &noop_qdisc;
DPRINTK("atm_tc_init: link (%p) qdisc %p\n",&p->link,p->link.q);
p->link.filter_list = NULL;
p->link.vcc = NULL;
p->link.sock = NULL;
p->link.classid = sch->handle;
p->link.ref = 1;
p->link.next = NULL;
tasklet_init(&p->task,sch_atm_dequeue,(unsigned long) sch);
MOD_INC_USE_COUNT;
return 0;
}
 
 
static void atm_tc_reset(struct Qdisc *sch)
{
struct atm_qdisc_data *p = PRIV(sch);
struct atm_flow_data *flow;
 
DPRINTK("atm_tc_reset(sch %p,[qdisc %p])\n",sch,p);
for (flow = p->flows; flow; flow = flow->next) qdisc_reset(flow->q);
sch->q.qlen = 0;
}
 
 
static void atm_tc_destroy(struct Qdisc *sch)
{
struct atm_qdisc_data *p = PRIV(sch);
struct atm_flow_data *flow;
 
DPRINTK("atm_tc_destroy(sch %p,[qdisc %p])\n",sch,p);
/* races ? */
while ((flow = p->flows)) {
destroy_filters(flow);
if (flow->ref > 1)
printk(KERN_ERR "atm_destroy: %p->ref = %d\n",flow,
flow->ref);
atm_tc_put(sch,(unsigned long) flow);
if (p->flows == flow) {
printk(KERN_ERR "atm_destroy: putting flow %p didn't "
"kill it\n",flow);
p->flows = flow->next; /* brute force */
break;
}
}
tasklet_kill(&p->task);
MOD_DEC_USE_COUNT;
}
 
 
static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct atm_qdisc_data *p = PRIV(sch);
struct atm_flow_data *flow = (struct atm_flow_data *) cl;
unsigned char *b = skb->tail;
struct rtattr *rta;
 
DPRINTK("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n",
sch,p,flow,skb,tcm);
if (!find_flow(p,flow)) return -EINVAL;
tcm->tcm_handle = flow->classid;
rta = (struct rtattr *) b;
RTA_PUT(skb,TCA_OPTIONS,0,NULL);
RTA_PUT(skb,TCA_ATM_HDR,flow->hdr_len,flow->hdr);
if (flow->vcc) {
struct sockaddr_atmpvc pvc;
int state;
 
pvc.sap_family = AF_ATMPVC;
pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1;
pvc.sap_addr.vpi = flow->vcc->vpi;
pvc.sap_addr.vci = flow->vcc->vci;
RTA_PUT(skb,TCA_ATM_ADDR,sizeof(pvc),&pvc);
state = ATM_VF2VS(flow->vcc->flags);
RTA_PUT(skb,TCA_ATM_STATE,sizeof(state),&state);
}
if (flow->excess)
RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(u32),&flow->classid);
else {
static u32 zero = 0;
 
RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(zero),&zero);
}
rta->rta_len = skb->tail-b;
return skb->len;
 
rtattr_failure:
skb_trim(skb,b-skb->data);
return -1;
}
 
static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb)
{
return 0;
}
 
static struct Qdisc_class_ops atm_class_ops =
{
atm_tc_graft, /* graft */
atm_tc_leaf, /* leaf */
atm_tc_get, /* get */
atm_tc_put, /* put */
atm_tc_change, /* change */
atm_tc_delete, /* delete */
atm_tc_walk, /* walk */
 
atm_tc_find_tcf, /* tcf_chain */
atm_tc_bind_filter, /* bind_tcf */
atm_tc_put, /* unbind_tcf */
 
atm_tc_dump_class, /* dump */
};
 
struct Qdisc_ops atm_qdisc_ops =
{
NULL, /* next */
&atm_class_ops, /* cl_ops */
"atm",
sizeof(struct atm_qdisc_data),
 
atm_tc_enqueue, /* enqueue */
atm_tc_dequeue, /* dequeue */
atm_tc_requeue, /* requeue */
atm_tc_drop, /* drop */
 
atm_tc_init, /* init */
atm_tc_reset, /* reset */
atm_tc_destroy, /* destroy */
NULL, /* change */
 
atm_tc_dump /* dump */
};
 
 
#ifdef MODULE
int init_module(void)
{
return register_qdisc(&atm_qdisc_ops);
}
 
 
void cleanup_module(void)
{
unregister_qdisc(&atm_qdisc_ops);
}
#endif
/sch_gred.c
0,0 → 1,637
/*
* net/sched/sch_gred.c Generic Random Early Detection queue.
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002
*
* 991129: - Bug fix with grio mode
* - a better single AvgQ mode with Grio (WRED)
* - A finer grained VQ dequeue based on a suggestion
* from Ren Liu
* - More error checks
*
*
*
* For all the glorious comments look at Alexey's sch_red.c
*/
 
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
 
#if 1 /* control */
#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define DPRINTK(format,args...)
#endif
 
#if 0 /* data */
#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define D2PRINTK(format,args...)
#endif
 
struct gred_sched_data;
struct gred_sched;
 
struct gred_sched_data
{
/* Parameters */
u32 limit; /* HARD maximal queue length */
u32 qth_min; /* Min average length threshold: A scaled */
u32 qth_max; /* Max average length threshold: A scaled */
u32 DP; /* the drop parameters */
char Wlog; /* log(W) */
char Plog; /* random number bits */
u32 Scell_max;
u32 Rmask;
u32 bytesin; /* bytes seen on virtualQ so far*/
u32 packetsin; /* packets seen on virtualQ so far*/
u32 backlog; /* bytes on the virtualQ */
u32 forced; /* packets dropped for exceeding limits */
u32 early; /* packets dropped as a warning */
u32 other; /* packets dropped by invoking drop() */
u32 pdrop; /* packets dropped because we exceeded physical queue limits */
char Scell_log;
u8 Stab[256];
u8 prio; /* the prio of this vq */
 
/* Variables */
unsigned long qave; /* Average queue length: A scaled */
int qcount; /* Packets since last random number generation */
u32 qR; /* Cached random number */
 
psched_time_t qidlestart; /* Start of idle period */
};
 
struct gred_sched
{
struct gred_sched_data *tab[MAX_DPs];
u32 DPs;
u32 def;
u8 initd;
u8 grio;
u8 eqp;
};
 
static int
gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
psched_time_t now;
struct gred_sched_data *q=NULL;
struct gred_sched *t= (struct gred_sched *)sch->data;
unsigned long qave=0;
int i=0;
 
if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) {
D2PRINTK("NO GRED Queues setup yet! Enqueued anyway\n");
goto do_enqueue;
}
 
 
if ( ((skb->tc_index&0xf) > (t->DPs -1)) || !(q=t->tab[skb->tc_index&0xf])) {
printk("GRED: setting to default (%d)\n ",t->def);
if (!(q=t->tab[t->def])) {
DPRINTK("GRED: setting to default FAILED! dropping!! "
"(%d)\n ", t->def);
goto drop;
}
/* fix tc_index? -- could be controversial but needed for
requeueing */
skb->tc_index=(skb->tc_index&0xfffffff0) | t->def;
}
 
D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d "
"general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog,
sch->stats.backlog);
/* sum up all the qaves of prios <= to ours to get the new qave*/
if (!t->eqp && t->grio) {
for (i=0;i<t->DPs;i++) {
if ((!t->tab[i]) || (i==q->DP))
continue;
if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart)))
qave +=t->tab[i]->qave;
}
}
 
q->packetsin++;
q->bytesin+=skb->len;
 
if (t->eqp && t->grio) {
qave=0;
q->qave=t->tab[t->def]->qave;
q->qidlestart=t->tab[t->def]->qidlestart;
}
 
if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
long us_idle;
PSCHED_GET_TIME(now);
us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max, 0);
PSCHED_SET_PASTPERFECT(q->qidlestart);
 
q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF];
} else {
if (t->eqp) {
q->qave += sch->stats.backlog - (q->qave >> q->Wlog);
} else {
q->qave += q->backlog - (q->qave >> q->Wlog);
}
 
}
 
if (t->eqp && t->grio)
t->tab[t->def]->qave=q->qave;
 
if ((q->qave+qave) < q->qth_min) {
q->qcount = -1;
enqueue:
if (q->backlog + skb->len <= q->limit) {
q->backlog += skb->len;
do_enqueue:
__skb_queue_tail(&sch->q, skb);
sch->stats.backlog += skb->len;
sch->stats.bytes += skb->len;
sch->stats.packets++;
return 0;
} else {
q->pdrop++;
}
 
drop:
kfree_skb(skb);
sch->stats.drops++;
return NET_XMIT_DROP;
}
if ((q->qave+qave) >= q->qth_max) {
q->qcount = -1;
sch->stats.overlimits++;
q->forced++;
goto drop;
}
if (++q->qcount) {
if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
goto enqueue;
q->qcount = 0;
q->qR = net_random()&q->Rmask;
sch->stats.overlimits++;
q->early++;
goto drop;
}
q->qR = net_random()&q->Rmask;
goto enqueue;
}
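
/*
 * Editor's illustrative sketch (not part of the original source): the
 * enqueue path above is RED applied per virtual queue. `qave` is an
 * exponentially weighted average of the backlog kept scaled by 2^Wlog,
 * the thresholds are pre-scaled (qth << Wlog), and between the two
 * thresholds a packet is dropped once enough packets have passed since
 * the last random draw `qR`. A minimal userspace sketch of just that
 * bookkeeping, assuming the same conventions; all names are illustrative.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct red_vq {
	uint32_t qave;     /* average backlog, scaled by 2^Wlog */
	uint32_t backlog;  /* current backlog in bytes */
	uint32_t qth_min;  /* min threshold, scaled by 2^Wlog */
	uint32_t qth_max;  /* max threshold, scaled by 2^Wlog */
	int      Wlog;     /* EWMA weight = 2^-Wlog */
	int      qcount;   /* packets since the last random draw */
	uint32_t qR;       /* cached random number */
	uint32_t Rmask;    /* 2^Plog - 1 */
};

/* qave += backlog - qave/2^Wlog, i.e. avg += 2^-Wlog * (backlog - avg) */
static void red_update_avg(struct red_vq *q)
{
	q->qave += q->backlog - (q->qave >> q->Wlog);
}

/* 0 = enqueue, 1 = early drop, 2 = forced drop; mirrors the tests above */
static int red_decide(struct red_vq *q)
{
	if (q->qave < q->qth_min) {
		q->qcount = -1;
		return 0;
	}
	if (q->qave >= q->qth_max) {
		q->qcount = -1;
		return 2;
	}
	if (++q->qcount) {
		if (((q->qave - q->qth_min) >> q->Wlog) * q->qcount < q->qR)
			return 0;
		q->qcount = 0;
		q->qR = rand() & q->Rmask;
		return 1;
	}
	q->qR = rand() & q->Rmask;
	return 0;
}

int main(void)
{
	struct red_vq q = { .qth_min = 5000 << 9, .qth_max = 15000 << 9,
			    .Wlog = 9, .qcount = -1, .Rmask = (1 << 8) - 1 };

	for (int i = 0; i < 100; i++) {
		q.backlog += 1000;    /* pretend a 1000-byte packet arrived */
		red_update_avg(&q);
		printf("pkt %3d avg %6u decision %d\n",
		       i, q.qave >> q.Wlog, red_decide(&q));
	}
	return 0;
}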
 
static int
gred_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
struct gred_sched_data *q;
struct gred_sched *t= (struct gred_sched *)sch->data;
q= t->tab[(skb->tc_index&0xf)];
/* error checking here -- probably unnecessary */
PSCHED_SET_PASTPERFECT(q->qidlestart);
 
__skb_queue_head(&sch->q, skb);
sch->stats.backlog += skb->len;
q->backlog += skb->len;
return 0;
}
 
static struct sk_buff *
gred_dequeue(struct Qdisc* sch)
{
struct sk_buff *skb;
struct gred_sched_data *q;
struct gred_sched *t= (struct gred_sched *)sch->data;
 
skb = __skb_dequeue(&sch->q);
if (skb) {
sch->stats.backlog -= skb->len;
q= t->tab[(skb->tc_index&0xf)];
if (q) {
q->backlog -= skb->len;
if (!q->backlog && !t->eqp)
PSCHED_GET_TIME(q->qidlestart);
} else {
D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf);
}
return skb;
}
 
if (t->eqp) {
q= t->tab[t->def];
if (!q)
D2PRINTK("no default VQ set: Results will be "
"screwed up\n");
else
PSCHED_GET_TIME(q->qidlestart);
}
 
return NULL;
}
 
static unsigned int gred_drop(struct Qdisc* sch)
{
struct sk_buff *skb;
 
struct gred_sched_data *q;
struct gred_sched *t= (struct gred_sched *)sch->data;
 
skb = __skb_dequeue_tail(&sch->q);
if (skb) {
unsigned int len = skb->len;
sch->stats.backlog -= len;
sch->stats.drops++;
q= t->tab[(skb->tc_index&0xf)];
if (q) {
q->backlog -= len;
q->other++;
if (!q->backlog && !t->eqp)
PSCHED_GET_TIME(q->qidlestart);
} else {
D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf);
}
 
kfree_skb(skb);
return len;
}
 
q=t->tab[t->def];
if (!q) {
D2PRINTK("no default VQ set: Results might be screwed up\n");
return 0;
}
 
PSCHED_GET_TIME(q->qidlestart);
return 0;
 
}
 
static void gred_reset(struct Qdisc* sch)
{
int i;
struct gred_sched_data *q;
struct gred_sched *t= (struct gred_sched *)sch->data;
 
__skb_queue_purge(&sch->q);
 
sch->stats.backlog = 0;
 
for (i=0;i<t->DPs;i++) {
q= t->tab[i];
if (!q)
continue;
PSCHED_SET_PASTPERFECT(q->qidlestart);
q->qave = 0;
q->qcount = -1;
q->backlog = 0;
q->other=0;
q->forced=0;
q->pdrop=0;
q->early=0;
}
}
 
static int gred_change(struct Qdisc *sch, struct rtattr *opt)
{
struct gred_sched *table = (struct gred_sched *)sch->data;
struct gred_sched_data *q;
struct tc_gred_qopt *ctl;
struct tc_gred_sopt *sopt;
struct rtattr *tb[TCA_GRED_STAB];
struct rtattr *tb2[TCA_GRED_DPS];
int i;
 
if (opt == NULL ||
rtattr_parse(tb, TCA_GRED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) )
return -EINVAL;
 
if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) {
rtattr_parse(tb2, TCA_GRED_DPS, RTA_DATA(opt),
RTA_PAYLOAD(opt));
 
if (tb2[TCA_GRED_DPS-1] == 0)
return -EINVAL;
 
sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]);
table->DPs=sopt->DPs;
table->def=sopt->def_DP;
table->grio=sopt->grio;
table->initd=0;
/* probably need to clear all the table DP entries as well */
MOD_INC_USE_COUNT;
return 0;
}
 
 
if (!table->DPs || tb[TCA_GRED_PARMS-1] == 0 || tb[TCA_GRED_STAB-1] == 0 ||
RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) ||
RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256)
return -EINVAL;
 
ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]);
if (ctl->DP > MAX_DPs-1 ) {
/* misbehaving is punished! Put in the default drop probability */
DPRINTK("\nGRED: DP %u not in the proper range fixed. New DP "
"set to default at %d\n",ctl->DP,table->def);
ctl->DP=table->def;
}
if (table->tab[ctl->DP] == NULL) {
table->tab[ctl->DP]=kmalloc(sizeof(struct gred_sched_data),
GFP_KERNEL);
if (NULL == table->tab[ctl->DP])
return -ENOMEM;
memset(table->tab[ctl->DP], 0, (sizeof(struct gred_sched_data)));
}
q= table->tab[ctl->DP];
 
if (table->grio) {
if (ctl->prio <=0) {
if (table->def && table->tab[table->def]) {
DPRINTK("\nGRED: DP %u does not have a prio"
"setting default to %d\n",ctl->DP,
table->tab[table->def]->prio);
q->prio=table->tab[table->def]->prio;
} else {
DPRINTK("\nGRED: DP %u does not have a prio"
" setting default to 8\n",ctl->DP);
q->prio=8;
}
} else {
q->prio=ctl->prio;
}
} else {
q->prio=8;
}
 
 
q->DP=ctl->DP;
q->Wlog = ctl->Wlog;
q->Plog = ctl->Plog;
q->limit = ctl->limit;
q->Scell_log = ctl->Scell_log;
q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
q->Scell_max = (255<<q->Scell_log);
q->qth_min = ctl->qth_min<<ctl->Wlog;
q->qth_max = ctl->qth_max<<ctl->Wlog;
q->qave=0;
q->backlog=0;
q->qcount = -1;
q->other=0;
q->forced=0;
q->pdrop=0;
q->early=0;
 
PSCHED_SET_PASTPERFECT(q->qidlestart);
memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256);
 
if ( table->initd && table->grio) {
/* this looks ugly but its not in the fast path */
for (i=0;i<table->DPs;i++) {
if ((!table->tab[i]) || (i==q->DP) )
continue;
if (table->tab[i]->prio == q->prio ){
/* WRED mode detected */
table->eqp=1;
break;
}
}
}
 
if (!table->initd) {
table->initd=1;
/*
the first entry also goes into the default until
over-written
*/
 
if (table->tab[table->def] == NULL) {
table->tab[table->def]=
kmalloc(sizeof(struct gred_sched_data), GFP_KERNEL);
if (NULL == table->tab[table->def])
return -ENOMEM;
 
memset(table->tab[table->def], 0,
(sizeof(struct gred_sched_data)));
}
q= table->tab[table->def];
q->DP=table->def;
q->Wlog = ctl->Wlog;
q->Plog = ctl->Plog;
q->limit = ctl->limit;
q->Scell_log = ctl->Scell_log;
q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
q->Scell_max = (255<<q->Scell_log);
q->qth_min = ctl->qth_min<<ctl->Wlog;
q->qth_max = ctl->qth_max<<ctl->Wlog;
 
if (table->grio)
q->prio=table->tab[ctl->DP]->prio;
else
q->prio=8;
 
q->qcount = -1;
PSCHED_SET_PASTPERFECT(q->qidlestart);
memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256);
}
return 0;
 
}
 
static int gred_init(struct Qdisc *sch, struct rtattr *opt)
{
struct gred_sched *table = (struct gred_sched *)sch->data;
struct tc_gred_sopt *sopt;
struct rtattr *tb[TCA_GRED_STAB];
struct rtattr *tb2[TCA_GRED_DPS];
 
if (opt == NULL ||
rtattr_parse(tb, TCA_GRED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) )
return -EINVAL;
 
if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0 ) {
rtattr_parse(tb2, TCA_GRED_DPS, RTA_DATA(opt),RTA_PAYLOAD(opt));
 
if (tb2[TCA_GRED_DPS-1] == 0)
return -EINVAL;
 
sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]);
table->DPs=sopt->DPs;
table->def=sopt->def_DP;
table->grio=sopt->grio;
table->initd=0;
MOD_INC_USE_COUNT;
return 0;
}
 
DPRINTK("\n GRED_INIT error!\n");
return -EINVAL;
}
 
static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
{
unsigned long qave;
struct rtattr *rta;
struct tc_gred_qopt *opt = NULL ;
struct tc_gred_qopt *dst;
struct gred_sched *table = (struct gred_sched *)sch->data;
struct gred_sched_data *q;
int i;
unsigned char *b = skb->tail;
 
rta = (struct rtattr*)b;
RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
 
opt=kmalloc(sizeof(struct tc_gred_qopt)*MAX_DPs, GFP_KERNEL);
 
if (opt == NULL) {
DPRINTK("gred_dump:failed to malloc for %Zd\n",
sizeof(struct tc_gred_qopt)*MAX_DPs);
goto rtattr_failure;
}
 
memset(opt, 0, (sizeof(struct tc_gred_qopt))*table->DPs);
 
if (!table->initd) {
DPRINTK("NO GRED Queues setup!\n");
}
 
for (i=0;i<MAX_DPs;i++) {
dst= &opt[i];
q= table->tab[i];
 
if (!q) {
/* hack -- fix at some point with a proper message.
This is how we indicate to tc that there is no VQ
at this DP. */
 
dst->DP=MAX_DPs+i;
continue;
}
 
dst->limit=q->limit;
dst->qth_min=q->qth_min>>q->Wlog;
dst->qth_max=q->qth_max>>q->Wlog;
dst->DP=q->DP;
dst->backlog=q->backlog;
if (q->qave) {
if (table->eqp && table->grio) {
q->qidlestart=table->tab[table->def]->qidlestart;
q->qave=table->tab[table->def]->qave;
}
if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
long idle;
psched_time_t now;
PSCHED_GET_TIME(now);
idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max, 0);
qave = q->qave >> q->Stab[(idle>>q->Scell_log)&0xFF];
dst->qave = qave >> q->Wlog;
 
} else {
dst->qave = q->qave >> q->Wlog;
}
} else {
dst->qave = 0;
}
 
dst->Wlog = q->Wlog;
dst->Plog = q->Plog;
dst->Scell_log = q->Scell_log;
dst->other = q->other;
dst->forced = q->forced;
dst->early = q->early;
dst->pdrop = q->pdrop;
dst->prio = q->prio;
dst->packets=q->packetsin;
dst->bytesin=q->bytesin;
}
 
RTA_PUT(skb, TCA_GRED_PARMS, sizeof(struct tc_gred_qopt)*MAX_DPs, opt);
rta->rta_len = skb->tail - b;
 
kfree(opt);
return skb->len;
 
rtattr_failure:
if (opt)
kfree(opt);
DPRINTK("gred_dump: FAILURE!!!!\n");
 
/* also free the opt struct here */
skb_trim(skb, b - skb->data);
return -1;
}
 
static void gred_destroy(struct Qdisc *sch)
{
struct gred_sched *table = (struct gred_sched *)sch->data;
int i;
 
for (i = 0;i < table->DPs; i++) {
if (table->tab[i])
kfree(table->tab[i]);
}
MOD_DEC_USE_COUNT;
}
 
struct Qdisc_ops gred_qdisc_ops =
{
NULL,
NULL,
"gred",
sizeof(struct gred_sched),
gred_enqueue,
gred_dequeue,
gred_requeue,
gred_drop,
gred_init,
gred_reset,
gred_destroy,
gred_change, /* change */
gred_dump,
};
 
 
#ifdef MODULE
int init_module(void)
{
return register_qdisc(&gred_qdisc_ops);
}
 
void cleanup_module(void)
{
unregister_qdisc(&gred_qdisc_ops);
}
#endif
MODULE_LICENSE("GPL");
/sch_dsmark.c
0,0 → 1,486
/* net/sched/sch_dsmark.c - Differentiated Services field marker */
 
/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
 
 
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h> /* for pkt_sched */
#include <linux/rtnetlink.h>
#include <net/pkt_sched.h>
#include <net/dsfield.h>
#include <asm/byteorder.h>
 
 
#if 1 /* control */
#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define DPRINTK(format,args...)
#endif
 
#if 0 /* data */
#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define D2PRINTK(format,args...)
#endif
 
 
#define PRIV(sch) ((struct dsmark_qdisc_data *) (sch)->data)
 
 
/*
* classid class marking
* ------- ----- -------
* n/a 0 n/a
* x:0 1 use entry [0]
* ... ... ...
* x:y y>0 y+1 use entry [y]
* ... ... ...
* x:indices-1 indices use entry [indices-1]
* ... ... ...
* x:y y+1 use entry [y & (indices-1)]
* ... ... ...
* 0xffff 0x10000 use entry [indices-1]
*/
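
/*
 * Editor's illustrative sketch (not part of the original source): the
 * mapping above in compact form, assuming `indices` is a power of two as
 * dsmark_init() enforces. Internal class handle 0 means "no class";
 * handle y+1 owns mask/value slot y; an arbitrary tc_index wraps into
 * the table with `& (indices - 1)`. Names are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

/* table slot owned by a non-zero internal class handle (x:y -> y+1) */
static unsigned int slot_of_class(unsigned long cl)
{
	return (unsigned int)(cl - 1);
}

/* table slot used to re-mark a packet carrying this tc_index */
static unsigned int slot_of_tc_index(uint16_t tc_index, uint16_t indices)
{
	return tc_index & (indices - 1);
}

int main(void)
{
	uint16_t indices = 64;

	printf("class x:0  -> handle 1  -> slot %u\n", slot_of_class(1));
	printf("class x:63 -> handle 64 -> slot %u\n", slot_of_class(64));
	printf("tc_index 0x1234 -> slot %u\n",
	       slot_of_tc_index(0x1234, indices));  /* 0x34 & 63 = 52 */
	printf("tc_index 0xffff -> slot %u\n",
	       slot_of_tc_index(0xffff, indices));  /* 63 = indices - 1 */
	return 0;
}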
 
 
#define NO_DEFAULT_INDEX (1 << 16)
 
struct dsmark_qdisc_data {
struct Qdisc *q;
struct tcf_proto *filter_list;
__u8 *mask; /* "owns" the array */
__u8 *value;
__u16 indices;
__u32 default_index; /* index range is 0...0xffff */
int set_tc_index;
};
 
 
/* ------------------------- Class/flow operations ------------------------- */
 
 
static int dsmark_graft(struct Qdisc *sch,unsigned long arg,
struct Qdisc *new,struct Qdisc **old)
{
struct dsmark_qdisc_data *p = PRIV(sch);
 
DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",sch,p,new,
old);
if (!new)
new = &noop_qdisc;
sch_tree_lock(sch);
*old = xchg(&p->q,new);
if (*old)
qdisc_reset(*old);
sch->q.qlen = 0;
sch_tree_unlock(sch); /* @@@ move up ? */
return 0;
}
 
 
static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)
{
struct dsmark_qdisc_data *p = PRIV(sch);
 
return p->q;
}
 
 
static unsigned long dsmark_get(struct Qdisc *sch,u32 classid)
{
struct dsmark_qdisc_data *p __attribute__((unused)) = PRIV(sch);
 
DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid);
return TC_H_MIN(classid)+1;
}
 
 
static unsigned long dsmark_bind_filter(struct Qdisc *sch,
unsigned long parent, u32 classid)
{
return dsmark_get(sch,classid);
}
 
 
static void dsmark_put(struct Qdisc *sch, unsigned long cl)
{
}
 
 
static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
struct rtattr **tca, unsigned long *arg)
{
struct dsmark_qdisc_data *p = PRIV(sch);
struct rtattr *opt = tca[TCA_OPTIONS-1];
struct rtattr *tb[TCA_DSMARK_MAX];
 
DPRINTK("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x),"
"arg 0x%lx\n",sch,p,classid,parent,*arg);
if (*arg > p->indices)
return -ENOENT;
if (!opt || rtattr_parse(tb, TCA_DSMARK_MAX, RTA_DATA(opt),
RTA_PAYLOAD(opt)))
return -EINVAL;
if (tb[TCA_DSMARK_MASK-1]) {
if (!RTA_PAYLOAD(tb[TCA_DSMARK_MASK-1]))
return -EINVAL;
p->mask[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_MASK-1]);
}
if (tb[TCA_DSMARK_VALUE-1]) {
if (!RTA_PAYLOAD(tb[TCA_DSMARK_VALUE-1]))
return -EINVAL;
p->value[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_VALUE-1]);
}
return 0;
}
 
 
static int dsmark_delete(struct Qdisc *sch,unsigned long arg)
{
struct dsmark_qdisc_data *p = PRIV(sch);
 
if (!arg || arg > p->indices)
return -EINVAL;
p->mask[arg-1] = 0xff;
p->value[arg-1] = 0;
return 0;
}
 
 
static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker)
{
struct dsmark_qdisc_data *p = PRIV(sch);
int i;
 
DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker);
if (walker->stop)
return;
for (i = 0; i < p->indices; i++) {
if (p->mask[i] == 0xff && !p->value[i])
continue;
if (walker->count >= walker->skip) {
if (walker->fn(sch, i+1, walker) < 0) {
walker->stop = 1;
break;
}
}
walker->count++;
}
}
 
 
static struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,unsigned long cl)
{
struct dsmark_qdisc_data *p = PRIV(sch);
 
return &p->filter_list;
}
 
 
/* --------------------------- Qdisc operations ---------------------------- */
 
 
static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = PRIV(sch);
struct tcf_result res;
int result;
int ret = NET_XMIT_POLICED;
 
D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p);
if (p->set_tc_index) {
switch (skb->protocol) {
case __constant_htons(ETH_P_IP):
skb->tc_index = ipv4_get_dsfield(skb->nh.iph);
break;
case __constant_htons(ETH_P_IPV6):
skb->tc_index = ipv6_get_dsfield(skb->nh.ipv6h);
break;
default:
skb->tc_index = 0;
break;
};
}
result = TC_POLICE_OK; /* be nice to gcc */
if (TC_H_MAJ(skb->priority) == sch->handle) {
skb->tc_index = TC_H_MIN(skb->priority);
} else {
result = tc_classify(skb,p->filter_list,&res);
D2PRINTK("result %d class 0x%04x\n",result,res.classid);
switch (result) {
#ifdef CONFIG_NET_CLS_POLICE
case TC_POLICE_SHOT:
kfree_skb(skb);
break;
#if 0
case TC_POLICE_RECLASSIFY:
/* FIXME: what to do here ??? */
#endif
#endif
case TC_POLICE_OK:
skb->tc_index = TC_H_MIN(res.classid);
break;
case TC_POLICE_UNSPEC:
/* fall through */
default:
if (p->default_index != NO_DEFAULT_INDEX)
skb->tc_index = p->default_index;
break;
};
}
if (
#ifdef CONFIG_NET_CLS_POLICE
result == TC_POLICE_SHOT ||
#endif
 
((ret = p->q->enqueue(skb,p->q)) != 0)) {
sch->stats.drops++;
return ret;
}
sch->stats.bytes += skb->len;
sch->stats.packets++;
sch->q.qlen++;
return ret;
}
 
 
static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = PRIV(sch);
struct sk_buff *skb;
int index;
 
D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n",sch,p);
skb = p->q->ops->dequeue(p->q);
if (!skb)
return NULL;
sch->q.qlen--;
index = skb->tc_index & (p->indices-1);
D2PRINTK("index %d->%d\n",skb->tc_index,index);
switch (skb->protocol) {
case __constant_htons(ETH_P_IP):
ipv4_change_dsfield(skb->nh.iph,
p->mask[index],p->value[index]);
break;
case __constant_htons(ETH_P_IPV6):
ipv6_change_dsfield(skb->nh.ipv6h,
p->mask[index],p->value[index]);
break;
default:
/*
* Only complain if a change was actually attempted.
* This way, we can send non-IP traffic through dsmark
* and don't need yet another qdisc as a bypass.
*/
if (p->mask[index] != 0xff || p->value[index])
printk(KERN_WARNING "dsmark_dequeue: "
"unsupported protocol %d\n",
htons(skb->protocol));
break;
};
return skb;
}
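
/*
 * Editor's illustrative sketch (not part of the original source): the
 * re-marking performed above by ipv4_change_dsfield()/ipv6_change_dsfield()
 * amounts to ds = (ds & mask) | value on the DS byte (the real helpers
 * also fix up the IPv4 header checksum). Illustrative values below: keep
 * the two ECN bits (mask 0x03) and set the DSCP to EF (value 0xb8).
 */
#include <stdint.h>
#include <stdio.h>

static uint8_t remark_ds(uint8_t ds, uint8_t mask, uint8_t value)
{
	return (uint8_t)((ds & mask) | value);
}

int main(void)
{
	uint8_t ds = 0x28 | 0x01;             /* DSCP AF11, ECT(1) set */
	uint8_t out = remark_ds(ds, 0x03, 0xb8);

	printf("in 0x%02x -> out 0x%02x\n", ds, out);  /* ECN kept, DSCP = EF */
	return 0;
}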
 
 
static int dsmark_requeue(struct sk_buff *skb,struct Qdisc *sch)
{
int ret;
struct dsmark_qdisc_data *p = PRIV(sch);
 
D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p);
if ((ret = p->q->ops->requeue(skb, p->q)) == 0) {
sch->q.qlen++;
return 0;
}
sch->stats.drops++;
return ret;
}
 
 
static unsigned int dsmark_drop(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = PRIV(sch);
unsigned int len;
DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p);
if (!p->q->ops->drop)
return 0;
if (!(len = p->q->ops->drop(p->q)))
return 0;
sch->q.qlen--;
return len;
}
 
 
int dsmark_init(struct Qdisc *sch,struct rtattr *opt)
{
struct dsmark_qdisc_data *p = PRIV(sch);
struct rtattr *tb[TCA_DSMARK_MAX];
__u16 tmp;
 
DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt);
if (rtattr_parse(tb,TCA_DSMARK_MAX,RTA_DATA(opt),RTA_PAYLOAD(opt)) < 0 ||
!tb[TCA_DSMARK_INDICES-1] ||
RTA_PAYLOAD(tb[TCA_DSMARK_INDICES-1]) < sizeof(__u16))
return -EINVAL;
memset(p,0,sizeof(*p));
p->filter_list = NULL;
p->indices = *(__u16 *) RTA_DATA(tb[TCA_DSMARK_INDICES-1]);
if (!p->indices)
return -EINVAL;
for (tmp = p->indices; tmp != 1; tmp >>= 1) {
if (tmp & 1)
return -EINVAL;
}
p->default_index = NO_DEFAULT_INDEX;
if (tb[TCA_DSMARK_DEFAULT_INDEX-1]) {
if (RTA_PAYLOAD(tb[TCA_DSMARK_DEFAULT_INDEX-1]) < sizeof(__u16))
return -EINVAL;
p->default_index =
*(__u16 *) RTA_DATA(tb[TCA_DSMARK_DEFAULT_INDEX-1]);
}
p->set_tc_index = !!tb[TCA_DSMARK_SET_TC_INDEX-1];
p->mask = kmalloc(p->indices*2,GFP_KERNEL);
if (!p->mask)
return -ENOMEM;
p->value = p->mask+p->indices;
memset(p->mask,0xff,p->indices);
memset(p->value,0,p->indices);
if (!(p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
p->q = &noop_qdisc;
DPRINTK("dsmark_init: qdisc %p\n",&p->q);
MOD_INC_USE_COUNT;
return 0;
}
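
/*
 * Editor's illustrative sketch (not part of the original source): the
 * loop over `tmp` in dsmark_init() above accepts `indices` only if it is
 * a power of two, so that `tc_index & (indices - 1)` is a cheap modulo.
 * An equivalent check (illustrative, not what this file uses) is the
 * usual bit trick below.
 */
#include <stdio.h>

static int is_power_of_two(unsigned int n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
	unsigned int tests[] = { 1, 2, 64, 63, 0, 0x100 };

	for (unsigned int i = 0; i < sizeof(tests) / sizeof(tests[0]); i++)
		printf("%u -> %s\n", tests[i],
		       is_power_of_two(tests[i]) ? "ok" : "rejected");
	return 0;
}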
 
 
static void dsmark_reset(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = PRIV(sch);
 
DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p);
qdisc_reset(p->q);
sch->q.qlen = 0;
}
 
 
static void dsmark_destroy(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = PRIV(sch);
struct tcf_proto *tp;
 
DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n",sch,p);
while (p->filter_list) {
tp = p->filter_list;
p->filter_list = tp->next;
tcf_destroy(tp);
}
qdisc_destroy(p->q);
p->q = &noop_qdisc;
kfree(p->mask);
MOD_DEC_USE_COUNT;
}
 
 
static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct dsmark_qdisc_data *p = PRIV(sch);
unsigned char *b = skb->tail;
struct rtattr *rta;
 
DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n",sch,p,cl);
if (!cl || cl > p->indices)
return -EINVAL;
tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle),cl-1);
rta = (struct rtattr *) b;
RTA_PUT(skb,TCA_OPTIONS,0,NULL);
RTA_PUT(skb,TCA_DSMARK_MASK,1,&p->mask[cl-1]);
RTA_PUT(skb,TCA_DSMARK_VALUE,1,&p->value[cl-1]);
rta->rta_len = skb->tail-b;
return skb->len;
 
rtattr_failure:
skb_trim(skb,b-skb->data);
return -1;
}
 
static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct dsmark_qdisc_data *p = PRIV(sch);
unsigned char *b = skb->tail;
struct rtattr *rta;
 
rta = (struct rtattr *) b;
RTA_PUT(skb,TCA_OPTIONS,0,NULL);
RTA_PUT(skb,TCA_DSMARK_INDICES,sizeof(__u16),&p->indices);
if (p->default_index != NO_DEFAULT_INDEX) {
__u16 tmp = p->default_index;
 
RTA_PUT(skb,TCA_DSMARK_DEFAULT_INDEX, sizeof(__u16), &tmp);
}
if (p->set_tc_index)
RTA_PUT(skb, TCA_DSMARK_SET_TC_INDEX, 0, NULL);
rta->rta_len = skb->tail-b;
return skb->len;
 
rtattr_failure:
skb_trim(skb,b-skb->data);
return -1;
}
 
static struct Qdisc_class_ops dsmark_class_ops =
{
dsmark_graft, /* graft */
dsmark_leaf, /* leaf */
dsmark_get, /* get */
dsmark_put, /* put */
dsmark_change, /* change */
dsmark_delete, /* delete */
dsmark_walk, /* walk */
 
dsmark_find_tcf, /* tcf_chain */
dsmark_bind_filter, /* bind_tcf */
dsmark_put, /* unbind_tcf */
 
dsmark_dump_class, /* dump */
};
 
struct Qdisc_ops dsmark_qdisc_ops =
{
NULL, /* next */
&dsmark_class_ops, /* cl_ops */
"dsmark",
sizeof(struct dsmark_qdisc_data),
 
dsmark_enqueue, /* enqueue */
dsmark_dequeue, /* dequeue */
dsmark_requeue, /* requeue */
dsmark_drop, /* drop */
 
dsmark_init, /* init */
dsmark_reset, /* reset */
dsmark_destroy, /* destroy */
NULL, /* change */
 
dsmark_dump /* dump */
};
 
#ifdef MODULE
int init_module(void)
{
return register_qdisc(&dsmark_qdisc_ops);
}
 
 
void cleanup_module(void)
{
unregister_qdisc(&dsmark_qdisc_ops);
}
#endif
MODULE_LICENSE("GPL");
/sch_hfsc.c
0,0 → 1,1841
/*
* Copyright (c) 2003 Patrick McHardy, <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* 2003-10-17 - Ported from altq
*/
/*
* Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved.
*
* Permission to use, copy, modify, and distribute this software and
* its documentation is hereby granted (including for commercial or
* for-profit use), provided that both the copyright notice and this
* permission notice appear in all copies of the software, derivative
* works, or modified versions, and any portions thereof.
*
* THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF
* WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS
* SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*
* Carnegie Mellon encourages (but does not require) users of this
* software to return any improvements or extensions that they make,
* and to grant Carnegie Mellon the rights to redistribute these
* changes without encumbrance.
*/
/*
* H-FSC is described in Proceedings of SIGCOMM'97,
* "A Hierarchical Fair Service Curve Algorithm for Link-Sharing,
* Real-Time and Priority Service"
* by Ion Stoica, Hui Zhang, and T. S. Eugene Ng.
*
* Oleg Cherevko <olwi@aq.ml.com.ua> added the upperlimit for link-sharing.
* when a class has an upperlimit, the fit-time is computed from the
* upperlimit service curve. the link-sharing scheduler does not schedule
* a class whose fit-time exceeds the current time.
*/
 
#include <linux/kernel.h>
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/compiler.h>
#include <linux/spinlock.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/pkt_sched.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <asm/system.h>
#include <asm/div64.h>
 
#define HFSC_DEBUG 1
 
/*
* kernel internal service curve representation:
* coordinates are given by 64 bit unsigned integers.
* x-axis: unit is clock count.
* y-axis: unit is byte.
*
* The service curve parameters are converted to the internal
* representation. The slope values are scaled to avoid overflow.
* The inverse slope values as well as the y-projection of the 1st
* segment are kept in order to avoid 64-bit divide operations
* that are expensive on 32-bit architectures.
*/
 
struct internal_sc
{
u64 sm1; /* scaled slope of the 1st segment */
u64 ism1; /* scaled inverse-slope of the 1st segment */
u64 dx; /* the x-projection of the 1st segment */
u64 dy; /* the y-projection of the 1st segment */
u64 sm2; /* scaled slope of the 2nd segment */
u64 ism2; /* scaled inverse-slope of the 2nd segment */
};
 
/* runtime service curve */
struct runtime_sc
{
u64 x; /* current starting position on x-axis */
u64 y; /* current starting position on y-axis */
u64 sm1; /* scaled slope of the 1st segment */
u64 ism1; /* scaled inverse-slope of the 1st segment */
u64 dx; /* the x-projection of the 1st segment */
u64 dy; /* the y-projection of the 1st segment */
u64 sm2; /* scaled slope of the 2nd segment */
u64 ism2; /* scaled inverse-slope of the 2nd segment */
};
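
/*
 * Editor's illustrative sketch (not part of the original source): the
 * external service curve passed from userspace (struct tc_service_curve)
 * is a two-segment piecewise-linear function: slope m1 for the first d
 * microseconds, slope m2 afterwards. The internal forms above store the
 * same two segments with pre-scaled slopes and inverse slopes. A minimal
 * evaluation, assuming m1/m2 are in bytes per second (illustrative units):
 */
#include <stdint.h>
#include <stdio.h>

/* bytes of service guaranteed by curve (m1, d, m2) after t microseconds */
static uint64_t sc_bytes(uint64_t m1, uint64_t d, uint64_t m2, uint64_t t_us)
{
	if (t_us <= d)
		return m1 * t_us / 1000000;
	return m1 * d / 1000000 + m2 * (t_us - d) / 1000000;
}

int main(void)
{
	/* 1 Mbyte/s burst for 10 ms, then 125 Kbyte/s steady state */
	uint64_t m1 = 1000000, d = 10000, m2 = 125000;

	printf("after  5 ms: %llu bytes\n",
	       (unsigned long long)sc_bytes(m1, d, m2, 5000));   /*  5000 */
	printf("after 10 ms: %llu bytes\n",
	       (unsigned long long)sc_bytes(m1, d, m2, 10000));  /* 10000 */
	printf("after 20 ms: %llu bytes\n",
	       (unsigned long long)sc_bytes(m1, d, m2, 20000));  /* 11250 */
	return 0;
}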
 
enum hfsc_class_flags
{
HFSC_RSC = 0x1,
HFSC_FSC = 0x2,
HFSC_USC = 0x4
};
 
struct hfsc_class
{
u32 classid; /* class id */
unsigned int refcnt; /* usage count */
 
struct tc_stats stats; /* generic statistics */
unsigned int level; /* class level in hierarchy */
struct tcf_proto *filter_list; /* filter list */
unsigned int filter_cnt; /* filter count */
 
struct hfsc_sched *sched; /* scheduler data */
struct hfsc_class *cl_parent; /* parent class */
struct list_head siblings; /* sibling classes */
struct list_head children; /* child classes */
struct Qdisc *qdisc; /* leaf qdisc */
 
struct list_head actlist; /* active children list */
struct list_head alist; /* active children list member */
struct list_head ellist; /* eligible list member */
struct list_head hlist; /* hash list member */
struct list_head dlist; /* drop list member */
 
u64 cl_total; /* total work in bytes */
u64 cl_cumul; /* cumulative work in bytes done by
real-time criteria */
 
u64 cl_d; /* deadline*/
u64 cl_e; /* eligible time */
u64 cl_vt; /* virtual time */
u64 cl_f; /* time when this class will fit for
link-sharing, max(myf, cfmin) */
u64 cl_myf; /* my fit-time (calculated from this
class's own upperlimit curve) */
u64 cl_myfadj; /* my fit-time adjustment (to cancel
history dependence) */
u64 cl_cfmin; /* earliest children's fit-time (used
with cl_myf to obtain cl_f) */
u64 cl_cvtmin; /* minimal virtual time among the
children fit for link-sharing
(monotonic within a period) */
u64 cl_vtadj; /* intra-period cumulative vt
adjustment */
u64 cl_vtoff; /* inter-period cumulative vt offset */
u64 cl_cvtmax; /* max child's vt in the last period */
 
struct internal_sc cl_rsc; /* internal real-time service curve */
struct internal_sc cl_fsc; /* internal fair service curve */
struct internal_sc cl_usc; /* internal upperlimit service curve */
struct runtime_sc cl_deadline; /* deadline curve */
struct runtime_sc cl_eligible; /* eligible curve */
struct runtime_sc cl_virtual; /* virtual curve */
struct runtime_sc cl_ulimit; /* upperlimit curve */
 
unsigned long cl_flags; /* which curves are valid */
unsigned long cl_vtperiod; /* vt period sequence number */
unsigned long cl_parentperiod;/* parent's vt period sequence number*/
unsigned long cl_nactive; /* number of active children */
};
 
#define HFSC_HSIZE 16
 
struct hfsc_sched
{
u16 defcls; /* default class id */
struct hfsc_class root; /* root class */
struct list_head clhash[HFSC_HSIZE]; /* class hash */
struct list_head eligible; /* eligible list */
struct list_head droplist; /* active leaf class list (for
dropping) */
struct sk_buff_head requeue; /* requeued packet */
struct timer_list wd_timer; /* watchdog timer */
};
 
/*
* macros
*/
#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY
#include <linux/time.h>
#undef PSCHED_GET_TIME
#define PSCHED_GET_TIME(stamp) \
do { \
struct timeval tv; \
do_gettimeofday(&tv); \
(stamp) = 1000000ULL * tv.tv_sec + tv.tv_usec; \
} while (0)
#endif
 
#if HFSC_DEBUG
#define ASSERT(cond) \
do { \
if (unlikely(!(cond))) \
printk("assertion %s failed at %s:%i (%s)\n", \
#cond, __FILE__, __LINE__, __FUNCTION__); \
} while (0)
#else
#define ASSERT(cond)
#endif /* HFSC_DEBUG */
 
#define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */
 
 
/*
* The eligible list holds backlogged classes sorted by their eligible times.
* There is one eligible list per hfsc instance.
*/
 
static void
ellist_insert(struct hfsc_class *cl)
{
struct list_head *head = &cl->sched->eligible;
struct hfsc_class *p;
 
/* check the last entry first */
if (list_empty(head) ||
((p = list_entry(head->prev, struct hfsc_class, ellist)) &&
p->cl_e <= cl->cl_e)) {
list_add_tail(&cl->ellist, head);
return;
}
 
list_for_each_entry(p, head, ellist) {
if (cl->cl_e < p->cl_e) {
/* insert cl before p */
list_add_tail(&cl->ellist, &p->ellist);
return;
}
}
ASSERT(0); /* should not reach here */
}
 
static inline void
ellist_remove(struct hfsc_class *cl)
{
list_del(&cl->ellist);
}
 
static void
ellist_update(struct hfsc_class *cl)
{
struct list_head *head = &cl->sched->eligible;
struct hfsc_class *p, *last;
 
/*
* the eligible time of a class increases monotonically.
* if the next entry has a larger eligible time, nothing to do.
*/
if (cl->ellist.next == head ||
((p = list_entry(cl->ellist.next, struct hfsc_class, ellist)) &&
cl->cl_e <= p->cl_e))
return;
 
/* check the last entry */
last = list_entry(head->prev, struct hfsc_class, ellist);
if (last->cl_e <= cl->cl_e) {
list_move_tail(&cl->ellist, head);
return;
}
 
/*
* the new position must be between the next entry
* and the last entry
*/
list_for_each_entry_continue(p, head, ellist) {
if (cl->cl_e < p->cl_e) {
list_move_tail(&cl->ellist, &p->ellist);
return;
}
}
ASSERT(0); /* should not reach here */
}
 
/* find the class with the minimum deadline among the eligible classes */
static inline struct hfsc_class *
ellist_get_mindl(struct list_head *head, u64 cur_time)
{
struct hfsc_class *p, *cl = NULL;
 
list_for_each_entry(p, head, ellist) {
if (p->cl_e > cur_time)
break;
if (cl == NULL || p->cl_d < cl->cl_d)
cl = p;
}
return cl;
}
 
/* find the class with minimum eligible time among the eligible classes */
static inline struct hfsc_class *
ellist_get_minel(struct list_head *head)
{
if (list_empty(head))
return NULL;
return list_entry(head->next, struct hfsc_class, ellist);
}
 
/*
* The active children list holds backlogged child classes sorted
* by their virtual time. Each intermediate class has one active
* children list.
*/
static void
actlist_insert(struct hfsc_class *cl)
{
struct list_head *head = &cl->cl_parent->actlist;
struct hfsc_class *p;
 
/* check the last entry first */
if (list_empty(head) ||
((p = list_entry(head->prev, struct hfsc_class, alist)) &&
p->cl_vt <= cl->cl_vt)) {
list_add_tail(&cl->alist, head);
return;
}
 
list_for_each_entry(p, head, alist) {
if (cl->cl_vt < p->cl_vt) {
/* insert cl before p */
list_add_tail(&cl->alist, &p->alist);
return;
}
}
ASSERT(0); /* should not reach here */
}
 
static inline void
actlist_remove(struct hfsc_class *cl)
{
list_del(&cl->alist);
}
 
static void
actlist_update(struct hfsc_class *cl)
{
struct list_head *head = &cl->cl_parent->actlist;
struct hfsc_class *p, *last;
 
/*
* the virtual time of a class increases monotonically.
* if the next entry has a larger virtual time, nothing to do.
*/
if (cl->alist.next == head ||
((p = list_entry(cl->alist.next, struct hfsc_class, alist)) &&
cl->cl_vt <= p->cl_vt))
return;
 
/* check the last entry */
last = list_entry(head->prev, struct hfsc_class, alist);
if (last->cl_vt <= cl->cl_vt) {
list_move_tail(&cl->alist, head);
return;
}
 
/*
* the new position must be between the next entry
* and the last entry
*/
list_for_each_entry_continue(p, head, alist) {
if (cl->cl_vt < p->cl_vt) {
list_move_tail(&cl->alist, &p->alist);
return;
}
}
ASSERT(0); /* should not reach here */
}
 
static inline struct hfsc_class *
actlist_firstfit(struct hfsc_class *cl, u64 cur_time)
{
struct hfsc_class *p;
 
list_for_each_entry(p, &cl->actlist, alist) {
if (p->cl_f <= cur_time) {
return p;
}
}
return NULL;
}
 
/*
* get the leaf class with the minimum vt in the hierarchy
*/
static struct hfsc_class *
actlist_get_minvt(struct hfsc_class *cl, u64 cur_time)
{
/* if root-class's cfmin is bigger than cur_time nothing to do */
if (cl->cl_cfmin > cur_time)
return NULL;
 
while (cl->level > 0) {
cl = actlist_firstfit(cl, cur_time);
if (cl == NULL)
return NULL;
/*
* update parent's cl_cvtmin.
*/
if (cl->cl_parent->cl_cvtmin < cl->cl_vt)
cl->cl_parent->cl_cvtmin = cl->cl_vt;
}
return cl;
}
 
/*
* service curve support functions
*
* external service curve parameters
* m: bps
* d: us
* internal service curve parameters
* sm: (bytes/psched_us) << SM_SHIFT
* ism: (psched_us/byte) << ISM_SHIFT
* dx: psched_us
*
* Time source resolution
* PSCHED_JIFFIES: for 48<=HZ<=1534 resolution is between 0.63us and 1.27us.
* PSCHED_CPU: resolution is between 0.5us and 1us.
* PSCHED_GETTIMEOFDAY: resolution is exactly 1us.
*
* sm and ism are scaled in order to keep effective digits.
* SM_SHIFT and ISM_SHIFT are selected to keep at least 4 effective
* digits in decimal using the following table.
*
* Note: We can afford the additional accuracy (altq hfsc keeps at most
* 3 effective digits) thanks to the fact that linux clock is bounded
* much more tightly.
*
* bits/sec      100Kbps    1Mbps      10Mbps     100Mbps    1Gbps
* ------------+-------------------------------------------------------
* bytes/0.5us   6.25e-3    62.5e-3    625e-3     6250e-3    62500e-3
* bytes/us      12.5e-3    125e-3     1250e-3    12500e-3   125000e-3
* bytes/1.27us  15.875e-3  158.75e-3  1587.5e-3  15875e-3   158750e-3
*
* 0.5us/byte    160        16         1.6        0.16       0.016
* us/byte       80         8          0.8        0.08       0.008
* 1.27us/byte   63         6.3        0.63       0.063      0.0063
*/
#define SM_SHIFT 20
#define ISM_SHIFT 18
 
#define SM_MASK ((1ULL << SM_SHIFT) - 1)
#define ISM_MASK ((1ULL << ISM_SHIFT) - 1)
 
static inline u64
seg_x2y(u64 x, u64 sm)
{
u64 y;
 
/*
* compute
* y = x * sm >> SM_SHIFT
* but divide it for the upper and lower bits to avoid overflow
*/
y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT);
return y;
}
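
/*
 * Editor's illustrative sketch (not part of the original source): seg_x2y()
 * computes y = (x * sm) >> SM_SHIFT without a 128-bit intermediate by
 * writing x = (xh << SM_SHIFT) + xl, so that
 *   (x * sm) >> SM_SHIFT = xh * sm + ((xl * sm) >> SM_SHIFT)
 * and only the small product (xl < 2^SM_SHIFT) is shifted. A standalone
 * check against a 128-bit reference (gcc/clang __int128, assumed available
 * for the test only):
 */
#include <stdint.h>
#include <stdio.h>

#define SM_SHIFT 20
#define SM_MASK  ((1ULL << SM_SHIFT) - 1)

static uint64_t seg_x2y_split(uint64_t x, uint64_t sm)
{
	return (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT);
}

int main(void)
{
	uint64_t x = 1000000000000000ULL;  /* a plain 64-bit x * sm would overflow */
	uint64_t sm = 131072;              /* 0.125 bytes/tick << SM_SHIFT */

	uint64_t split = seg_x2y_split(x, sm);
	uint64_t ref = (uint64_t)(((unsigned __int128)x * sm) >> SM_SHIFT);

	printf("split %llu, reference %llu, %s\n",
	       (unsigned long long)split, (unsigned long long)ref,
	       split == ref ? "match" : "MISMATCH");
	return 0;
}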
 
static inline u64
seg_y2x(u64 y, u64 ism)
{
u64 x;
 
if (y == 0)
x = 0;
else if (ism == HT_INFINITY)
x = HT_INFINITY;
else {
x = (y >> ISM_SHIFT) * ism
+ (((y & ISM_MASK) * ism) >> ISM_SHIFT);
}
return x;
}
 
/* Convert m (bps) into sm (bytes/psched us) */
static u64
m2sm(u32 m)
{
u64 sm;
 
sm = ((u64)m << SM_SHIFT);
sm += PSCHED_JIFFIE2US(HZ) - 1;
do_div(sm, PSCHED_JIFFIE2US(HZ));
return sm;
}
 
/* convert m (bps) into ism (psched us/byte) */
static u64
m2ism(u32 m)
{
u64 ism;
 
if (m == 0)
ism = HT_INFINITY;
else {
ism = ((u64)PSCHED_JIFFIE2US(HZ) << ISM_SHIFT);
ism += m - 1;
do_div(ism, m);
}
return ism;
}
 
/* convert d (us) into dx (psched us) */
static u64
d2dx(u32 d)
{
u64 dx;
 
dx = ((u64)d * PSCHED_JIFFIE2US(HZ));
dx += 1000000 - 1;
do_div(dx, 1000000);
return dx;
}
 
/* convert sm (bytes/psched us) into m (bps) */
static u32
sm2m(u64 sm)
{
u64 m;
 
m = (sm * PSCHED_JIFFIE2US(HZ)) >> SM_SHIFT;
return (u32)m;
}
 
/* convert dx (psched us) into d (us) */
static u32
dx2d(u64 dx)
{
u64 d;
 
d = dx * 1000000;
do_div(d, PSCHED_JIFFIE2US(HZ));
return (u32)d;
}
 
static void
sc2isc(struct tc_service_curve *sc, struct internal_sc *isc)
{
isc->sm1 = m2sm(sc->m1);
isc->ism1 = m2ism(sc->m1);
isc->dx = d2dx(sc->d);
isc->dy = seg_x2y(isc->dx, isc->sm1);
isc->sm2 = m2sm(sc->m2);
isc->ism2 = m2ism(sc->m2);
}
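
/*
 * Editor's illustrative sketch (not part of the original source): a worked
 * example of the scaling in m2sm()/m2ism()/d2dx(), assuming a 1 us psched
 * tick (i.e. PSCHED_JIFFIE2US(HZ) == 1000000) and m given in bytes per
 * second; both are illustrative assumptions, the real tick depends on
 * PSCHED_CLOCK_SOURCE. For m = 125000 bytes/s (about 1 Mbit/s):
 *   sm  = 131072   (0.125 bytes/tick << SM_SHIFT)
 *   ism = 2097152  (8 ticks/byte     << ISM_SHIFT)
 *   dx  = 10000    for d = 10000 us
 * (m == 0 maps to HT_INFINITY in the code above; omitted here.)
 */
#include <stdint.h>
#include <stdio.h>

#define TICKS_PER_SEC 1000000ULL   /* assumed: 1 psched tick per microsecond */
#define SM_SHIFT  20
#define ISM_SHIFT 18

static uint64_t m2sm_sketch(uint32_t m)    /* bytes/s -> scaled bytes/tick */
{
	return (((uint64_t)m << SM_SHIFT) + TICKS_PER_SEC - 1) / TICKS_PER_SEC;
}

static uint64_t m2ism_sketch(uint32_t m)   /* bytes/s -> scaled ticks/byte */
{
	return ((TICKS_PER_SEC << ISM_SHIFT) + m - 1) / m;
}

static uint64_t d2dx_sketch(uint32_t d_us) /* microseconds -> ticks */
{
	return (((uint64_t)d_us * TICKS_PER_SEC) + 1000000 - 1) / 1000000;
}

int main(void)
{
	uint32_t m = 125000, d = 10000;

	printf("sm  = %llu\n", (unsigned long long)m2sm_sketch(m));
	printf("ism = %llu\n", (unsigned long long)m2ism_sketch(m));
	printf("dx  = %llu\n", (unsigned long long)d2dx_sketch(d));
	return 0;
}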
 
/*
* initialize the runtime service curve with the given internal
* service curve starting at (x, y).
*/
static void
rtsc_init(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
{
rtsc->x = x;
rtsc->y = y;
rtsc->sm1 = isc->sm1;
rtsc->ism1 = isc->ism1;
rtsc->dx = isc->dx;
rtsc->dy = isc->dy;
rtsc->sm2 = isc->sm2;
rtsc->ism2 = isc->ism2;
}
 
/*
* calculate the x-projection of the runtime service curve for the
* given y-projection value
*/
static u64
rtsc_y2x(struct runtime_sc *rtsc, u64 y)
{
u64 x;
 
if (y < rtsc->y)
x = rtsc->x;
else if (y <= rtsc->y + rtsc->dy) {
/* x belongs to the 1st segment */
if (rtsc->dy == 0)
x = rtsc->x + rtsc->dx;
else
x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1);
} else {
/* x belongs to the 2nd segment */
x = rtsc->x + rtsc->dx
+ seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2);
}
return x;
}
 
static u64
rtsc_x2y(struct runtime_sc *rtsc, u64 x)
{
u64 y;
 
if (x <= rtsc->x)
y = rtsc->y;
else if (x <= rtsc->x + rtsc->dx)
/* y belongs to the 1st segment */
y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1);
else
/* y belongs to the 2nd segment */
y = rtsc->y + rtsc->dy
+ seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2);
return y;
}
 
/*
* update the runtime service curve by taking the minimum of the current
* runtime service curve and the service curve starting at (x, y).
*/
static void
rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
{
u64 y1, y2, dx, dy;
u32 dsm;
 
if (isc->sm1 <= isc->sm2) {
/* service curve is convex */
y1 = rtsc_x2y(rtsc, x);
if (y1 < y)
/* the current rtsc is smaller */
return;
rtsc->x = x;
rtsc->y = y;
return;
}
 
/*
* service curve is concave
* compute the two y values of the current rtsc
* y1: at x
* y2: at (x + dx)
*/
y1 = rtsc_x2y(rtsc, x);
if (y1 <= y) {
/* rtsc is below isc, no change to rtsc */
return;
}
 
y2 = rtsc_x2y(rtsc, x + isc->dx);
if (y2 >= y + isc->dy) {
/* rtsc is above isc, replace rtsc by isc */
rtsc->x = x;
rtsc->y = y;
rtsc->dx = isc->dx;
rtsc->dy = isc->dy;
return;
}
 
/*
* the two curves intersect
* compute the offsets (dx, dy) using the reverse
* function of seg_x2y()
* seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y)
*/
dx = (y1 - y) << SM_SHIFT;
dsm = isc->sm1 - isc->sm2;
do_div(dx, dsm);
/*
* check if (x, y1) belongs to the 1st segment of rtsc.
* if so, add the offset.
*/
if (rtsc->x + rtsc->dx > x)
dx += rtsc->x + rtsc->dx - x;
dy = seg_x2y(dx, isc->sm1);
 
rtsc->x = x;
rtsc->y = y;
rtsc->dx = dx;
rtsc->dy = dy;
return;
}
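
/*
 * Editor's illustrative sketch (not part of the original source): the
 * intersection offset computed above follows from requiring the first
 * segment of the new curve (slope sm1 from (x, y)) to catch up with the
 * current curve (slope sm2 beyond (x, y1), with y1 > y):
 *   (dx * sm1) >> SM_SHIFT = (y1 - y) + ((dx * sm2) >> SM_SHIFT)
 *   =>  dx = ((y1 - y) << SM_SHIFT) / (sm1 - sm2)
 * A quick numeric check with illustrative values, ignoring rounding and
 * the extra first-segment adjustment made above when (x, y1) still lies
 * on rtsc's first segment:
 */
#include <stdint.h>
#include <stdio.h>

#define SM_SHIFT 20

int main(void)
{
	uint64_t sm1 = 131072, sm2 = 65536;  /* 0.125 and 0.0625 bytes/tick */
	uint64_t gap = 1000;                 /* y1 - y: rtsc is 1000 bytes ahead */
	uint64_t dx = (gap << SM_SHIFT) / (sm1 - sm2);

	/* after dx ticks both curves have delivered the same amount */
	uint64_t lhs = (dx * sm1) >> SM_SHIFT;
	uint64_t rhs = gap + ((dx * sm2) >> SM_SHIFT);

	printf("dx = %llu ticks, lhs %llu, rhs %llu\n",
	       (unsigned long long)dx, (unsigned long long)lhs,
	       (unsigned long long)rhs);
	return 0;
}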
 
static void
init_ed(struct hfsc_class *cl, unsigned int next_len)
{
u64 cur_time;
 
PSCHED_GET_TIME(cur_time);
 
/* update the deadline curve */
rtsc_min(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul);
 
/*
* update the eligible curve.
* for concave, it is equal to the deadline curve.
* for convex, it is a linear curve with slope m2.
*/
cl->cl_eligible = cl->cl_deadline;
if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) {
cl->cl_eligible.dx = 0;
cl->cl_eligible.dy = 0;
}
 
/* compute e and d */
cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
 
ellist_insert(cl);
}
 
static void
update_ed(struct hfsc_class *cl, unsigned int next_len)
{
cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
 
ellist_update(cl);
}
 
static inline void
update_d(struct hfsc_class *cl, unsigned int next_len)
{
cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
}
 
static void
update_cfmin(struct hfsc_class *cl)
{
struct hfsc_class *p;
u64 cfmin;
 
if (list_empty(&cl->actlist)) {
cl->cl_cfmin = 0;
return;
}
cfmin = HT_INFINITY;
list_for_each_entry(p, &cl->actlist, alist) {
if (p->cl_f == 0) {
cl->cl_cfmin = 0;
return;
}
if (p->cl_f < cfmin)
cfmin = p->cl_f;
}
cl->cl_cfmin = cfmin;
}
 
static void
init_vf(struct hfsc_class *cl, unsigned int len)
{
struct hfsc_class *max_cl, *p;
u64 vt, f, cur_time;
int go_active;
 
cur_time = 0;
go_active = 1;
for (; cl->cl_parent != NULL; cl = cl->cl_parent) {
if (go_active && cl->cl_nactive++ == 0)
go_active = 1;
else
go_active = 0;
 
if (go_active) {
if (!list_empty(&cl->cl_parent->actlist)) {
max_cl = list_entry(cl->cl_parent->actlist.prev,
struct hfsc_class, alist);
/*
* set vt to the average of the min and max
* classes. if the parent's period didn't
* change, don't decrease vt of the class.
*/
vt = max_cl->cl_vt;
if (cl->cl_parent->cl_cvtmin != 0)
vt = (cl->cl_parent->cl_cvtmin + vt)/2;
 
if (cl->cl_parent->cl_vtperiod !=
cl->cl_parentperiod || vt > cl->cl_vt)
cl->cl_vt = vt;
} else {
/*
* first child for a new parent backlog period.
* add parent's cvtmax to vtoff of children
* to make a new vt (vtoff + vt) larger than
* the vt in the last period for all children.
*/
vt = cl->cl_parent->cl_cvtmax;
list_for_each_entry(p, &cl->cl_parent->children,
siblings)
p->cl_vtoff += vt;
cl->cl_vt = 0;
cl->cl_parent->cl_cvtmax = 0;
cl->cl_parent->cl_cvtmin = 0;
}
 
/* update the virtual curve */
vt = cl->cl_vt + cl->cl_vtoff;
rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt,
cl->cl_total);
if (cl->cl_virtual.x == vt) {
cl->cl_virtual.x -= cl->cl_vtoff;
cl->cl_vtoff = 0;
}
cl->cl_vtadj = 0;
 
cl->cl_vtperiod++; /* increment vt period */
cl->cl_parentperiod = cl->cl_parent->cl_vtperiod;
if (cl->cl_parent->cl_nactive == 0)
cl->cl_parentperiod++;
cl->cl_f = 0;
 
actlist_insert(cl);
 
if (cl->cl_flags & HFSC_USC) {
/* class has upper limit curve */
if (cur_time == 0)
PSCHED_GET_TIME(cur_time);
 
/* update the ulimit curve */
rtsc_min(&cl->cl_ulimit, &cl->cl_usc, cur_time,
cl->cl_total);
/* compute myf */
cl->cl_myf = rtsc_y2x(&cl->cl_ulimit,
cl->cl_total);
cl->cl_myfadj = 0;
}
}
 
f = max(cl->cl_myf, cl->cl_cfmin);
if (f != cl->cl_f) {
cl->cl_f = f;
update_cfmin(cl->cl_parent);
}
}
}
 
static void
update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
{
u64 f; /* , myf_bound, delta; */
int go_passive = 0;
 
if (cl->qdisc->q.qlen == 0 && cl->cl_flags & HFSC_FSC)
go_passive = 1;
 
for (; cl->cl_parent != NULL; cl = cl->cl_parent) {
cl->cl_total += len;
 
if (!(cl->cl_flags & HFSC_FSC) || cl->cl_nactive == 0)
continue;
 
if (go_passive && --cl->cl_nactive == 0)
go_passive = 1;
else
go_passive = 0;
 
if (go_passive) {
/* no more active child, going passive */
 
/* update cvtmax of the parent class */
if (cl->cl_vt > cl->cl_parent->cl_cvtmax)
cl->cl_parent->cl_cvtmax = cl->cl_vt;
 
/* remove this class from the vt list */
actlist_remove(cl);
 
update_cfmin(cl->cl_parent);
 
continue;
}
 
/*
* update vt and f
*/
cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total)
- cl->cl_vtoff + cl->cl_vtadj;
 
/*
* if vt of the class is smaller than cvtmin,
* the class was skipped in the past due to non-fit.
* if so, we need to adjust vtadj.
*/
if (cl->cl_vt < cl->cl_parent->cl_cvtmin) {
cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt;
cl->cl_vt = cl->cl_parent->cl_cvtmin;
}
 
/* update the vt list */
actlist_update(cl);
 
if (cl->cl_flags & HFSC_USC) {
cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit,
cl->cl_total);
#if 0
/*
* This code causes classes to stay way under their
* limit when multiple classes are used at gigabit
* speed. needs investigation. -kaber
*/
/*
* if myf lags behind by more than one clock tick
* from the current time, adjust myfadj to prevent
* a rate-limited class from going greedy.
* in a steady state under rate-limiting, myf
* fluctuates within one clock tick.
*/
myf_bound = cur_time - PSCHED_JIFFIE2US(1);
if (cl->cl_myf < myf_bound) {
delta = cur_time - cl->cl_myf;
cl->cl_myfadj += delta;
cl->cl_myf += delta;
}
#endif
}
 
f = max(cl->cl_myf, cl->cl_cfmin);
if (f != cl->cl_f) {
cl->cl_f = f;
update_cfmin(cl->cl_parent);
}
}
}
 
static void
set_active(struct hfsc_class *cl, unsigned int len)
{
if (cl->cl_flags & HFSC_RSC)
init_ed(cl, len);
if (cl->cl_flags & HFSC_FSC)
init_vf(cl, len);
 
list_add_tail(&cl->dlist, &cl->sched->droplist);
}
 
static void
set_passive(struct hfsc_class *cl)
{
if (cl->cl_flags & HFSC_RSC)
ellist_remove(cl);
 
list_del(&cl->dlist);
 
/*
* actlist is now handled in update_vf() so that update_vf(cl, 0, 0)
* needs to be called explicitly to remove a class from actlist
*/
}
 
/*
* hack to get length of first packet in queue.
*/
static unsigned int
qdisc_peek_len(struct Qdisc *sch)
{
struct sk_buff *skb;
unsigned int len;
 
skb = sch->dequeue(sch);
if (skb == NULL) {
if (net_ratelimit())
printk("qdisc_peek_len: non work-conserving qdisc ?\n");
return 0;
}
len = skb->len;
if (unlikely(sch->ops->requeue(skb, sch) != NET_XMIT_SUCCESS)) {
if (net_ratelimit())
printk("qdisc_peek_len: failed to requeue\n");
return 0;
}
return len;
}
 
static void
hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl)
{
unsigned int len = cl->qdisc->q.qlen;
 
qdisc_reset(cl->qdisc);
if (len > 0) {
update_vf(cl, 0, 0);
set_passive(cl);
sch->q.qlen -= len;
}
}
 
static void
hfsc_adjust_levels(struct hfsc_class *cl)
{
struct hfsc_class *p;
unsigned int level;
 
do {
level = 0;
list_for_each_entry(p, &cl->children, siblings) {
if (p->level > level)
level = p->level;
}
cl->level = level + 1;
} while ((cl = cl->cl_parent) != NULL);
}
 
static inline unsigned int
hfsc_hash(u32 h)
{
h ^= h >> 8;
h ^= h >> 4;
 
return h & (HFSC_HSIZE - 1);
}
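
/*
 * Editor's illustrative sketch (not part of the original source): a
 * standalone copy of the hash above. It xors bits 4..15 of the class id
 * down into the low nibble before masking with HFSC_HSIZE - 1, so class
 * minors that differ only above the low four bits still spread across
 * the 16 buckets.
 */
#include <stdint.h>
#include <stdio.h>

#define HFSC_HSIZE 16

static unsigned int hfsc_hash_sketch(uint32_t h)
{
	h ^= h >> 8;
	h ^= h >> 4;
	return h & (HFSC_HSIZE - 1);
}

int main(void)
{
	/* e.g. classes 1:10, 1:20, 1:100 under qdisc major 0x0001 */
	uint32_t ids[] = { 0x00010010, 0x00010020, 0x00010100 };

	for (unsigned int i = 0; i < sizeof(ids) / sizeof(ids[0]); i++)
		printf("classid 0x%08x -> bucket %u\n",
		       ids[i], hfsc_hash_sketch(ids[i]));
	return 0;
}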
 
static inline struct hfsc_class *
hfsc_find_class(u32 classid, struct Qdisc *sch)
{
struct hfsc_sched *q = (struct hfsc_sched *)sch->data;
struct hfsc_class *cl;
 
list_for_each_entry(cl, &q->clhash[hfsc_hash(classid)], hlist) {
if (cl->classid == classid)
return cl;
}
return NULL;
}
 
static void
hfsc_change_rsc(struct hfsc_class *cl, struct tc_service_curve *rsc,
u64 cur_time)
{
sc2isc(rsc, &cl->cl_rsc);
rtsc_init(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul);
cl->cl_eligible = cl->cl_deadline;
if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) {
cl->cl_eligible.dx = 0;
cl->cl_eligible.dy = 0;
}
cl->cl_flags |= HFSC_RSC;
}
 
static void
hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc)
{
sc2isc(fsc, &cl->cl_fsc);
rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total);
cl->cl_flags |= HFSC_FSC;
}
 
static void
hfsc_change_usc(struct hfsc_class *cl, struct tc_service_curve *usc,
u64 cur_time)
{
sc2isc(usc, &cl->cl_usc);
rtsc_init(&cl->cl_ulimit, &cl->cl_usc, cur_time, cl->cl_total);
cl->cl_flags |= HFSC_USC;
}
 
static int
hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
struct rtattr **tca, unsigned long *arg)
{
struct hfsc_sched *q = (struct hfsc_sched *)sch->data;
struct hfsc_class *cl = (struct hfsc_class *)*arg;
struct hfsc_class *parent = NULL;
struct rtattr *opt = tca[TCA_OPTIONS-1];
struct rtattr *tb[TCA_HFSC_MAX];
struct tc_service_curve *rsc = NULL, *fsc = NULL, *usc = NULL;
u64 cur_time;
 
if (opt == NULL ||
rtattr_parse(tb, TCA_HFSC_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)))
return -EINVAL;
 
if (tb[TCA_HFSC_RSC-1]) {
if (RTA_PAYLOAD(tb[TCA_HFSC_RSC-1]) < sizeof(*rsc))
return -EINVAL;
rsc = RTA_DATA(tb[TCA_HFSC_RSC-1]);
if (rsc->m1 == 0 && rsc->m2 == 0)
rsc = NULL;
}
 
if (tb[TCA_HFSC_FSC-1]) {
if (RTA_PAYLOAD(tb[TCA_HFSC_FSC-1]) < sizeof(*fsc))
return -EINVAL;
fsc = RTA_DATA(tb[TCA_HFSC_FSC-1]);
if (fsc->m1 == 0 && fsc->m2 == 0)
fsc = NULL;
}
 
if (tb[TCA_HFSC_USC-1]) {
if (RTA_PAYLOAD(tb[TCA_HFSC_USC-1]) < sizeof(*usc))
return -EINVAL;
usc = RTA_DATA(tb[TCA_HFSC_USC-1]);
if (usc->m1 == 0 && usc->m2 == 0)
usc = NULL;
}
 
if (cl != NULL) {
if (parentid) {
if (cl->cl_parent && cl->cl_parent->classid != parentid)
return -EINVAL;
if (cl->cl_parent == NULL && parentid != TC_H_ROOT)
return -EINVAL;
}
PSCHED_GET_TIME(cur_time);
 
sch_tree_lock(sch);
if (rsc != NULL)
hfsc_change_rsc(cl, rsc, cur_time);
if (fsc != NULL)
hfsc_change_fsc(cl, fsc);
if (usc != NULL)
hfsc_change_usc(cl, usc, cur_time);
 
if (cl->qdisc->q.qlen != 0) {
if (cl->cl_flags & HFSC_RSC)
update_ed(cl, qdisc_peek_len(cl->qdisc));
if (cl->cl_flags & HFSC_FSC)
update_vf(cl, 0, cur_time);
}
sch_tree_unlock(sch);
 
#ifdef CONFIG_NET_ESTIMATOR
if (tca[TCA_RATE-1]) {
qdisc_kill_estimator(&cl->stats);
qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]);
}
#endif
return 0;
}
 
if (parentid == TC_H_ROOT)
return -EEXIST;
 
parent = &q->root;
if (parentid) {
parent = hfsc_find_class(parentid, sch);
if (parent == NULL)
return -ENOENT;
}
 
if (classid == 0 || TC_H_MAJ(classid ^ sch->handle) != 0)
return -EINVAL;
if (hfsc_find_class(classid, sch))
return -EEXIST;
 
if (rsc == NULL && fsc == NULL)
return -EINVAL;
 
cl = kmalloc(sizeof(struct hfsc_class), GFP_KERNEL);
if (cl == NULL)
return -ENOBUFS;
memset(cl, 0, sizeof(struct hfsc_class));
 
if (rsc != NULL)
hfsc_change_rsc(cl, rsc, 0);
if (fsc != NULL)
hfsc_change_fsc(cl, fsc);
if (usc != NULL)
hfsc_change_usc(cl, usc, 0);
 
cl->refcnt = 1;
cl->classid = classid;
cl->sched = q;
cl->cl_parent = parent;
cl->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
if (cl->qdisc == NULL)
cl->qdisc = &noop_qdisc;
cl->stats.lock = &sch->dev->queue_lock;
INIT_LIST_HEAD(&cl->children);
INIT_LIST_HEAD(&cl->actlist);
 
sch_tree_lock(sch);
list_add_tail(&cl->hlist, &q->clhash[hfsc_hash(classid)]);
list_add_tail(&cl->siblings, &parent->children);
if (parent->level == 0)
hfsc_purge_queue(sch, parent);
hfsc_adjust_levels(parent);
sch_tree_unlock(sch);
 
#ifdef CONFIG_NET_ESTIMATOR
if (tca[TCA_RATE-1])
qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]);
#endif
*arg = (unsigned long)cl;
return 0;
}
 
static void
hfsc_destroy_filters(struct tcf_proto **fl)
{
struct tcf_proto *tp;
 
while ((tp = *fl) != NULL) {
*fl = tp->next;
tcf_destroy(tp);
}
}
 
static void
hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
{
struct hfsc_sched *q = (struct hfsc_sched *)sch->data;
 
hfsc_destroy_filters(&cl->filter_list);
qdisc_destroy(cl->qdisc);
#ifdef CONFIG_NET_ESTIMATOR
qdisc_kill_estimator(&cl->stats);
#endif
if (cl != &q->root)
kfree(cl);
}
 
static int
hfsc_delete_class(struct Qdisc *sch, unsigned long arg)
{
struct hfsc_sched *q = (struct hfsc_sched *)sch->data;
struct hfsc_class *cl = (struct hfsc_class *)arg;
 
if (cl->level > 0 || cl->filter_cnt > 0 || cl == &q->root)
return -EBUSY;
 
sch_tree_lock(sch);
 
list_del(&cl->hlist);
list_del(&cl->siblings);
hfsc_adjust_levels(cl->cl_parent);
hfsc_purge_queue(sch, cl);
if (--cl->refcnt == 0)
hfsc_destroy_class(sch, cl);
 
sch_tree_unlock(sch);
return 0;
}
 
static struct hfsc_class *
hfsc_classify(struct sk_buff *skb, struct Qdisc *sch)
{
struct hfsc_sched *q = (struct hfsc_sched *)sch->data;
struct hfsc_class *cl;
struct tcf_result res;
struct tcf_proto *tcf;
int result;
 
if (TC_H_MAJ(skb->priority ^ sch->handle) == 0 &&
(cl = hfsc_find_class(skb->priority, sch)) != NULL)
if (cl->level == 0)
return cl;
 
tcf = q->root.filter_list;
while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
#ifdef CONFIG_NET_CLS_POLICE
if (result == TC_POLICE_SHOT)
return NULL;
#endif
if ((cl = (struct hfsc_class *)res.class) == NULL) {
if ((cl = hfsc_find_class(res.classid, sch)) == NULL)
break; /* filter selected invalid classid */
}
 
if (cl->level == 0)
return cl; /* hit leaf class */
 
/* apply inner filter chain */
tcf = cl->filter_list;
}
 
/* classification failed, try default class */
cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
if (cl == NULL || cl->level > 0)
return NULL;
 
return cl;
}
 
static int
hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct Qdisc **old)
{
struct hfsc_class *cl = (struct hfsc_class *)arg;
 
if (cl == NULL)
return -ENOENT;
if (cl->level > 0)
return -EINVAL;
if (new == NULL) {
new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
if (new == NULL)
new = &noop_qdisc;
}
 
sch_tree_lock(sch);
hfsc_purge_queue(sch, cl);
*old = xchg(&cl->qdisc, new);
sch_tree_unlock(sch);
return 0;
}
 
static struct Qdisc *
hfsc_class_leaf(struct Qdisc *sch, unsigned long arg)
{
struct hfsc_class *cl = (struct hfsc_class *)arg;
 
if (cl != NULL && cl->level == 0)
return cl->qdisc;
 
return NULL;
}
 
static unsigned long
hfsc_get_class(struct Qdisc *sch, u32 classid)
{
struct hfsc_class *cl = hfsc_find_class(classid, sch);
 
if (cl != NULL)
cl->refcnt++;
 
return (unsigned long)cl;
}
 
static void
hfsc_put_class(struct Qdisc *sch, unsigned long arg)
{
struct hfsc_class *cl = (struct hfsc_class *)arg;
 
if (--cl->refcnt == 0)
hfsc_destroy_class(sch, cl);
}
 
static unsigned long
hfsc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid)
{
struct hfsc_class *p = (struct hfsc_class *)parent;
struct hfsc_class *cl = hfsc_find_class(classid, sch);
 
if (cl != NULL) {
if (p != NULL && p->level <= cl->level)
return 0;
cl->filter_cnt++;
}
 
return (unsigned long)cl;
}
 
static void
hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg)
{
struct hfsc_class *cl = (struct hfsc_class *)arg;
 
cl->filter_cnt--;
}
 
static struct tcf_proto **
hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg)
{
struct hfsc_sched *q = (struct hfsc_sched *)sch->data;
struct hfsc_class *cl = (struct hfsc_class *)arg;
 
if (cl == NULL)
cl = &q->root;
 
return &cl->filter_list;
}
 
static int
hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc)
{
struct tc_service_curve tsc;
 
tsc.m1 = sm2m(sc->sm1);
tsc.d = dx2d(sc->dx);
tsc.m2 = sm2m(sc->sm2);
RTA_PUT(skb, attr, sizeof(tsc), &tsc);
 
return skb->len;
 
rtattr_failure:
return -1;
}
 
static inline int
hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl)
{
if ((cl->cl_flags & HFSC_RSC) &&
(hfsc_dump_sc(skb, TCA_HFSC_RSC, &cl->cl_rsc) < 0))
goto rtattr_failure;
 
if ((cl->cl_flags & HFSC_FSC) &&
(hfsc_dump_sc(skb, TCA_HFSC_FSC, &cl->cl_fsc) < 0))
goto rtattr_failure;
 
if ((cl->cl_flags & HFSC_USC) &&
(hfsc_dump_sc(skb, TCA_HFSC_USC, &cl->cl_usc) < 0))
goto rtattr_failure;
 
return skb->len;
 
rtattr_failure:
return -1;
}
 
static inline int
hfsc_dump_stats(struct sk_buff *skb, struct hfsc_class *cl)
{
cl->stats.qlen = cl->qdisc->q.qlen;
if (qdisc_copy_stats(skb, &cl->stats) < 0)
goto rtattr_failure;
 
return skb->len;
 
rtattr_failure:
return -1;
}
 
static inline int
hfsc_dump_xstats(struct sk_buff *skb, struct hfsc_class *cl)
{
struct tc_hfsc_stats xstats;
 
xstats.level = cl->level;
xstats.period = cl->cl_vtperiod;
xstats.work = cl->cl_total;
xstats.rtwork = cl->cl_cumul;
RTA_PUT(skb, TCA_XSTATS, sizeof(xstats), &xstats);
 
return skb->len;
 
rtattr_failure:
return -1;
}
 
static int
hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb,
struct tcmsg *tcm)
{
struct hfsc_class *cl = (struct hfsc_class *)arg;
unsigned char *b = skb->tail;
struct rtattr *rta = (struct rtattr *)b;
 
tcm->tcm_parent = cl->cl_parent ? cl->cl_parent->classid : TC_H_ROOT;
tcm->tcm_handle = cl->classid;
if (cl->level == 0)
tcm->tcm_info = cl->qdisc->handle;
 
RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
if (hfsc_dump_curves(skb, cl) < 0)
goto rtattr_failure;
rta->rta_len = skb->tail - b;
 
if ((hfsc_dump_stats(skb, cl) < 0) ||
(hfsc_dump_xstats(skb, cl) < 0))
goto rtattr_failure;
 
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
static void
hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct hfsc_sched *q = (struct hfsc_sched *)sch->data;
struct hfsc_class *cl;
unsigned int i;
 
if (arg->stop)
return;
 
for (i = 0; i < HFSC_HSIZE; i++) {
list_for_each_entry(cl, &q->clhash[i], hlist) {
if (arg->count < arg->skip) {
arg->count++;
continue;
}
if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
arg->stop = 1;
return;
}
arg->count++;
}
}
}
 
static void
hfsc_watchdog(unsigned long arg)
{
struct Qdisc *sch = (struct Qdisc *)arg;
 
sch->flags &= ~TCQ_F_THROTTLED;
netif_schedule(sch->dev);
}
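 
/*
 * Arm the watchdog for the next time a class may become eligible:
 * either the earliest eligible time in q->eligible or the root's
 * cl_cfmin, whichever comes first.  The qdisc stays throttled until
 * hfsc_watchdog() above fires and reschedules the device.
 */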
 
static void
hfsc_schedule_watchdog(struct Qdisc *sch, u64 cur_time)
{
struct hfsc_sched *q = (struct hfsc_sched *)sch->data;
struct hfsc_class *cl;
u64 next_time = 0;
long delay;
 
if ((cl = ellist_get_minel(&q->eligible)) != NULL)
next_time = cl->cl_e;
if (q->root.cl_cfmin != 0) {
if (next_time == 0 || next_time > q->root.cl_cfmin)
next_time = q->root.cl_cfmin;
}
ASSERT(next_time != 0);
delay = next_time - cur_time;
delay = PSCHED_US2JIFFIE(delay);
 
sch->flags |= TCQ_F_THROTTLED;
mod_timer(&q->wd_timer, jiffies + delay);
}
 
static int
hfsc_init_qdisc(struct Qdisc *sch, struct rtattr *opt)
{
struct hfsc_sched *q = (struct hfsc_sched *)sch->data;
struct tc_hfsc_qopt *qopt;
unsigned int i;
 
if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
return -EINVAL;
qopt = RTA_DATA(opt);
 
memset(q, 0, sizeof(struct hfsc_sched));
sch->stats.lock = &sch->dev->queue_lock;
 
q->defcls = qopt->defcls;
for (i = 0; i < HFSC_HSIZE; i++)
INIT_LIST_HEAD(&q->clhash[i]);
INIT_LIST_HEAD(&q->eligible);
INIT_LIST_HEAD(&q->droplist);
skb_queue_head_init(&q->requeue);
 
q->root.refcnt = 1;
q->root.classid = sch->handle;
q->root.sched = q;
q->root.qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
if (q->root.qdisc == NULL)
q->root.qdisc = &noop_qdisc;
q->root.stats.lock = &sch->dev->queue_lock;
INIT_LIST_HEAD(&q->root.children);
INIT_LIST_HEAD(&q->root.actlist);
 
list_add(&q->root.hlist, &q->clhash[hfsc_hash(q->root.classid)]);
 
init_timer(&q->wd_timer);
q->wd_timer.function = hfsc_watchdog;
q->wd_timer.data = (unsigned long)sch;
 
MOD_INC_USE_COUNT;
return 0;
}
 
static int
hfsc_change_qdisc(struct Qdisc *sch, struct rtattr *opt)
{
struct hfsc_sched *q = (struct hfsc_sched *)sch->data;
struct tc_hfsc_qopt *qopt;
 
if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
return -EINVAL;
qopt = RTA_DATA(opt);
 
sch_tree_lock(sch);
q->defcls = qopt->defcls;
sch_tree_unlock(sch);
 
return 0;
}
 
static void
hfsc_reset_class(struct hfsc_class *cl)
{
cl->cl_total = 0;
cl->cl_cumul = 0;
cl->cl_d = 0;
cl->cl_e = 0;
cl->cl_vt = 0;
cl->cl_vtadj = 0;
cl->cl_vtoff = 0;
cl->cl_cvtmin = 0;
cl->cl_cvtmax = 0;
cl->cl_vtperiod = 0;
cl->cl_parentperiod = 0;
cl->cl_f = 0;
cl->cl_myf = 0;
cl->cl_myfadj = 0;
cl->cl_cfmin = 0;
cl->cl_nactive = 0;
INIT_LIST_HEAD(&cl->actlist);
qdisc_reset(cl->qdisc);
 
if (cl->cl_flags & HFSC_RSC)
rtsc_init(&cl->cl_deadline, &cl->cl_rsc, 0, 0);
if (cl->cl_flags & HFSC_FSC)
rtsc_init(&cl->cl_virtual, &cl->cl_fsc, 0, 0);
if (cl->cl_flags & HFSC_USC)
rtsc_init(&cl->cl_ulimit, &cl->cl_usc, 0, 0);
}
 
static void
hfsc_reset_qdisc(struct Qdisc *sch)
{
struct hfsc_sched *q = (struct hfsc_sched *)sch->data;
struct hfsc_class *cl;
unsigned int i;
 
for (i = 0; i < HFSC_HSIZE; i++) {
list_for_each_entry(cl, &q->clhash[i], hlist)
hfsc_reset_class(cl);
}
__skb_queue_purge(&q->requeue);
INIT_LIST_HEAD(&q->eligible);
INIT_LIST_HEAD(&q->droplist);
del_timer(&q->wd_timer);
sch->flags &= ~TCQ_F_THROTTLED;
sch->q.qlen = 0;
}
 
static void
hfsc_destroy_qdisc(struct Qdisc *sch)
{
struct hfsc_sched *q = (struct hfsc_sched *)sch->data;
struct hfsc_class *cl, *next;
unsigned int i;
 
for (i = 0; i < HFSC_HSIZE; i++) {
list_for_each_entry_safe(cl, next, &q->clhash[i], hlist)
hfsc_destroy_class(sch, cl);
}
__skb_queue_purge(&q->requeue);
del_timer(&q->wd_timer);
MOD_DEC_USE_COUNT;
}
 
static int
hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
{
struct hfsc_sched *q = (struct hfsc_sched *)sch->data;
unsigned char *b = skb->tail;
struct tc_hfsc_qopt qopt;
 
qopt.defcls = q->defcls;
RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
 
sch->stats.qlen = sch->q.qlen;
if (qdisc_copy_stats(skb, &sch->stats) < 0)
goto rtattr_failure;
 
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
static int
hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct hfsc_class *cl = hfsc_classify(skb, sch);
unsigned int len = skb->len;
int err;
 
if (cl == NULL) {
kfree_skb(skb);
sch->stats.drops++;
return NET_XMIT_DROP;
}
 
err = cl->qdisc->enqueue(skb, cl->qdisc);
if (unlikely(err != NET_XMIT_SUCCESS)) {
cl->stats.drops++;
sch->stats.drops++;
return err;
}
 
if (cl->qdisc->q.qlen == 1)
set_active(cl, len);
 
cl->stats.packets++;
cl->stats.bytes += len;
sch->stats.packets++;
sch->stats.bytes += len;
sch->q.qlen++;
 
return NET_XMIT_SUCCESS;
}
 
static struct sk_buff *
hfsc_dequeue(struct Qdisc *sch)
{
struct hfsc_sched *q = (struct hfsc_sched *)sch->data;
struct hfsc_class *cl;
struct sk_buff *skb;
u64 cur_time;
unsigned int next_len;
int realtime = 0;
 
if (sch->q.qlen == 0)
return NULL;
if ((skb = __skb_dequeue(&q->requeue)))
goto out;
 
PSCHED_GET_TIME(cur_time);
 
/*
* if there are eligible classes, use real-time criteria.
* find the class with the minimum deadline among
* the eligible classes.
*/
if ((cl = ellist_get_mindl(&q->eligible, cur_time)) != NULL) {
realtime = 1;
} else {
/*
* use link-sharing criteria
* get the class with the minimum vt in the hierarchy
*/
cl = actlist_get_minvt(&q->root, cur_time);
if (cl == NULL) {
sch->stats.overlimits++;
if (!netif_queue_stopped(sch->dev))
hfsc_schedule_watchdog(sch, cur_time);
return NULL;
}
}
 
skb = cl->qdisc->dequeue(cl->qdisc);
if (skb == NULL) {
if (net_ratelimit())
printk("HFSC: Non-work-conserving qdisc ?\n");
return NULL;
}
 
update_vf(cl, skb->len, cur_time);
if (realtime)
cl->cl_cumul += skb->len;
 
if (cl->qdisc->q.qlen != 0) {
if (cl->cl_flags & HFSC_RSC) {
/* update ed */
next_len = qdisc_peek_len(cl->qdisc);
if (realtime)
update_ed(cl, next_len);
else
update_d(cl, next_len);
}
} else {
/* the class becomes passive */
set_passive(cl);
}
 
out:
sch->flags &= ~TCQ_F_THROTTLED;
sch->q.qlen--;
 
return skb;
}
 
static int
hfsc_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
struct hfsc_sched *q = (struct hfsc_sched *)sch->data;
 
__skb_queue_head(&q->requeue, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
 
static unsigned int
hfsc_drop(struct Qdisc *sch)
{
struct hfsc_sched *q = (struct hfsc_sched *)sch->data;
struct hfsc_class *cl;
unsigned int len;
 
list_for_each_entry(cl, &q->droplist, dlist) {
if (cl->qdisc->ops->drop != NULL &&
(len = cl->qdisc->ops->drop(cl->qdisc)) > 0) {
if (cl->qdisc->q.qlen == 0) {
update_vf(cl, 0, 0);
set_passive(cl);
} else {
list_move_tail(&cl->dlist, &q->droplist);
}
cl->stats.drops++;
sch->stats.drops++;
sch->q.qlen--;
return len;
}
}
return 0;
}
 
static struct Qdisc_class_ops hfsc_class_ops = {
.change = hfsc_change_class,
.delete = hfsc_delete_class,
.graft = hfsc_graft_class,
.leaf = hfsc_class_leaf,
.get = hfsc_get_class,
.put = hfsc_put_class,
.bind_tcf = hfsc_bind_tcf,
.unbind_tcf = hfsc_unbind_tcf,
.tcf_chain = hfsc_tcf_chain,
.dump = hfsc_dump_class,
.walk = hfsc_walk
};
 
struct Qdisc_ops hfsc_qdisc_ops = {
.id = "hfsc",
.init = hfsc_init_qdisc,
.change = hfsc_change_qdisc,
.reset = hfsc_reset_qdisc,
.destroy = hfsc_destroy_qdisc,
.dump = hfsc_dump_qdisc,
.enqueue = hfsc_enqueue,
.dequeue = hfsc_dequeue,
.requeue = hfsc_requeue,
.drop = hfsc_drop,
.cl_ops = &hfsc_class_ops,
.priv_size = sizeof(struct hfsc_sched)
};
 
static int __init
hfsc_init(void)
{
return register_qdisc(&hfsc_qdisc_ops);
}
 
static void __exit
hfsc_cleanup(void)
{
unregister_qdisc(&hfsc_qdisc_ops);
}
 
MODULE_LICENSE("GPL");
module_init(hfsc_init);
module_exit(hfsc_cleanup);
/sch_fifo.c
0,0 → 1,211
/*
* net/sched/sch_fifo.c The simplest FIFO queue.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
 
#include <linux/config.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
 
/* 1 band FIFO pseudo-"scheduler" */
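/*
 * Two variants are built from this file: bfifo limits the backlog in
 * bytes (default limit: tx_queue_len * mtu), while pfifo limits it in
 * packets (default limit: tx_queue_len); see fifo_init() below.
 */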
 
struct fifo_sched_data
{
unsigned limit;
};
 
static int
bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data;
 
if (sch->stats.backlog + skb->len <= q->limit) {
__skb_queue_tail(&sch->q, skb);
sch->stats.backlog += skb->len;
sch->stats.bytes += skb->len;
sch->stats.packets++;
return 0;
}
sch->stats.drops++;
#ifdef CONFIG_NET_CLS_POLICE
if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
#endif
kfree_skb(skb);
return NET_XMIT_DROP;
}
 
static int
bfifo_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
__skb_queue_head(&sch->q, skb);
sch->stats.backlog += skb->len;
return 0;
}
 
static struct sk_buff *
bfifo_dequeue(struct Qdisc* sch)
{
struct sk_buff *skb;
 
skb = __skb_dequeue(&sch->q);
if (skb)
sch->stats.backlog -= skb->len;
return skb;
}
 
static unsigned int
fifo_drop(struct Qdisc* sch)
{
struct sk_buff *skb;
 
skb = __skb_dequeue_tail(&sch->q);
if (skb) {
unsigned int len = skb->len;
sch->stats.backlog -= len;
kfree_skb(skb);
return len;
}
return 0;
}
 
static void
fifo_reset(struct Qdisc* sch)
{
skb_queue_purge(&sch->q);
sch->stats.backlog = 0;
}
 
static int
pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data;
 
if (sch->q.qlen < q->limit) {
__skb_queue_tail(&sch->q, skb);
sch->stats.bytes += skb->len;
sch->stats.packets++;
return 0;
}
sch->stats.drops++;
#ifdef CONFIG_NET_CLS_POLICE
if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
#endif
kfree_skb(skb);
return NET_XMIT_DROP;
}
 
static int
pfifo_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
__skb_queue_head(&sch->q, skb);
return 0;
}
 
 
static struct sk_buff *
pfifo_dequeue(struct Qdisc* sch)
{
return __skb_dequeue(&sch->q);
}
 
static int fifo_init(struct Qdisc *sch, struct rtattr *opt)
{
struct fifo_sched_data *q = (void*)sch->data;
 
if (opt == NULL) {
unsigned int limit = sch->dev->tx_queue_len ? : 1;
 
if (sch->ops == &bfifo_qdisc_ops)
q->limit = limit*sch->dev->mtu;
else
q->limit = limit;
} else {
struct tc_fifo_qopt *ctl = RTA_DATA(opt);
if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
return -EINVAL;
q->limit = ctl->limit;
}
return 0;
}
 
static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct fifo_sched_data *q = (void*)sch->data;
unsigned char *b = skb->tail;
struct tc_fifo_qopt opt;
 
opt.limit = q->limit;
RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
 
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
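 
/*
 * The two Qdisc_ops below use positional initialization; the field
 * order is next, cl_ops, id, priv_size, then enqueue, dequeue,
 * requeue, drop, then init, reset, destroy, change, and finally dump
 * (compare the designated initializers of hfsc_qdisc_ops and
 * dly_qdisc_ops).  Note that fifo_init doubles as the change operation.
 */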
 
struct Qdisc_ops pfifo_qdisc_ops =
{
NULL,
NULL,
"pfifo",
sizeof(struct fifo_sched_data),
 
pfifo_enqueue,
pfifo_dequeue,
pfifo_requeue,
fifo_drop,
 
fifo_init,
fifo_reset,
NULL,
fifo_init,
 
fifo_dump,
};
 
struct Qdisc_ops bfifo_qdisc_ops =
{
NULL,
NULL,
"bfifo",
sizeof(struct fifo_sched_data),
 
bfifo_enqueue,
bfifo_dequeue,
bfifo_requeue,
fifo_drop,
 
fifo_init,
fifo_reset,
NULL,
fifo_init,
fifo_dump,
};
/cls_rsvp6.c
0,0 → 1,43
/*
* net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
 
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
 
#define RSVP_DST_LEN 4
#define RSVP_ID "rsvp6"
#define RSVP_OPS cls_rsvp6_ops
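 
/*
 * The classifier body lives in cls_rsvp.h and is shared with the IPv4
 * variant in cls_rsvp.c; the macros above only select the IPv6
 * parameters (note RSVP_DST_LEN is 4 here versus 1 for IPv4).
 */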
 
#include "cls_rsvp.h"
MODULE_LICENSE("GPL");
/sch_sfq.c
0,0 → 1,502
/*
* net/sched/sch_sfq.c Stochastic Fairness Queueing discipline.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
 
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <linux/init.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
 
 
/* Stochastic Fairness Queuing algorithm.
=======================================
 
Source:
Paul E. McKenney "Stochastic Fairness Queuing",
IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
 
Paul E. McKenney "Stochastic Fairness Queuing",
"Interworking: Research and Experience", v.2, 1991, p.113-131.
 
 
See also:
M. Shreedhar and George Varghese "Efficient Fair
Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
 
 
This is not the thing that is usually called (W)FQ nowadays.
It does not use any timestamp mechanism, but instead
processes queues in round-robin order.
 
ADVANTAGE:
 
- It is very cheap. Both CPU and memory requirements are minimal.
 
DRAWBACKS:
 
- "Stochastic" -> It is not 100% fair.
When hash collisions occur, several flows are considered as one.
 
- "Round-robin" -> It introduces larger delays than virtual clock
based schemes, and should not be used for isolating interactive
traffic from non-interactive traffic. This means that this scheduler
should be used as a leaf of CBQ or P3, which put interactive traffic
into a higher priority band.
 
We still need true WFQ for top level CSZ, but using WFQ
for the best effort traffic is absolutely pointless:
SFQ is superior for this purpose.
 
IMPLEMENTATION:
This implementation limits maximal queue length to 128;
maximal mtu to 2^15-1; number of hash buckets to 1024.
The only goal of these restrictions was that all data
fit into one 4K page :-). Struct sfq_sched_data is
organized in an anti-cache manner: all the data for a bucket
are scattered over different locations. This is not good,
but it allowed me to put it into 4K.
 
It is easy to increase these values, but not in flight. */
 
#define SFQ_DEPTH 128
#define SFQ_HASH_DIVISOR 1024
 
/* This type should contain at least SFQ_DEPTH*2 values */
typedef unsigned char sfq_index;
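 
/*
 * Index values 0..SFQ_DEPTH-1 name the slots themselves, while values
 * SFQ_DEPTH..2*SFQ_DEPTH-1 name the per-depth list heads in dep[]
 * below.  SFQ_DEPTH also doubles as the "empty" marker: ht[hash] ==
 * SFQ_DEPTH means no slot is bound to that hash bucket, and tail ==
 * SFQ_DEPTH means no flow is active.
 */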
 
struct sfq_head
{
sfq_index next;
sfq_index prev;
};
 
struct sfq_sched_data
{
/* Parameters */
int perturb_period;
unsigned quantum; /* Allotment per round: MUST BE >= MTU */
int limit;
 
/* Variables */
struct timer_list perturb_timer;
int perturbation;
sfq_index tail; /* Index of current slot in round */
sfq_index max_depth; /* Maximal depth */
 
sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */
sfq_index next[SFQ_DEPTH]; /* Active slots link */
short allot[SFQ_DEPTH]; /* Current allotment per slot */
unsigned short hash[SFQ_DEPTH]; /* Hash value indexed by slots */
struct sk_buff_head qs[SFQ_DEPTH]; /* Slot queue */
struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */
};
 
static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
int pert = q->perturbation;
 
/* Have we any rotation primitives? If not, WHY? */
h ^= (h1<<pert) ^ (h1>>(0x1F - pert));
h ^= h>>10;
return h & 0x3FF;
}
 
static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
u32 h, h2;
 
switch (skb->protocol) {
case __constant_htons(ETH_P_IP):
{
struct iphdr *iph = skb->nh.iph;
h = iph->daddr;
h2 = iph->saddr^iph->protocol;
if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
(iph->protocol == IPPROTO_TCP ||
iph->protocol == IPPROTO_UDP ||
iph->protocol == IPPROTO_ESP))
h2 ^= *(((u32*)iph) + iph->ihl);
break;
}
case __constant_htons(ETH_P_IPV6):
{
struct ipv6hdr *iph = skb->nh.ipv6h;
h = iph->daddr.s6_addr32[3];
h2 = iph->saddr.s6_addr32[3]^iph->nexthdr;
if (iph->nexthdr == IPPROTO_TCP ||
iph->nexthdr == IPPROTO_UDP ||
iph->nexthdr == IPPROTO_ESP)
h2 ^= *(u32*)&iph[1];
break;
}
default:
h = (u32)(unsigned long)skb->dst^skb->protocol;
h2 = (u32)(unsigned long)skb->sk;
}
return sfq_fold_hash(q, h, h2);
}
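 
/*
 * Depth bookkeeping used by sfq_link()/sfq_inc()/sfq_dec() below:
 * q->dep[] holds doubly linked lists of slots keyed by queue length.
 * dep[SFQ_DEPTH + d] is the head of the list of slots whose queue
 * holds exactly d packets, so q->dep[SFQ_DEPTH].next is always a free
 * slot (used by sfq_enqueue) and q->dep[q->max_depth + SFQ_DEPTH].next
 * is one of the longest slots (used by sfq_drop).
 */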
 
extern __inline__ void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
sfq_index p, n;
int d = q->qs[x].qlen + SFQ_DEPTH;
 
p = d;
n = q->dep[d].next;
q->dep[x].next = n;
q->dep[x].prev = p;
q->dep[p].next = q->dep[n].prev = x;
}
 
extern __inline__ void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
sfq_index p, n;
 
n = q->dep[x].next;
p = q->dep[x].prev;
q->dep[p].next = n;
q->dep[n].prev = p;
 
if (n == p && q->max_depth == q->qs[x].qlen + 1)
q->max_depth--;
 
sfq_link(q, x);
}
 
extern __inline__ void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
sfq_index p, n;
int d;
 
n = q->dep[x].next;
p = q->dep[x].prev;
q->dep[p].next = n;
q->dep[n].prev = p;
d = q->qs[x].qlen;
if (q->max_depth < d)
q->max_depth = d;
 
sfq_link(q, x);
}
 
static unsigned int sfq_drop(struct Qdisc *sch)
{
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data;
sfq_index d = q->max_depth;
struct sk_buff *skb;
unsigned int len;
 
/* Queue is full! Find the longest slot and
drop a packet from it */
 
if (d > 1) {
sfq_index x = q->dep[d+SFQ_DEPTH].next;
skb = q->qs[x].prev;
len = skb->len;
__skb_unlink(skb, &q->qs[x]);
kfree_skb(skb);
sfq_dec(q, x);
sch->q.qlen--;
sch->stats.drops++;
return len;
}
 
if (d == 1) {
/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
d = q->next[q->tail];
q->next[q->tail] = q->next[d];
q->allot[q->next[d]] += q->quantum;
skb = q->qs[d].prev;
len = skb->len;
__skb_unlink(skb, &q->qs[d]);
kfree_skb(skb);
sfq_dec(q, d);
sch->q.qlen--;
q->ht[q->hash[d]] = SFQ_DEPTH;
sch->stats.drops++;
return len;
}
 
return 0;
}
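 
/*
 * Active slots form a circular singly linked list through q->next[],
 * with q->tail marking the end of the current round: sfq_dequeue()
 * serves q->next[q->tail], and a newly activated flow is linked in
 * right after the tail.
 */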
 
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data;
unsigned hash = sfq_hash(q, skb);
sfq_index x;
 
x = q->ht[hash];
if (x == SFQ_DEPTH) {
q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
q->hash[x] = hash;
}
__skb_queue_tail(&q->qs[x], skb);
sfq_inc(q, x);
if (q->qs[x].qlen == 1) { /* The flow is new */
if (q->tail == SFQ_DEPTH) { /* It is the first flow */
q->tail = x;
q->next[x] = x;
q->allot[x] = q->quantum;
} else {
q->next[x] = q->next[q->tail];
q->next[q->tail] = x;
q->tail = x;
}
}
if (++sch->q.qlen < q->limit-1) {
sch->stats.bytes += skb->len;
sch->stats.packets++;
return 0;
}
 
sfq_drop(sch);
return NET_XMIT_CN;
}
 
static int
sfq_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data;
unsigned hash = sfq_hash(q, skb);
sfq_index x;
 
x = q->ht[hash];
if (x == SFQ_DEPTH) {
q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
q->hash[x] = hash;
}
__skb_queue_head(&q->qs[x], skb);
sfq_inc(q, x);
if (q->qs[x].qlen == 1) { /* The flow is new */
if (q->tail == SFQ_DEPTH) { /* It is the first flow */
q->tail = x;
q->next[x] = x;
q->allot[x] = q->quantum;
} else {
q->next[x] = q->next[q->tail];
q->next[q->tail] = x;
q->tail = x;
}
}
if (++sch->q.qlen < q->limit - 1)
return 0;
 
sch->stats.drops++;
sfq_drop(sch);
return NET_XMIT_CN;
}
 
 
 
 
static struct sk_buff *
sfq_dequeue(struct Qdisc* sch)
{
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data;
struct sk_buff *skb;
sfq_index a, old_a;
 
/* No active slots */
if (q->tail == SFQ_DEPTH)
return NULL;
 
a = old_a = q->next[q->tail];
 
/* Grab packet */
skb = __skb_dequeue(&q->qs[a]);
sfq_dec(q, a);
sch->q.qlen--;
 
/* Is the slot empty? */
if (q->qs[a].qlen == 0) {
q->ht[q->hash[a]] = SFQ_DEPTH;
a = q->next[a];
if (a == old_a) {
q->tail = SFQ_DEPTH;
return skb;
}
q->next[q->tail] = a;
q->allot[a] += q->quantum;
} else if ((q->allot[a] -= skb->len) <= 0) {
q->tail = a;
a = q->next[a];
q->allot[a] += q->quantum;
}
return skb;
}
 
static void
sfq_reset(struct Qdisc* sch)
{
struct sk_buff *skb;
 
while ((skb = sfq_dequeue(sch)) != NULL)
kfree_skb(skb);
}
 
static void sfq_perturbation(unsigned long arg)
{
struct Qdisc *sch = (struct Qdisc*)arg;
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data;
 
q->perturbation = net_random()&0x1F;
q->perturb_timer.expires = jiffies + q->perturb_period;
 
if (q->perturb_period) {
q->perturb_timer.expires = jiffies + q->perturb_period;
add_timer(&q->perturb_timer);
}
}
 
static int sfq_change(struct Qdisc *sch, struct rtattr *opt)
{
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data;
struct tc_sfq_qopt *ctl = RTA_DATA(opt);
 
if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
return -EINVAL;
 
sch_tree_lock(sch);
q->quantum = ctl->quantum ? : psched_mtu(sch->dev);
q->perturb_period = ctl->perturb_period*HZ;
if (ctl->limit)
q->limit = min_t(u32, ctl->limit, SFQ_DEPTH);
 
while (sch->q.qlen >= q->limit-1)
sfq_drop(sch);
 
del_timer(&q->perturb_timer);
if (q->perturb_period) {
q->perturb_timer.expires = jiffies + q->perturb_period;
add_timer(&q->perturb_timer);
}
sch_tree_unlock(sch);
return 0;
}
 
static int sfq_init(struct Qdisc *sch, struct rtattr *opt)
{
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data;
int i;
 
q->perturb_timer.data = (unsigned long)sch;
q->perturb_timer.function = sfq_perturbation;
init_timer(&q->perturb_timer);
 
for (i=0; i<SFQ_HASH_DIVISOR; i++)
q->ht[i] = SFQ_DEPTH;
for (i=0; i<SFQ_DEPTH; i++) {
skb_queue_head_init(&q->qs[i]);
q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH;
q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH;
}
q->limit = SFQ_DEPTH;
q->max_depth = 0;
q->tail = SFQ_DEPTH;
if (opt == NULL) {
q->quantum = psched_mtu(sch->dev);
q->perturb_period = 0;
} else {
int err = sfq_change(sch, opt);
if (err)
return err;
}
for (i=0; i<SFQ_DEPTH; i++)
sfq_link(q, i);
MOD_INC_USE_COUNT;
return 0;
}
 
static void sfq_destroy(struct Qdisc *sch)
{
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data;
del_timer(&q->perturb_timer);
MOD_DEC_USE_COUNT;
}
 
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data;
unsigned char *b = skb->tail;
struct tc_sfq_qopt opt;
 
opt.quantum = q->quantum;
opt.perturb_period = q->perturb_period/HZ;
 
opt.limit = q->limit;
opt.divisor = SFQ_HASH_DIVISOR;
opt.flows = q->limit;
 
RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
 
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
struct Qdisc_ops sfq_qdisc_ops =
{
NULL,
NULL,
"sfq",
sizeof(struct sfq_sched_data),
 
sfq_enqueue,
sfq_dequeue,
sfq_requeue,
sfq_drop,
 
sfq_init,
sfq_reset,
sfq_destroy,
NULL, /* sfq_change */
 
sfq_dump,
};
 
#ifdef MODULE
int init_module(void)
{
return register_qdisc(&sfq_qdisc_ops);
}
 
void cleanup_module(void)
{
unregister_qdisc(&sfq_qdisc_ops);
}
#endif
MODULE_LICENSE("GPL");
/Config.in
0,0 → 1,42
#
# Traffic control configuration.
#
tristate ' CBQ packet scheduler' CONFIG_NET_SCH_CBQ
tristate ' HTB packet scheduler' CONFIG_NET_SCH_HTB
tristate ' CSZ packet scheduler' CONFIG_NET_SCH_CSZ
#tristate ' H-PFQ packet scheduler' CONFIG_NET_SCH_HPFQ
tristate ' H-FSC packet scheduler' CONFIG_NET_SCH_HFSC
if [ "$CONFIG_ATM" = "y" -o "$CONFIG_ATM" = "m" ]; then
dep_tristate ' ATM pseudo-scheduler' CONFIG_NET_SCH_ATM $CONFIG_ATM
fi
tristate ' The simplest PRIO pseudoscheduler' CONFIG_NET_SCH_PRIO
tristate ' RED queue' CONFIG_NET_SCH_RED
tristate ' SFQ queue' CONFIG_NET_SCH_SFQ
tristate ' TEQL queue' CONFIG_NET_SCH_TEQL
tristate ' TBF queue' CONFIG_NET_SCH_TBF
tristate ' GRED queue' CONFIG_NET_SCH_GRED
tristate ' Network delay simulator' CONFIG_NET_SCH_DELAY
tristate ' Diffserv field marker' CONFIG_NET_SCH_DSMARK
if [ "$CONFIG_NETFILTER" = "y" ]; then
tristate ' Ingress Qdisc' CONFIG_NET_SCH_INGRESS
fi
bool ' QoS support' CONFIG_NET_QOS
if [ "$CONFIG_NET_QOS" = "y" ]; then
bool ' Rate estimator' CONFIG_NET_ESTIMATOR
fi
bool ' Packet classifier API' CONFIG_NET_CLS
if [ "$CONFIG_NET_CLS" = "y" ]; then
tristate ' TC index classifier' CONFIG_NET_CLS_TCINDEX
tristate ' Routing table based classifier' CONFIG_NET_CLS_ROUTE4
if [ "$CONFIG_NET_CLS_ROUTE4" != "n" ]; then
define_bool CONFIG_NET_CLS_ROUTE y
fi
tristate ' Firewall based classifier' CONFIG_NET_CLS_FW
tristate ' U32 classifier' CONFIG_NET_CLS_U32
if [ "$CONFIG_NET_QOS" = "y" ]; then
tristate ' Special RSVP classifier' CONFIG_NET_CLS_RSVP
tristate ' Special RSVP classifier for IPv6' CONFIG_NET_CLS_RSVP6
bool ' Traffic policing (needed for in/egress)' CONFIG_NET_CLS_POLICE
fi
fi
 
/cls_rsvp.c
0,0 → 1,42
/*
* net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
 
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
 
#define RSVP_DST_LEN 1
#define RSVP_ID "rsvp"
#define RSVP_OPS cls_rsvp_ops
 
#include "cls_rsvp.h"
MODULE_LICENSE("GPL");
/sch_delay.c
0,0 → 1,277
/*
* net/sched/sch_delay.c Simple constant delay
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Stephen Hemminger <shemminger@osdl.org>
*/
 
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
 
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
 
/* Network delay simulator
This scheduler adds a fixed delay to all packets.
Similar to NISTnet and BSD Dummynet.
 
It uses a byte fifo underneath, similar to TBF */
struct dly_sched_data {
u32 latency;
u32 limit;
struct timer_list timer;
struct Qdisc *qdisc;
};
 
/* Time stamp put into socket buffer control block */
struct dly_skb_cb {
psched_time_t queuetime;
};
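 
/*
 * Each packet is stamped with its arrival time in skb->cb on enqueue;
 * dly_dequeue() hands it out only once q->latency has elapsed,
 * otherwise the packet is put back into the child fifo and the qdisc
 * is throttled until the watchdog timer fires.
 */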
 
/* Enqueue packets with underlying discipline (fifo)
* but mark them with current time first.
*/
static int dly_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct dly_sched_data *q = (struct dly_sched_data *)sch->data;
struct dly_skb_cb *cb = (struct dly_skb_cb *)skb->cb;
int ret;
 
PSCHED_GET_TIME(cb->queuetime);
 
/* Queue to underlying scheduler */
ret = q->qdisc->enqueue(skb, q->qdisc);
if (ret)
sch->stats.drops++;
else {
sch->q.qlen++;
sch->stats.bytes += skb->len;
sch->stats.packets++;
}
return 0;
}
 
/* Requeue packets but don't change time stamp */
static int dly_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
struct dly_sched_data *q = (struct dly_sched_data *)sch->data;
int ret;
 
ret = q->qdisc->ops->requeue(skb, q->qdisc);
if (ret == 0)
sch->q.qlen++;
return ret;
}
 
static unsigned int dly_drop(struct Qdisc *sch)
{
struct dly_sched_data *q = (struct dly_sched_data *)sch->data;
unsigned int len;
 
len = q->qdisc->ops->drop(q->qdisc);
if (len) {
sch->q.qlen--;
sch->stats.drops++;
}
return len;
}
 
/* Dequeue packet.
* If the packet needs to be held up, then stop the
* queue and set a timer to wake up later.
*/
static struct sk_buff *dly_dequeue(struct Qdisc *sch)
{
struct dly_sched_data *q = (struct dly_sched_data *)sch->data;
struct sk_buff *skb = q->qdisc->dequeue(q->qdisc);
 
if (skb) {
struct dly_skb_cb *cb = (struct dly_skb_cb *)skb->cb;
psched_time_t now;
long diff;
 
PSCHED_GET_TIME(now);
diff = q->latency - PSCHED_TDIFF(now, cb->queuetime);
 
if (diff <= 0) {
sch->q.qlen--;
sch->flags &= ~TCQ_F_THROTTLED;
return skb;
}
 
if (!netif_queue_stopped(sch->dev)) {
long delay = PSCHED_US2JIFFIE(diff);
if (delay <= 0)
delay = 1;
mod_timer(&q->timer, jiffies+delay);
}
 
if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
sch->q.qlen--;
sch->stats.drops++;
}
sch->flags |= TCQ_F_THROTTLED;
}
return NULL;
}
 
static void dly_reset(struct Qdisc *sch)
{
struct dly_sched_data *q = (struct dly_sched_data *)sch->data;
 
qdisc_reset(q->qdisc);
sch->q.qlen = 0;
sch->flags &= ~TCQ_F_THROTTLED;
del_timer(&q->timer);
}
 
static void dly_timer(unsigned long arg)
{
struct Qdisc *sch = (struct Qdisc *)arg;
 
sch->flags &= ~TCQ_F_THROTTLED;
netif_schedule(sch->dev);
}
 
/* Tell Fifo the new limit. */
static int change_limit(struct Qdisc *q, u32 limit)
{
struct rtattr *rta;
int ret;
 
rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
if (!rta)
return -ENOMEM;
 
rta->rta_type = RTM_NEWQDISC;
((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;
ret = q->ops->change(q, rta);
kfree(rta);
 
return ret;
}
 
/* Setup underlying FIFO discipline */
static int dly_change(struct Qdisc *sch, struct rtattr *opt)
{
struct dly_sched_data *q = (struct dly_sched_data *)sch->data;
struct tc_dly_qopt *qopt = RTA_DATA(opt);
int err;
 
if (q->qdisc == &noop_qdisc) {
struct Qdisc *child
= qdisc_create_dflt(sch->dev, &bfifo_qdisc_ops);
if (!child)
return -EINVAL;
q->qdisc = child;
}
 
err = change_limit(q->qdisc, qopt->limit);
if (err) {
qdisc_destroy(q->qdisc);
q->qdisc = &noop_qdisc;
} else {
q->latency = qopt->latency;
q->limit = qopt->limit;
}
return err;
}
 
static int dly_init(struct Qdisc *sch, struct rtattr *opt)
{
struct dly_sched_data *q = (struct dly_sched_data *)sch->data;
int err;
 
if (!opt)
return -EINVAL;
 
MOD_INC_USE_COUNT;
 
init_timer(&q->timer);
q->timer.function = dly_timer;
q->timer.data = (unsigned long) sch;
q->qdisc = &noop_qdisc;
 
err = dly_change(sch, opt);
if (err)
MOD_DEC_USE_COUNT;
 
return err;
}
 
static void dly_destroy(struct Qdisc *sch)
{
struct dly_sched_data *q = (struct dly_sched_data *)sch->data;
 
del_timer(&q->timer);
qdisc_destroy(q->qdisc);
q->qdisc = &noop_qdisc;
 
MOD_DEC_USE_COUNT;
}
 
static int dly_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct dly_sched_data *q = (struct dly_sched_data *)sch->data;
unsigned char *b = skb->tail;
struct tc_dly_qopt qopt;
 
qopt.latency = q->latency;
qopt.limit = q->limit;
 
RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
 
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
struct Qdisc_ops dly_qdisc_ops = {
.id = "delay",
.priv_size = sizeof(struct dly_sched_data),
.enqueue = dly_enqueue,
.dequeue = dly_dequeue,
.requeue = dly_requeue,
.drop = dly_drop,
.init = dly_init,
.reset = dly_reset,
.destroy = dly_destroy,
.change = dly_change,
.dump = dly_dump,
};
 
#ifdef MODULE
int init_module(void)
{
return register_qdisc(&dly_qdisc_ops);
}
 
void cleanup_module(void)
{
unregister_qdisc(&dly_qdisc_ops);
}
#endif
MODULE_LICENSE("GPL");
/Makefile
0,0 → 1,36
#
# Makefile for the Linux Traffic Control Unit.
#
 
O_TARGET := sched.o
 
obj-y := sch_generic.o
 
 
obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o
obj-$(CONFIG_NET_ESTIMATOR) += estimator.o
obj-$(CONFIG_NET_CLS) += cls_api.o
obj-$(CONFIG_NET_CLS_POLICE) += police.o
obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
obj-$(CONFIG_NET_SCH_CSZ) += sch_csz.o
obj-$(CONFIG_NET_SCH_DELAY) += sch_delay.o
obj-$(CONFIG_NET_SCH_HPFQ) += sch_hpfq.o
obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o
obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
obj-$(CONFIG_NET_SCH_RED) += sch_red.o
obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o
obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o
obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o
obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o
obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
 
include $(TOPDIR)/Rules.make
/sch_csz.c
0,0 → 1,1069
/*
* net/sched/sch_csz.c Clark-Shenker-Zhang scheduler.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
*/
 
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
 
 
/* Clark-Shenker-Zhang algorithm.
=======================================
 
SOURCE.
 
David D. Clark, Scott Shenker and Lixia Zhang
"Supporting Real-Time Applications in an Integrated Services Packet
Network: Architecture and Mechanism".
 
CBQ presents a flexible universal algorithm for packet scheduling,
but it has pretty poor delay characteristics.
Round-robin scheduling and link-sharing goals
apparently contradict minimization of network delay and jitter.
Moreover, correct handling of predictive flows seems to be
impossible in CBQ.
 
CSZ presents a more precise but less flexible and less efficient
approach. As I understand it, the main idea is to create
WFQ flows for each guaranteed service and to allocate
the rest of bandwidth to dummy flow-0. Flow-0 comprises
the predictive services and the best effort traffic;
it is handled by a priority scheduler with the highest
priority band allocated for predictive services, and the rest ---
to the best effort packets.
 
Note that in CSZ flows are NOT limited to their bandwidth. It
is supposed that the flow passed admission control at the edge
of the QoS network and it doesn't need further shaping. Any
attempt to improve the flow or to shape it to a token bucket
at intermediate hops will introduce undesired delays and raise
jitter.
 
At the moment CSZ is the only scheduler that provides
true guaranteed service. Other schemes (including CBQ)
do not provide guaranteed delay and randomize jitter.
There is a proof (Sally Floyd) that delay
can be estimated by an IntServ-compliant formula.
This result is true formally, but it is wrong in principle.
It takes into account only round-robin delays,
ignoring delays introduced by link sharing i.e. overlimiting.
Note that temporary overlimits are inevitable because
real links are not ideal, and the real algorithm must take this
into account.
 
ALGORITHM.
 
--- Notations.
 
$B$ is link bandwidth (bits/sec).
 
$I$ is set of all flows, including flow $0$.
Every flow $a \in I$ has associated bandwidth slice $r_a < 1$ and
$\sum_{a \in I} r_a = 1$.
 
--- Flow model.
 
Let $m_a$ be the number of backlogged bits in flow $a$.
The flow is {\em active}, if $m_a > 0$.
This number is a discontinuous function of time;
when a packet $i$ arrives:
\[
m_a(t_i+0) - m_a(t_i-0) = L^i,
\]
where $L^i$ is the length of the arrived packet.
The flow queue is drained continuously until $m_a == 0$:
\[
{d m_a \over dt} = - { B r_a \over \sum_{b \in A} r_b}.
\]
I.e. flow rates are their allocated rates, proportionally
scaled to take all available link bandwidth. Apparently,
this is not the only possible policy. E.g. CBQ classes
without borrowing would be modelled by:
\[
{d m_a \over dt} = - B r_a .
\]
More complicated hierarchical bandwidth allocation
policies are possible, but unfortunately, the basic
flow equations have a simple solution only for proportional
scaling.
 
--- Departure times.
 
We calculate the time until the last bit of the packet is sent:
\[
E_a^i(t) = { m_a(t_i) - \delta_a(t) \over r_a },
\]
where $\delta_a(t)$ is the number of bits drained since $t_i$.
We have to evaluate $E_a^i$ for all queued packets,
then find the packet with minimal $E_a^i$ and send it.
 
This sounds good, but direct implementation of the algorithm
is absolutely infeasible. Luckily, if flow rates
are scaled proportionally, the equations have a simple solution.
The differential equation for $E_a^i$ is
\[
{d E_a^i (t) \over dt } = - { d \delta_a(t) \over dt} { 1 \over r_a} =
- { B \over \sum_{b \in A} r_b}
\]
with initial condition
\[
E_a^i (t_i) = { m_a(t_i) \over r_a } .
\]
 
Let's introduce an auxiliary function $R(t)$:
 
--- Round number.
 
Consider the following model: we rotate over active flows,
sending $r_a B$ bits from every flow, so that we send
$B \sum_{a \in A} r_a$ bits per round, which takes
$\sum_{a \in A} r_a$ seconds.
Hence, $R(t)$ (round number) is a monotonically increasing
linear function of time when $A$ is not changed
\[
{ d R(t) \over dt } = { 1 \over \sum_{a \in A} r_a }
\]
and it is continuous when $A$ changes.
 
The central observation is that the quantity
$F_a^i = R(t) + E_a^i(t)/B$ does not depend on time at all!
$R(t)$ does not depend on flow, so that $F_a^i$ can be
calculated only once, on packet arrival, and we need not
recalculate the $E$ numbers or resort the queues.
The number $F_a^i$ is called the finish number of the packet.
It is just the value of $R(t)$ when the last bit of the packet
is sent out.
 
The maximal finish number in a flow is called the finish number of the flow,
and the minimal one is the "start number" of the flow.
Apparently, a flow is active if and only if $F_a \geq R$.
 
When a packet of length $L_i$ bits arrives at flow $a$ at time $t_i$,
we calculate $F_a^i$ as:
 
If flow was inactive ($F_a < R$):
$F_a^i = R(t) + {L_i \over B r_a}$
otherwise
$F_a^i = F_a + {L_i \over B r_a}$
 
These equations complete the algorithm specification.
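 
For illustration (numbers chosen arbitrarily, not taken from the
code): with $B = 10^6$ bit/sec and $r_a = 0.25$, a packet of
$L_i = 1000$ bits gives $L_i / (B r_a) = 1000 / 250000 = 4$ msec.
If the flow is inactive on arrival, $F_a^i = R(t) + 0.004$;
if a second such packet arrives while the flow is still active,
its finish number is simply $F_a + 0.004$.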
 
It looks pretty hairy, but there is a simple
procedure for solving these equations.
See procedure csz_update(), which is a generalization of
the algorithm from Chapter 3 of S. Keshav's thesis,
"Efficient Implementation of Fair Queueing".
 
NOTES.
 
* We implement only the simplest variant of CSZ,
where flow-0 is an explicit 4-band priority fifo.
This is bad, but we would need a "peek" operation in addition
to "dequeue" to implement complete CSZ.
I do not want to do that, unless it is absolutely
necessary.
* Primitive support for token bucket filtering
is present too. It directly contradicts CSZ, but
even though the Internet is on the globe ... :-)
"the edges of the network" really exist.
BUGS.
 
* Fixed point arithmetic is overcomplicated, suboptimal and even
wrong. Check it later. */
 
 
/* This number is arbitrary */
 
#define CSZ_GUARANTEED 16
#define CSZ_FLOWS (CSZ_GUARANTEED+4)
 
struct csz_head
{
struct csz_head *snext;
struct csz_head *sprev;
struct csz_head *fnext;
struct csz_head *fprev;
};
 
struct csz_flow
{
struct csz_head *snext;
struct csz_head *sprev;
struct csz_head *fnext;
struct csz_head *fprev;
 
/* Parameters */
struct tc_ratespec rate;
struct tc_ratespec slice;
u32 *L_tab; /* Lookup table for L/(B*r_a) values */
unsigned long limit; /* Maximal length of queue */
#ifdef CSZ_PLUS_TBF
struct tc_ratespec peakrate;
__u32 buffer; /* Depth of token bucket, normalized
as L/(B*r_a) */
__u32 mtu;
#endif
 
/* Variables */
#ifdef CSZ_PLUS_TBF
unsigned long tokens; /* Tokens number: usecs */
psched_time_t t_tbf;
unsigned long R_tbf;
int throttled;
#endif
unsigned peeked;
unsigned long start; /* Finish number of the first skb */
unsigned long finish; /* Finish number of the flow */
 
struct sk_buff_head q; /* FIFO queue */
};
 
#define L2R(f,L) ((f)->L_tab[(L)>>(f)->slice.cell_log])
 
struct csz_sched_data
{
/* Parameters */
unsigned char rate_log; /* fixed point position for rate;
* really we do not need it */
unsigned char R_log; /* fixed point position for round number */
unsigned char delta_log; /* 1<<delta_log is maximal timeout in usecs;
* 21 <-> 2.1sec is MAXIMAL value */
 
/* Variables */
struct tcf_proto *filter_list;
u8 prio2band[TC_PRIO_MAX+1];
#ifdef CSZ_PLUS_TBF
struct timer_list wd_timer;
long wd_expires;
#endif
psched_time_t t_c; /* Time check-point */
unsigned long R_c; /* R-number check-point */
unsigned long rate; /* Current sum of rates of active flows */
struct csz_head s; /* Flows sorted by "start" */
struct csz_head f; /* Flows sorted by "finish" */
 
struct sk_buff_head other[4];/* Predicted (0) and the best efforts
classes (1,2,3) */
struct csz_flow flow[CSZ_GUARANTEED]; /* Array of flows */
};
 
/* These routines (csz_insert_finish and csz_insert_start) are
the most time consuming part of the whole algorithm.
 
We insert into a sorted list, so that the time
is linear in the number of active flows in the worst case.
Note that we do not have a very large number of guaranteed flows,
so logarithmic algorithms (heap etc.) are useless;
they are slower than a linear one when the list length is <= 32.
 
A heap would make sense if we used WFQ for best-effort
flows, but SFQ is the better choice in this case.
*/
 
 
/* Insert flow "this" to the list "b" before
flow with greater finish number.
*/
 
#if 0
/* Scan forward */
extern __inline__ void csz_insert_finish(struct csz_head *b,
struct csz_flow *this)
{
struct csz_head *f = b->fnext;
unsigned long finish = this->finish;
 
while (f != b) {
if (((struct csz_flow*)f)->finish - finish > 0)
break;
f = f->fnext;
}
this->fnext = f;
this->fprev = f->fprev;
this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this;
}
#else
/* Scan backward */
extern __inline__ void csz_insert_finish(struct csz_head *b,
struct csz_flow *this)
{
struct csz_head *f = b->fprev;
unsigned long finish = this->finish;
 
while (f != b) {
if (((struct csz_flow*)f)->finish - finish <= 0)
break;
f = f->fprev;
}
this->fnext = f->fnext;
this->fprev = f;
this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this;
}
#endif
 
/* Insert flow "this" to the list "b" before
flow with greater start number.
*/
 
extern __inline__ void csz_insert_start(struct csz_head *b,
struct csz_flow *this)
{
struct csz_head *f = b->snext;
unsigned long start = this->start;
 
while (f != b) {
if (((struct csz_flow*)f)->start - start > 0)
break;
f = f->snext;
}
this->snext = f;
this->sprev = f->sprev;
this->snext->sprev = this->sprev->snext = (struct csz_head*)this;
}
 
 
/* Calculate and return the current round number.
It is another time consuming part, but
it is impossible to avoid it.
 
It costs O(N), which makes the whole algorithm useful only
for playing with a close-to-ideal fluid model.
 
There exist less academic, but more practical, modifications
which might have even better characteristics (WF2Q+, HPFQ, HFSC).
*/
 
static unsigned long csz_update(struct Qdisc *sch)
{
struct csz_sched_data *q = (struct csz_sched_data*)sch->data;
struct csz_flow *a;
unsigned long F;
unsigned long tmp;
psched_time_t now;
unsigned long delay;
unsigned long R_c;
 
PSCHED_GET_TIME(now);
delay = PSCHED_TDIFF_SAFE(now, q->t_c, 0, goto do_reset);
 
if (delay>>q->delta_log) {
do_reset:
/* Delta is too large.
It is possible if MTU/BW > 1<<q->delta_log
(i.e. configuration error) or because of hardware
fault. We have no choice...
*/
qdisc_reset(sch);
return 0;
}
 
q->t_c = now;
 
for (;;) {
a = (struct csz_flow*)q->f.fnext;
 
/* No more active flows. Reset R and exit. */
if (a == (struct csz_flow*)&q->f) {
#ifdef CSZ_DEBUG
if (q->rate) {
printk("csz_update: rate!=0 on inactive csz\n");
q->rate = 0;
}
#endif
q->R_c = 0;
return 0;
}
 
F = a->finish;
 
#ifdef CSZ_DEBUG
if (q->rate == 0) {
printk("csz_update: rate=0 on active csz\n");
goto do_reset;
}
#endif
 
/*
* tmp = (t - q->t_c)/q->rate;
*/
 
tmp = ((delay<<(31-q->delta_log))/q->rate)>>(31-q->delta_log+q->R_log);
 
tmp += q->R_c;
 
/* OK, this flow (and all flows with greater
finish numbers) is still active */
if (F - tmp > 0)
break;
 
/* It is no longer active */
 
a->fprev->fnext = a->fnext;
a->fnext->fprev = a->fprev;
 
/*
* q->t_c += (F - q->R_c)*q->rate
*/
 
tmp = ((F-q->R_c)*q->rate)<<q->R_log;
R_c = F;
q->rate -= a->slice.rate;
 
if ((long)(delay - tmp) >= 0) {
delay -= tmp;
continue;
}
delay = 0;
}
 
q->R_c = tmp;
return tmp;
}
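 
/*
 * csz_classify() is still a stub: every packet is given flow id
 * CSZ_GUARANTEED, which csz_enqueue() below maps to flow 0 and queues
 * on the first band of q->other[].
 */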
 
unsigned csz_classify(struct sk_buff *skb, struct csz_sched_data *q)
{
return CSZ_GUARANTEED;
}
 
static int
csz_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
unsigned flow_id = csz_classify(skb, q);
unsigned long R;
int prio = 0;
struct csz_flow *this;
 
if (flow_id >= CSZ_GUARANTEED) {
prio = flow_id - CSZ_GUARANTEED;
flow_id = 0;
}
 
this = &q->flow[flow_id];
if (this->q.qlen >= this->limit || this->L_tab == NULL) {
sch->stats.drops++;
kfree_skb(skb);
return NET_XMIT_DROP;
}
 
R = csz_update(sch);
 
if ((long)(this->finish - R) >= 0) {
/* It was active */
this->finish += L2R(this,skb->len);
} else {
/* It is inactive; activate it */
this->finish = R + L2R(this,skb->len);
q->rate += this->slice.rate;
csz_insert_finish(&q->f, this);
}
 
/* If this flow was empty, remember start number
and insert it into start queue */
if (this->q.qlen == 0) {
this->start = this->finish;
csz_insert_start(&q->s, this);
}
if (flow_id)
skb_queue_tail(&this->q, skb);
else
skb_queue_tail(&q->other[prio], skb);
sch->q.qlen++;
sch->stats.bytes += skb->len;
sch->stats.packets++;
return 0;
}
 
static __inline__ struct sk_buff *
skb_dequeue_best(struct csz_sched_data * q)
{
int i;
struct sk_buff *skb;
 
for (i=0; i<4; i++) {
skb = skb_dequeue(&q->other[i]);
if (skb) {
q->flow[0].q.qlen--;
return skb;
}
}
return NULL;
}
 
static __inline__ struct sk_buff *
skb_peek_best(struct csz_sched_data * q)
{
int i;
struct sk_buff *skb;
 
for (i=0; i<4; i++) {
skb = skb_peek(&q->other[i]);
if (skb)
return skb;
}
return NULL;
}
 
#ifdef CSZ_PLUS_TBF
 
static void csz_watchdog(unsigned long arg)
{
struct Qdisc *sch = (struct Qdisc*)arg;
 
qdisc_wakeup(sch->dev);
}
 
static __inline__ void
csz_move_queue(struct csz_flow *this, long delta)
{
this->fprev->fnext = this->fnext;
this->fnext->fprev = this->fprev;
 
this->start += delta;
this->finish += delta;
 
csz_insert_finish(this);
}
 
static __inline__ int csz_enough_tokens(struct csz_sched_data *q,
struct csz_flow *this,
struct sk_buff *skb)
{
long toks;
long shift;
psched_time_t now;
 
PSCHED_GET_TIME(now);
 
toks = PSCHED_TDIFF(now, t_tbf) + this->tokens - L2R(q,this,skb->len);
 
shift = 0;
if (this->throttled) {
/* Remember the a posteriori delay */
 
unsigned long R = csz_update(q);
shift = R - this->R_tbf;
this->R_tbf = R;
}
 
if (toks >= 0) {
/* Now we have enough tokens to proceed */
 
this->tokens = toks <= this->depth ? toks : this->depth;
this->t_tbf = now;
if (!this->throttled)
return 1;
 
/* Flow was throttled. Update its start & finish numbers
with the delay calculated a posteriori.
*/
 
this->throttled = 0;
if (shift > 0)
csz_move_queue(this, shift);
return 1;
}
 
if (!this->throttled) {
/* Flow has just been throttled; remember the
current round number to calculate the a posteriori delay.
*/
this->throttled = 1;
this->R_tbf = csz_update(q);
}
 
/* Move the whole queue to the time when it will be allowed to send.
We should translate time to a round number, but that is impossible,
so we make the most conservative estimate, i.e. we suppose
that only this flow is active and, hence, R = t.
Really toks <= R <= toks/r_a.
 
This a priori shift in R will be adjusted later to reflect
the real delay. We cannot avoid it because:
- a throttled flow continues to be active from the viewpoint
of CSZ, so it would acquire the highest priority
if the start numbers were not adjusted;
- eventually, the finish number would become less than the round
number and the flow would be declared inactive.
*/
 
toks = -toks;
 
/* Remember that we should start the watchdog */
if (toks < q->wd_expires)
q->wd_expires = toks;
 
toks >>= q->R_log;
shift += toks;
if (shift > 0) {
this->R_tbf += toks;
csz_move_queue(this, shift);
}
csz_insert_start(this);
return 0;
}
#endif
 
 
static struct sk_buff *
csz_dequeue(struct Qdisc* sch)
{
struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
struct sk_buff *skb;
struct csz_flow *this;
 
#ifdef CSZ_PLUS_TBF
q->wd_expires = 0;
#endif
this = (struct csz_flow*)q->s.snext;
 
while (this != (struct csz_flow*)&q->s) {
 
/* First of all: unlink from start list */
this->sprev->snext = this->snext;
this->snext->sprev = this->sprev;
 
if (this != &q->flow[0]) { /* Guaranteed flow */
skb = __skb_dequeue(&this->q);
if (skb) {
#ifdef CSZ_PLUS_TBF
if (this->depth) {
if (!csz_enough_tokens(q, this, skb))
continue;
}
#endif
if (this->q.qlen) {
struct sk_buff *nskb = skb_peek(&this->q);
this->start += L2R(this,nskb->len);
csz_insert_start(&q->s, this);
}
sch->q.qlen--;
return skb;
}
} else { /* Predicted or best effort flow */
skb = skb_dequeue_best(q);
if (skb) {
unsigned peeked = this->peeked;
this->peeked = 0;
 
if (--this->q.qlen) {
struct sk_buff *nskb;
unsigned dequeued = L2R(this,skb->len);
 
/* We did not get the same thing that we
peeked earlier; adjust the start number
*/
if (peeked != dequeued && peeked)
this->start += dequeued - peeked;
 
nskb = skb_peek_best(q);
peeked = L2R(this,nskb->len);
this->start += peeked;
this->peeked = peeked;
csz_insert_start(&q->s, this);
}
sch->q.qlen--;
return skb;
}
}
}
#ifdef CSZ_PLUS_TBF
/* We are about to return no skb.
Schedule the watchdog timer if this happened because of shaping.
*/
if (q->wd_expires) {
unsigned long delay = PSCHED_US2JIFFIE(q->wd_expires);
if (delay == 0)
delay = 1;
mod_timer(&q->wd_timer, jiffies + delay);
sch->stats.overlimits++;
}
#endif
return NULL;
}
 
static void
csz_reset(struct Qdisc* sch)
{
struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
int i;
 
for (i=0; i<4; i++)
skb_queue_purge(&q->other[i]);
 
for (i=0; i<CSZ_GUARANTEED; i++) {
struct csz_flow *this = q->flow + i;
skb_queue_purge(&this->q);
this->snext = this->sprev =
this->fnext = this->fprev = (struct csz_head*)this;
this->start = this->finish = 0;
}
q->s.snext = q->s.sprev = &q->s;
q->f.fnext = q->f.fprev = &q->f;
q->R_c = 0;
#ifdef CSZ_PLUS_TBF
PSCHED_GET_TIME(&q->t_tbf);
q->tokens = q->depth;
del_timer(&q->wd_timer);
#endif
sch->q.qlen = 0;
}
 
static void
csz_destroy(struct Qdisc* sch)
{
struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
struct tcf_proto *tp;
 
while ((tp = q->filter_list) != NULL) {
q->filter_list = tp->next;
tcf_destroy(tp);
}
 
MOD_DEC_USE_COUNT;
}
 
static int csz_init(struct Qdisc *sch, struct rtattr *opt)
{
struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
struct rtattr *tb[TCA_CSZ_PTAB];
struct tc_csz_qopt *qopt;
int i;
 
rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt));
if (tb[TCA_CSZ_PARMS-1] == NULL ||
RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*qopt))
return -EINVAL;
qopt = RTA_DATA(tb[TCA_CSZ_PARMS-1]);
 
q->R_log = qopt->R_log;
q->delta_log = qopt->delta_log;
for (i=0; i<=TC_PRIO_MAX; i++) {
if (qopt->priomap[i] >= CSZ_FLOWS)
return -EINVAL;
q->prio2band[i] = qopt->priomap[i];
}
 
for (i=0; i<4; i++)
skb_queue_head_init(&q->other[i]);
 
for (i=0; i<CSZ_GUARANTEED; i++) {
struct csz_flow *this = q->flow + i;
skb_queue_head_init(&this->q);
this->snext = this->sprev =
this->fnext = this->fprev = (struct csz_head*)this;
this->start = this->finish = 0;
}
q->s.snext = q->s.sprev = &q->s;
q->f.fnext = q->f.fprev = &q->f;
q->R_c = 0;
#ifdef CSZ_PLUS_TBF
init_timer(&q->wd_timer);
q->wd_timer.data = (unsigned long)sch;
q->wd_timer.function = csz_watchdog;
#endif
MOD_INC_USE_COUNT;
return 0;
}
 
static int csz_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
unsigned char *b = skb->tail;
struct rtattr *rta;
struct tc_csz_qopt opt;
 
rta = (struct rtattr*)b;
RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
 
opt.flows = CSZ_FLOWS;
memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt);
rta->rta_len = skb->tail - b;
 
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
static int csz_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
struct Qdisc **old)
{
return -EINVAL;
}
 
static struct Qdisc * csz_leaf(struct Qdisc *sch, unsigned long cl)
{
return NULL;
}
 
 
static unsigned long csz_get(struct Qdisc *sch, u32 classid)
{
struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
unsigned long band = TC_H_MIN(classid) - 1;
 
if (band >= CSZ_FLOWS)
return 0;
 
if (band < CSZ_GUARANTEED && q->flow[band].L_tab == NULL)
return 0;
 
return band+1;
}
 
static unsigned long csz_bind(struct Qdisc *sch, unsigned long parent, u32 classid)
{
return csz_get(sch, classid);
}
 
 
static void csz_put(struct Qdisc *sch, unsigned long cl)
{
return;
}
 
static int csz_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg)
{
unsigned long cl = *arg;
struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
struct rtattr *opt = tca[TCA_OPTIONS-1];
struct rtattr *tb[TCA_CSZ_PTAB];
struct tc_csz_copt *copt;
 
rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt));
if (tb[TCA_CSZ_PARMS-1] == NULL ||
RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*copt))
return -EINVAL;
copt = RTA_DATA(tb[TCA_CSZ_PARMS-1]);
 
if (tb[TCA_CSZ_RTAB-1] &&
RTA_PAYLOAD(tb[TCA_CSZ_RTAB-1]) < 1024)
return -EINVAL;
 
if (cl) {
struct csz_flow *a;
cl--;
if (cl >= CSZ_FLOWS)
return -ENOENT;
if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL)
return -EINVAL;
 
a = &q->flow[cl];
 
spin_lock_bh(&sch->dev->queue_lock);
#if 0
a->rate_log = copt->rate_log;
#endif
#ifdef CSZ_PLUS_TBF
a->limit = copt->limit;
a->rate = copt->rate;
a->buffer = copt->buffer;
a->mtu = copt->mtu;
#endif
 
if (tb[TCA_CSZ_RTAB-1])
memcpy(a->L_tab, RTA_DATA(tb[TCA_CSZ_RTAB-1]), 1024);
 
spin_unlock_bh(&sch->dev->queue_lock);
return 0;
}
/* NI */
return 0;
}
 
static int csz_delete(struct Qdisc *sch, unsigned long cl)
{
struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
struct csz_flow *a;
 
cl--;
 
if (cl >= CSZ_FLOWS)
return -ENOENT;
if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL)
return -EINVAL;
 
a = &q->flow[cl];
 
spin_lock_bh(&sch->dev->queue_lock);
a->fprev->fnext = a->fnext;
a->fnext->fprev = a->fprev;
a->sprev->snext = a->snext;
a->snext->sprev = a->sprev;
a->start = a->finish = 0;
kfree(xchg(&q->flow[cl].L_tab, NULL));
spin_unlock_bh(&sch->dev->queue_lock);
 
return 0;
}
 
static int csz_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm)
{
struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
unsigned char *b = skb->tail;
struct rtattr *rta;
struct tc_csz_copt opt;
 
tcm->tcm_handle = sch->handle|cl;
 
cl--;
 
if (cl > CSZ_FLOWS)
goto rtattr_failure;
 
if (cl < CSZ_GUARANTEED) {
struct csz_flow *f = &q->flow[cl];
 
if (f->L_tab == NULL)
goto rtattr_failure;
 
rta = (struct rtattr*)b;
RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
 
opt.limit = f->limit;
opt.rate = f->rate;
opt.slice = f->slice;
memset(&opt.peakrate, 0, sizeof(opt.peakrate));
#ifdef CSZ_PLUS_TBF
opt.buffer = f->buffer;
opt.mtu = f->mtu;
#else
opt.buffer = 0;
opt.mtu = 0;
#endif
 
RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt);
rta->rta_len = skb->tail - b;
}
 
return skb->len;
 
rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
 
static void csz_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
int prio = 0;
 
if (arg->stop)
return;
 
for (prio = 0; prio < CSZ_FLOWS; prio++) {
if (arg->count < arg->skip) {
arg->count++;
continue;
}
if (prio < CSZ_GUARANTEED && q->flow[prio].L_tab == NULL) {
arg->count++;
continue;
}
if (arg->fn(sch, prio+1, arg) < 0) {
arg->stop = 1;
break;
}
arg->count++;
}
}
 
static struct tcf_proto ** csz_find_tcf(struct Qdisc *sch, unsigned long cl)
{
struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
 
if (cl)
return NULL;
 
return &q->filter_list;
}
 
struct Qdisc_class_ops csz_class_ops =
{
csz_graft,
csz_leaf,
 
csz_get,
csz_put,
csz_change,
csz_delete,
csz_walk,
 
csz_find_tcf,
csz_bind,
csz_put,
 
csz_dump_class,
};
 
struct Qdisc_ops csz_qdisc_ops =
{
NULL,
&csz_class_ops,
"csz",
sizeof(struct csz_sched_data),
 
csz_enqueue,
csz_dequeue,
NULL,
NULL,
 
csz_init,
csz_reset,
csz_destroy,
NULL /* csz_change */,
 
csz_dump,
};
 
 
#ifdef MODULE
int init_module(void)
{
return register_qdisc(&csz_qdisc_ops);
}
 
void cleanup_module(void)
{
unregister_qdisc(&csz_qdisc_ops);
}
#endif
MODULE_LICENSE("GPL");
