/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the Netfilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Version:     $Id: ip_vs_conn.c,v 1.1.1.1 2004-04-15 01:14:00 phoenix Exp $
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Peter Kese <peter.kese@ijs.si>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
 * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
 *
 * Changes:
 *
 */
 
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/ip.h>
#include <linux/tcp.h>                  /* for tcphdr */
#include <linux/in.h>
#include <linux/proc_fs.h>              /* for proc_net_* */
#include <asm/softirq.h>                /* for local_bh_* */
#include <net/ip.h>
#include <net/tcp.h>                    /* for csum_tcpudp_magic */
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>                  /* for ip_route_output */
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/jhash.h>
#include <linux/random.h>
 
#include <net/ip_vs.h>
 
 
/*
 *  Connection hash table: for input and output packet lookups of IPVS
 */
static struct list_head *ip_vs_conn_tab;
 
/* SLAB cache for IPVS connections */
static kmem_cache_t *ip_vs_conn_cachep;
 
/* counter for current IPVS connections */
static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
 
/* counter for no-client-port connections */
static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
 
/* random value for IPVS connection hash */
static unsigned int ip_vs_conn_rnd;
 
/*
 *  Fine locking granularity for big connection hash table
 */
#define CT_LOCKARRAY_BITS  4
#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)
 
struct ip_vs_aligned_lock
{
	rwlock_t	l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
/* lock array for conn table */
struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
 
static inline void ct_read_lock(unsigned key)
{
	read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
 
static inline void ct_read_unlock(unsigned key)
{
	read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
 
static inline void ct_write_lock(unsigned key)
{
	write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
 
static inline void ct_write_unlock(unsigned key)
{
	write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
 
static inline void ct_read_lock_bh(unsigned key)
{
	read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
 
static inline void ct_read_unlock_bh(unsigned key)
{
	read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
 
static inline void ct_write_lock_bh(unsigned key)
{
	write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
 
static inline void ct_write_unlock_bh(unsigned key)
{
	write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
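
/*
 * As a standalone illustration of the lock striping above (a userspace
 * sketch with pthreads, not part of the original file; all names are
 * invented), each key is masked down to one of 16 locks, so independent
 * buckets rarely contend while the lock array stays small and
 * cache-friendly. Guarded out, following this file's #if 0 convention.
 */
#if 0
#include <pthread.h>

#define DEMO_LOCKARRAY_BITS  4
#define DEMO_LOCKARRAY_SIZE  (1<<DEMO_LOCKARRAY_BITS)
#define DEMO_LOCKARRAY_MASK  (DEMO_LOCKARRAY_SIZE-1)

static pthread_rwlock_t demo_lock_array[DEMO_LOCKARRAY_SIZE];

static void demo_locks_init(void)
{
	int i;

	for (i = 0; i < DEMO_LOCKARRAY_SIZE; i++)
		pthread_rwlock_init(&demo_lock_array[i], NULL);
}

static void demo_read_lock(unsigned key)
{
	/* many hash buckets share one of the 16 striped locks */
	pthread_rwlock_rdlock(&demo_lock_array[key & DEMO_LOCKARRAY_MASK]);
}

static void demo_read_unlock(unsigned key)
{
	pthread_rwlock_unlock(&demo_lock_array[key & DEMO_LOCKARRAY_MASK]);
}
#endif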
 
 
/*
 *	Returns hash value for IPVS connection entry
 */
static unsigned
ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port)
{
	return jhash_3words(addr, port, proto, ip_vs_conn_rnd)
		& IP_VS_CONN_TAB_MASK;
}
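
/*
 * Sketch of the bucket selection above (userspace; demo_mix3() is a
 * simple stand-in for the kernel's jhash_3words(), and 12 table bits
 * are assumed for the example): three words plus a boot-time random
 * seed are mixed, then masked down to a power-of-two table index.
 */
#if 0
#define DEMO_TAB_BITS  12
#define DEMO_TAB_SIZE  (1<<DEMO_TAB_BITS)
#define DEMO_TAB_MASK  (DEMO_TAB_SIZE-1)

static unsigned demo_mix3(unsigned a, unsigned b, unsigned c, unsigned seed)
{
	unsigned h = seed;

	/* multiplicative mixing; the real code uses jhash_3words() */
	h = h*2654435761u + a;
	h = h*2654435761u + b;
	h = h*2654435761u + c;
	h ^= h >> 16;
	return h;
}

static unsigned demo_conn_hashkey(unsigned proto, unsigned addr,
				  unsigned short port, unsigned seed)
{
	return demo_mix3(addr, port, proto, seed) & DEMO_TAB_MASK;
}
#endif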
 
 
/*
 *	Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
 *	returns bool success.
 */
static int ip_vs_conn_hash(struct ip_vs_conn *cp)
{
	unsigned hash;
	int ret;
 
	/* Hash by protocol, client address and port */
	hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
 
	ct_write_lock(hash);
 
	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
		list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
		cp->flags |= IP_VS_CONN_F_HASHED;
		atomic_inc(&cp->refcnt);
		ret = 1;
	} else {
		IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
			  "called from %p\n", __builtin_return_address(0));
		ret = 0;
	}
 
	ct_write_unlock(hash);
 
	return ret;
}
 
 
/*
 *	UNhashes ip_vs_conn from ip_vs_conn_tab.
 *	returns bool success.
 */
static int ip_vs_conn_unhash(struct ip_vs_conn *cp)
{
	unsigned hash;
	int ret;
 
	/* unhash it and decrease its reference counter */
	hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
	ct_write_lock(hash);
 
	if (cp->flags & IP_VS_CONN_F_HASHED) {
		list_del(&cp->c_list);
		cp->flags &= ~IP_VS_CONN_F_HASHED;
		atomic_dec(&cp->refcnt);
		ret = 1;
	} else
		ret = 0;
 
	ct_write_unlock(hash);
 
	return ret;
}
 
 
/*
 *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
 *  Called for pkts coming from OUTside-to-INside.
 *	s_addr, s_port: pkt source address (foreign host)
 *	d_addr, d_port: pkt dest address (load balancer)
 */
static inline struct ip_vs_conn *__ip_vs_conn_in_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
	unsigned hash;
	struct ip_vs_conn *cp;
	struct list_head *l,*e;
 
	hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
	l = &ip_vs_conn_tab[hash];
 
	ct_read_lock(hash);
 
	for (e=l->next; e!=l; e=e->next) {
		cp = list_entry(e, struct ip_vs_conn, c_list);
		if (s_addr==cp->caddr && s_port==cp->cport &&
		    d_port==cp->vport && d_addr==cp->vaddr &&
		    protocol==cp->protocol) {
			/* HIT */
			atomic_inc(&cp->refcnt);
			ct_read_unlock(hash);
			return cp;
		}
	}
 
	ct_read_unlock(hash);
 
	return NULL;
}
 
struct ip_vs_conn *ip_vs_conn_in_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
	struct ip_vs_conn *cp;
 
	cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
		cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
 
	IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
		  ip_vs_proto_name(protocol),
		  NIPQUAD(s_addr), ntohs(s_port),
		  NIPQUAD(d_addr), ntohs(d_port),
		  cp?"hit":"not hit");
 
	return cp;
}
 
 
/*
 *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
 *  Called for pkts coming from inside-to-OUTside.
 *	s_addr, s_port: pkt source address (inside host)
 *	d_addr, d_port: pkt dest address (foreign host)
 */
struct ip_vs_conn *ip_vs_conn_out_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
	unsigned hash;
	struct ip_vs_conn *cp, *ret=NULL;
	struct list_head *l,*e;
 
	/*
	 *	Check for "full" addressed entries
	 */
	hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
	l = &ip_vs_conn_tab[hash];
 
	ct_read_lock(hash);
 
	for (e=l->next; e!=l; e=e->next) {
		cp = list_entry(e, struct ip_vs_conn, c_list);
		if (d_addr == cp->caddr && d_port == cp->cport &&
		    s_port == cp->dport && s_addr == cp->daddr &&
		    protocol == cp->protocol) {
			/* HIT */
			atomic_inc(&cp->refcnt);
			ret = cp;
			break;
		}
	}
 
	ct_read_unlock(hash);
 
	IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
		  ip_vs_proto_name(protocol),
		  NIPQUAD(s_addr), ntohs(s_port),
		  NIPQUAD(d_addr), ntohs(d_port),
		  ret?"hit":"not hit");
 
	return ret;
}
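
/*
 * The two lookups above are open-coded walks over an intrusive doubly
 * linked list: the link lives inside the connection and list_entry()
 * recovers the surrounding struct. A minimal userspace rendition of the
 * same pattern (all names invented for the sketch):
 */
#if 0
struct demo_list_head {
	struct demo_list_head *next, *prev;
};

struct demo_conn {
	struct demo_list_head c_list;	/* embedded link, as in ip_vs_conn */
	unsigned addr;
	unsigned short port;
};

/* list_entry(): step back from the embedded link to the container */
#define demo_entry(ptr) \
	((struct demo_conn *)((char *)(ptr) - \
	 (unsigned long)(&((struct demo_conn *)0)->c_list)))

static struct demo_conn *demo_find(struct demo_list_head *head,
				   unsigned addr, unsigned short port)
{
	struct demo_list_head *e;

	for (e = head->next; e != head; e = e->next) {
		struct demo_conn *cp = demo_entry(e);

		if (cp->addr == addr && cp->port == port)
			return cp;		/* HIT */
	}
	return (struct demo_conn *)0;		/* not hit */
}
#endif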
 
 
/*
 *      Put back the conn and restart its timer with its timeout
 */
void ip_vs_conn_put(struct ip_vs_conn *cp)
{
	/* reset the timer so the conn expires after its timeout */
	mod_timer(&cp->timer, jiffies+cp->timeout);
 
	__ip_vs_conn_put(cp);
}
 
 
/*
 *	Timeout table[state]
 */
struct ip_vs_timeout_table vs_timeout_table = {
	ATOMIC_INIT(0),	/* refcnt */
	0,		/* scale  */
	{
		[IP_VS_S_NONE]          =	30*60*HZ,
		[IP_VS_S_ESTABLISHED]	=	15*60*HZ,
		[IP_VS_S_SYN_SENT]	=	2*60*HZ,
		[IP_VS_S_SYN_RECV]	=	1*60*HZ,
		[IP_VS_S_FIN_WAIT]	=	2*60*HZ,
		[IP_VS_S_TIME_WAIT]	=	2*60*HZ,
		[IP_VS_S_CLOSE]         =	10*HZ,
		[IP_VS_S_CLOSE_WAIT]	=	60*HZ,
		[IP_VS_S_LAST_ACK]	=	30*HZ,
		[IP_VS_S_LISTEN]	=	2*60*HZ,
		[IP_VS_S_SYNACK]	=	120*HZ,
		[IP_VS_S_UDP]		=	5*60*HZ,
		[IP_VS_S_ICMP]          =	1*60*HZ,
		[IP_VS_S_LAST]          =	2*HZ,
	},	/* timeout */
};
 
 
struct ip_vs_timeout_table vs_timeout_table_dos = {
	ATOMIC_INIT(0),	/* refcnt */
	0,		/* scale  */
	{
		[IP_VS_S_NONE]          =	15*60*HZ,
		[IP_VS_S_ESTABLISHED]	=	8*60*HZ,
		[IP_VS_S_SYN_SENT]	=	60*HZ,
		[IP_VS_S_SYN_RECV]	=	10*HZ,
		[IP_VS_S_FIN_WAIT]	=	60*HZ,
		[IP_VS_S_TIME_WAIT]	=	60*HZ,
		[IP_VS_S_CLOSE]         =	10*HZ,
		[IP_VS_S_CLOSE_WAIT]	=	60*HZ,
		[IP_VS_S_LAST_ACK]	=	30*HZ,
		[IP_VS_S_LISTEN]	=	2*60*HZ,
		[IP_VS_S_SYNACK]	=	100*HZ,
		[IP_VS_S_UDP]		=	3*60*HZ,
		[IP_VS_S_ICMP]          =	1*60*HZ,
		[IP_VS_S_LAST]          =	2*HZ,
	},	/* timeout */
};
 
 
/*
 *	Timeout table to use for the VS entries
 *	If NULL we use the default table (vs_timeout_table).
 *	Under flood attack we switch to vs_timeout_table_dos
 */
 
static struct ip_vs_timeout_table *ip_vs_timeout_table = &vs_timeout_table;
 
static const char * state_name_table[IP_VS_S_LAST+1] = {
	[IP_VS_S_NONE]          =	"NONE",
	[IP_VS_S_ESTABLISHED]	=	"ESTABLISHED",
	[IP_VS_S_SYN_SENT]	=	"SYN_SENT",
	[IP_VS_S_SYN_RECV]	=	"SYN_RECV",
	[IP_VS_S_FIN_WAIT]	=	"FIN_WAIT",
	[IP_VS_S_TIME_WAIT]	=	"TIME_WAIT",
	[IP_VS_S_CLOSE]         =	"CLOSE",
	[IP_VS_S_CLOSE_WAIT]	=	"CLOSE_WAIT",
	[IP_VS_S_LAST_ACK]	=	"LAST_ACK",
	[IP_VS_S_LISTEN]	=	"LISTEN",
	[IP_VS_S_SYNACK]	=	"SYNACK",
	[IP_VS_S_UDP]		=	"UDP",
	[IP_VS_S_ICMP]          =	"ICMP",
	[IP_VS_S_LAST]          =	"BUG!",
};
 
#define sNO IP_VS_S_NONE
#define sES IP_VS_S_ESTABLISHED
#define sSS IP_VS_S_SYN_SENT
#define sSR IP_VS_S_SYN_RECV
#define sFW IP_VS_S_FIN_WAIT
#define sTW IP_VS_S_TIME_WAIT
#define sCL IP_VS_S_CLOSE
#define sCW IP_VS_S_CLOSE_WAIT
#define sLA IP_VS_S_LAST_ACK
#define sLI IP_VS_S_LISTEN
#define sSA IP_VS_S_SYNACK
 
struct vs_tcp_states_t {
	int next_state[IP_VS_S_LAST];	/* should be _LAST_TCP */
};
 
const char * ip_vs_state_name(int state)
{
	if (state >= IP_VS_S_LAST)
		return "ERR!";
	return state_name_table[state] ? state_name_table[state] : "?";
}
 
static struct vs_tcp_states_t vs_tcp_states [] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
 
/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
 
/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
 
static struct vs_tcp_states_t vs_tcp_states_dos [] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 
/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
 
/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
 
static struct vs_tcp_states_t *ip_vs_state_table = vs_tcp_states;
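
/*
 * How the tables above are consumed (see vs_tcp_state() below): the TCP
 * flags plus the traffic direction pick one of the 12 rows via
 * vs_tcp_state_idx(), and the connection's current state picks the
 * column:
 *
 *	new_state = ip_vs_state_table[idx].next_state[cp->state];
 *
 * For example, a SYN seen on INPUT (row 0) while the connection is in
 * sNO moves it to sSR (SYN_RECV).
 */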
 
void ip_vs_secure_tcp_set(int on)
{
	if (on) {
		ip_vs_state_table = vs_tcp_states_dos;
		ip_vs_timeout_table = &vs_timeout_table_dos;
	} else {
		ip_vs_state_table = vs_tcp_states;
		ip_vs_timeout_table = &vs_timeout_table;
	}
}
 
 
static inline int vs_tcp_state_idx(struct tcphdr *th, int state_off)
{
	/*
	 *	[0-3]: input states, [4-7]: output states, [8-11]: input-only states.
	 */
	if (th->rst)
		return state_off+3;
	if (th->syn)
		return state_off+0;
	if (th->fin)
		return state_off+1;
	if (th->ack)
		return state_off+2;
	return -1;
}
 
 
static inline int vs_set_state_timeout(struct ip_vs_conn *cp, int state)
{
	struct ip_vs_timeout_table *vstim = cp->timeout_table;
 
	/*
	 *	Use the default timeout table if there is none specific to this entry
	 */
	if (!vstim)
		vstim = &vs_timeout_table;
 
	cp->timeout = vstim->timeout[cp->state=state];
 
	if (vstim->scale) {
		int scale = vstim->scale;
 
		if (scale<0)
			cp->timeout >>= -scale;
		else if (scale > 0)
			cp->timeout <<= scale;
	}
 
	return state;
}
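
/*
 * Worked example of the scale handling above (standalone sketch, values
 * invented): a negative scale right-shifts (halves per step) and a
 * positive scale left-shifts (doubles per step) the per-state timeout.
 */
#if 0
static unsigned long demo_scaled_timeout(unsigned long base, int scale)
{
	if (scale < 0)
		return base >> -scale;	/* e.g. 15*60*HZ, -1 -> 7.5 min */
	else if (scale > 0)
		return base << scale;	/* e.g. 15*60*HZ, +1 -> 30 min */
	return base;
}
#endif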
 
 
static inline int
vs_tcp_state(struct ip_vs_conn *cp, int state_off, struct tcphdr *th)
{
	int state_idx;
	int new_state = IP_VS_S_CLOSE;
 
	/*
	 *    Update the state offset to INPUT_ONLY if necessary,
	 *    or clear the NO_OUTPUT flag if an output packet is detected
	 */
	if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
		if (state_off == VS_STATE_OUTPUT)
			cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
		else
			state_off = VS_STATE_INPUT_ONLY;
	}
 
	if ((state_idx = vs_tcp_state_idx(th, state_off)) < 0) {
		IP_VS_DBG(8, "vs_tcp_state_idx(%d)=%d!!!\n",
			  state_off, state_idx);
		goto tcp_state_out;
	}
 
	new_state = ip_vs_state_table[state_idx].next_state[cp->state];
 
  tcp_state_out:
	if (new_state != cp->state) {
		struct ip_vs_dest *dest = cp->dest;
 
		IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
			  "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
			  ip_vs_proto_name(cp->protocol),
			  (state_off==VS_STATE_OUTPUT)?"output ":"input ",
			  th->syn? 'S' : '.',
			  th->fin? 'F' : '.',
			  th->ack? 'A' : '.',
			  th->rst? 'R' : '.',
			  NIPQUAD(cp->daddr), ntohs(cp->dport),
			  NIPQUAD(cp->caddr), ntohs(cp->cport),
			  ip_vs_state_name(cp->state),
			  ip_vs_state_name(new_state),
			  atomic_read(&cp->refcnt));
		if (dest) {
			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
			    (new_state != IP_VS_S_ESTABLISHED)) {
				atomic_dec(&dest->activeconns);
				atomic_inc(&dest->inactconns);
				cp->flags |= IP_VS_CONN_F_INACTIVE;
			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
				   (new_state == IP_VS_S_ESTABLISHED)) {
				atomic_inc(&dest->activeconns);
				atomic_dec(&dest->inactconns);
				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
			}
		}
	}
 
	return vs_set_state_timeout(cp, new_state);
}
 
 
/*
 *	Handle state transitions
 */
int ip_vs_set_state(struct ip_vs_conn *cp,
		    int state_off, struct iphdr *iph, void *tp)
{
	int ret;
 
	spin_lock(&cp->lock);
	switch (iph->protocol) {
	case IPPROTO_TCP:
		ret = vs_tcp_state(cp, state_off, tp);
		break;
	case IPPROTO_UDP:
		ret = vs_set_state_timeout(cp, IP_VS_S_UDP);
		break;
	case IPPROTO_ICMP:
		ret = vs_set_state_timeout(cp, IP_VS_S_ICMP);
		break;
	default:
		ret = -1;
	}
	spin_unlock(&cp->lock);
 
	return ret;
}
 
 
/*
 *	Set LISTEN timeout. (ip_vs_conn_put will set up the timer)
 */
int ip_vs_conn_listen(struct ip_vs_conn *cp)
{
	vs_set_state_timeout(cp, IP_VS_S_LISTEN);
	return cp->timeout;
}
 
 
/*
 *      Bypass transmitter
 *      Lets packets bypass the destination when the destination is not
 *      available; it should only be used in a transparent cache cluster.
 */
static int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
	struct rtable *rt;			/* Route to the other host */
	struct iphdr  *iph = skb->nh.iph;
	u8     tos = iph->tos;
	int    mtu;
 
	EnterFunction(10);
 
	if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(tos), 0)) {
		IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
			     "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
		goto tx_error_icmp;
	}
 
	/* MTU checking */
	mtu = rt->u.dst.pmtu;
	if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
		ip_rt_put(rt);
		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
		IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
		goto tx_error;
	}
 
	/* update checksum because skb might be defragmented */
	ip_send_check(iph);
 
	if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
		if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
			ip_rt_put(rt);
			IP_VS_ERR_RL("ip_vs_bypass_xmit(): no memory\n");
			goto tx_error;
		}
	}
 
	/* drop old route */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;
 
#ifdef CONFIG_NETFILTER_DEBUG
	skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
	skb->nfcache |= NFC_IPVS_PROPERTY;
	ip_send(skb);
 
	LeaveFunction(10);
	return NF_STOLEN;
 
  tx_error_icmp:
	dst_link_failure(skb);
  tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
 
 
/*
 *      NULL transmitter (do nothing except return NF_ACCEPT)
 */
static int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
	return NF_ACCEPT;
}
 
 
/*
 *      NAT transmitter (only for outside-to-inside nat forwarding)
 */
static int ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
	struct rtable *rt;		/* Route to the other host */
	struct iphdr  *iph;
	union ip_vs_tphdr h;
	int ihl;
	unsigned short size;
	int mtu;
 
	EnterFunction(10);
 
	/*
	 * If the conn has an ip_vs_app helper, the helper may change the
	 * payload, so full checksum checking and recalculation are needed.
	 * If not, only the header (IP address and port number) is changed,
	 * so a fast incremental checksum update is enough, and the
	 * destination host can do the final checksum checking.
	 */
 
	if (cp->app && skb_is_nonlinear(skb)
	    && skb_linearize(skb, GFP_ATOMIC) != 0)
		return NF_DROP;
 
	iph = skb->nh.iph;
	ihl = iph->ihl << 2;
	h.raw = (char*) iph + ihl;
	size = ntohs(iph->tot_len) - ihl;
 
	/* do TCP/UDP checksum checking if it has application helper */
	if (cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) {
		switch (skb->ip_summed) {
		case CHECKSUM_NONE:
			skb->csum = csum_partial(h.raw, size, 0);
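			/* fall through: verify the checksum just computed */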
 
		case CHECKSUM_HW:
			if (csum_tcpudp_magic(iph->saddr, iph->daddr, size,
					      iph->protocol, skb->csum)) {
				IP_VS_DBG_RL("Incoming failed %s checksum "
					     "from %d.%d.%d.%d (size=%d)!\n",
					     ip_vs_proto_name(iph->protocol),
					     NIPQUAD(iph->saddr),
					     size);
				goto tx_error;
			}
			break;
		default:
			/* CHECKSUM_UNNECESSARY */
			break;
		}
	}
 
	/*
	 *  Check if it is a no_cport connection ...
	 */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
		if (ip_vs_conn_unhash(cp)) {
			spin_lock(&cp->lock);
			if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
				atomic_dec(&ip_vs_conn_no_cport_cnt);
				cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
				cp->cport = h.portp[0];
				IP_VS_DBG(10, "filled cport=%d\n", ntohs(cp->cport));
			}
			spin_unlock(&cp->lock);
 
			/* hash it back on the now-filled cport */
			ip_vs_conn_hash(cp);
		}
	}
 
	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
		goto tx_error_icmp;
 
	/* MTU checking */
	mtu = rt->u.dst.pmtu;
	if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
		ip_rt_put(rt);
		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
		IP_VS_DBG_RL("ip_vs_nat_xmit(): frag needed\n");
		goto tx_error;
	}
 
	/* drop old route */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;
 
	/* copy-on-write the packet before mangling it */
	if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len, &iph, &h.raw))
		return NF_DROP;
 
	/* mangle the packet */
	iph->daddr = cp->daddr;
	h.portp[1] = cp->dport;
 
	/*
	 *	Attempt the ip_vs_app call;
	 *	it will fix ip_vs_conn and the TCP ack_seq stuff
	 */
	if (ip_vs_app_pkt_in(cp, skb) != 0) {
		/* skb data has probably changed, update pointers */
		iph = skb->nh.iph;
		h.raw = (char*) iph + ihl;
		size = skb->len - ihl;
	}
 
	/*
	 *	Adjust TCP/UDP checksums
	 */
	if (!cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) {
		/* Only port and addr are changed, do fast csum update */
		ip_vs_fast_check_update(&h, cp->vaddr, cp->daddr,
					cp->vport, cp->dport, iph->protocol);
		if (skb->ip_summed == CHECKSUM_HW)
			skb->ip_summed = CHECKSUM_NONE;
	} else {
		/* full checksum calculation */
		switch (iph->protocol) {
		case IPPROTO_TCP:
			h.th->check = 0;
			h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
							size, iph->protocol,
							csum_partial(h.raw, size, 0));
			break;
		case IPPROTO_UDP:
			h.uh->check = 0;
			h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
							size, iph->protocol,
							csum_partial(h.raw, size, 0));
			if (h.uh->check == 0)
				h.uh->check = 0xFFFF;
			break;
		}
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}
	ip_send_check(iph);
 
	IP_VS_DBG(10, "NAT to %u.%u.%u.%u:%d\n",
		  NIPQUAD(iph->daddr), ntohs(h.portp[1]));
 
	/* FIXME: when the application helper enlarges the packet and the
	   length becomes larger than the MTU of the outgoing device, there
	   will still be an MTU problem. */
 
#ifdef CONFIG_NETFILTER_DEBUG
	skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
	skb->nfcache |= NFC_IPVS_PROPERTY;
	ip_send(skb);
 
	LeaveFunction(10);
	return NF_STOLEN;
 
  tx_error_icmp:
	dst_link_failure(skb);
  tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
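
/*
 * For reference, a minimal userspace version of the ones'-complement
 * Internet checksum that the csum_partial()/csum_tcpudp_magic() calls
 * in ip_vs_nat_xmit() are built on (the pseudo-header handling of
 * csum_tcpudp_magic() is omitted in this sketch):
 */
#if 0
static unsigned short demo_inet_csum(const unsigned char *data, int len)
{
	unsigned long sum = 0;

	while (len > 1) {
		sum += (data[0] << 8) | data[1];	/* 16-bit words */
		data += 2;
		len -= 2;
	}
	if (len == 1)
		sum += data[0] << 8;		/* pad odd trailing byte */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);	/* fold carries */
	return (unsigned short)~sum;
}
#endif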
 
 
/*
 *   IP Tunneling transmitter
 *
 *   This function encapsulates the packet in a new IP packet, its
 *   destination will be set to cp->daddr. Most code of this function
 *   is taken from ipip.c.
 *
 *   It is used in VS/TUN cluster. The load balancer selects a real
 *   server from a cluster based on a scheduling algorithm,
 *   encapsulates the request packet and forwards it to the selected
 *   server. For example, all real servers are configured with
 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 *   the encapsulated packet, it will decapsulate the packet, process
 *   the request and return the response packets directly to the client
 *   without passing through the load balancer. This can greatly increase
 *   the scalability of a virtual server.
 */
static int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr  *old_iph = skb->nh.iph;
	u8     tos = old_iph->tos;
	u16    df = old_iph->frag_off;
	struct iphdr  *iph;			/* Our new IP header */
	int    max_headroom;			/* The extra header space needed */
	int    mtu;
 
	EnterFunction(10);
 
	if (skb->protocol != __constant_htons(ETH_P_IP)) {
		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
			     "ETH_P_IP: %d, skb protocol: %d\n",
			     __constant_htons(ETH_P_IP), skb->protocol);
		goto tx_error;
	}
 
	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
		goto tx_error_icmp;
 
	tdev = rt->u.dst.dev;
 
	mtu = rt->u.dst.pmtu - sizeof(struct iphdr);
	if (mtu < 68) {
		ip_rt_put(rt);
		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
		goto tx_error;
	}
	if (skb->dst && mtu < skb->dst->pmtu)
		skb->dst->pmtu = mtu;
 
	df |= (old_iph->frag_off&__constant_htons(IP_DF));
 
	if ((old_iph->frag_off&__constant_htons(IP_DF))
	    && mtu < ntohs(old_iph->tot_len)) {
		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
		ip_rt_put(rt);
		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
		goto tx_error;
	}
 
	/* update checksum because skb might be defragmented */
	ip_send_check(old_iph);
 
	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr));
 
	if (skb_headroom(skb) < max_headroom
	    || skb_cloned(skb) || skb_shared(skb)) {
		struct sk_buff *new_skb =
			skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
			return NF_DROP;
		}
		kfree_skb(skb);
		skb = new_skb;
		old_iph = skb->nh.iph;
	}
 
	skb->h.raw = skb->nh.raw;
	skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 
	/* drop old route */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;
 
	/*
	 *	Push down and install the IPIP header.
	 */
	iph			=	skb->nh.iph;
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr)>>2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_IPIP;
	iph->tos		=	tos;
	iph->daddr		=	rt->rt_dst;
	iph->saddr		=	rt->rt_src;
	iph->ttl		=	old_iph->ttl;
	iph->tot_len		=	htons(skb->len);
	ip_select_ident(iph, &rt->u.dst, NULL);
	ip_send_check(iph);
 
	skb->ip_summed = CHECKSUM_NONE;
#ifdef CONFIG_NETFILTER_DEBUG
	skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
	skb->nfcache |= NFC_IPVS_PROPERTY;
	ip_send(skb);
 
	LeaveFunction(10);
 
	return NF_STOLEN;
 
  tx_error_icmp:
	dst_link_failure(skb);
  tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
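
/*
 * The MTU arithmetic above, reduced to plain C (userspace sketch;
 * demo_iphdr only mirrors the 20-byte base IPv4 header size): every
 * tunnelled packet grows by one outer header, so the usable path MTU
 * shrinks by sizeof(struct iphdr), and an oversized DF packet must be
 * bounced with ICMP_FRAG_NEEDED instead of being fragmented.
 */
#if 0
struct demo_iphdr {
	unsigned char  version_ihl;	/* 0x45: IPv4, 5 32-bit words */
	unsigned char  tos;
	unsigned short tot_len;
	unsigned short id;
	unsigned short frag_off;
	unsigned char  ttl;
	unsigned char  protocol;	/* 4 == IPPROTO_IPIP for VS/TUN */
	unsigned short check;
	unsigned int   saddr;		/* load balancer */
	unsigned int   daddr;		/* chosen real server */
};

static int demo_tunnel_fits(int pkt_len, int path_mtu, int df_set)
{
	int mtu = path_mtu - (int)sizeof(struct demo_iphdr);

	return !(df_set && pkt_len > mtu);	/* 0 => send ICMP, drop */
}
#endif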
 
 
/*
 *      Direct Routing transmitter
 */
static int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
	struct rtable *rt;			/* Route to the other host */
	struct iphdr  *iph = skb->nh.iph;
	int    mtu;
 
	EnterFunction(10);
 
	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
		goto tx_error_icmp;
 
	/* MTU checking */
	mtu = rt->u.dst.pmtu;
	if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
		ip_rt_put(rt);
		IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
		goto tx_error;
	}
 
	/* update checksum because skb might be defragmented */
	ip_send_check(iph);
 
	if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
		if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
			ip_rt_put(rt);
			IP_VS_ERR_RL("ip_vs_dr_xmit(): no memory\n");
			goto tx_error;
		}
	}
 
	/* drop old route */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;
 
#ifdef CONFIG_NETFILTER_DEBUG
	skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
	skb->nfcache |= NFC_IPVS_PROPERTY;
	ip_send(skb);
 
#if 0000
	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		do_ip_send);
#endif
	LeaveFunction(10);
	return NF_STOLEN;
 
  tx_error_icmp:
	dst_link_failure(skb);
  tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
 
 
/*
 *  Bind a connection entry with the corresponding packet_xmit.
 *  Called by ip_vs_conn_new.
 */
static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
{
	switch (IP_VS_FWD_METHOD(cp)) {
	case IP_VS_CONN_F_MASQ:
		cp->packet_xmit = ip_vs_nat_xmit;
		break;
 
	case IP_VS_CONN_F_TUNNEL:
		cp->packet_xmit = ip_vs_tunnel_xmit;
		break;
 
	case IP_VS_CONN_F_DROUTE:
		cp->packet_xmit = ip_vs_dr_xmit;
		break;
 
	case IP_VS_CONN_F_LOCALNODE:
		cp->packet_xmit = ip_vs_null_xmit;
		break;
 
	case IP_VS_CONN_F_BYPASS:
		cp->packet_xmit = ip_vs_bypass_xmit;
		break;
	}
}
 
 
/*
 *  Bind a connection entry with a virtual service destination
 *  Called just after a new connection entry is created.
 */
static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
	/* if dest is NULL, then return directly */
	if (!dest)
		return;
 
	/* Increase the refcnt counter of the dest */
	atomic_inc(&dest->refcnt);
 
	/* Bind with the destination and its corresponding transmitter */
	cp->flags |= atomic_read(&dest->conn_flags);
	cp->dest = dest;
 
	IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
		  "d:%u.%u.%u.%u:%d fwd:%c s:%s flg:%X cnt:%d destcnt:%d\n",
		  ip_vs_proto_name(cp->protocol),
		  NIPQUAD(cp->caddr), ntohs(cp->cport),
		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
		  NIPQUAD(cp->daddr), ntohs(cp->dport),
		  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
		  cp->flags, atomic_read(&cp->refcnt),
		  atomic_read(&dest->refcnt));
}
 
 
/*
 *  Unbind a connection entry with its VS destination
 *  Called by the ip_vs_conn_expire function.
 */
static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
{
	struct ip_vs_dest *dest = cp->dest;
 
	/* if dest is NULL, then return directly */
	if (!dest)
		return;
 
	IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d "
		  "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d fwd:%c "
		  "s:%s flg:%X cnt:%d destcnt:%d",
		  ip_vs_proto_name(cp->protocol),
		  NIPQUAD(cp->caddr), ntohs(cp->cport),
		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
		  NIPQUAD(cp->daddr), ntohs(cp->dport),
		  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
		  cp->flags, atomic_read(&cp->refcnt),
		  atomic_read(&dest->refcnt));
 
	/*
	 * Decrease the inactconns or activeconns counter
	 * if it is not a connection template ((cp->cport!=0)
	 *   || (cp->flags & IP_VS_CONN_F_NO_CPORT)).
	 */
	if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
		if (cp->flags & IP_VS_CONN_F_INACTIVE) {
			atomic_dec(&dest->inactconns);
		} else {
			atomic_dec(&dest->activeconns);
		}
	}
 
	/*
	 * Simply decrease the refcnt of the dest, because the
	 * dest will be either in service's destination list
	 * or in the trash.
	 */
	atomic_dec(&dest->refcnt);
}
 
 
/*
 *  Checking if the destination of a connection template is available.
 *  If available, return 1, otherwise invalidate this connection
 *  template and return 0.
 */
int ip_vs_check_template(struct ip_vs_conn *ct)
{
	struct ip_vs_dest *dest = ct->dest;
 
	/*
	 * Checking the dest server status.
	 */
	if ((dest == NULL) ||
	    !(dest->flags & IP_VS_DEST_F_AVAILABLE)) {
		IP_VS_DBG(9, "check_template: dest not available for "
			  "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
			  "-> d:%u.%u.%u.%u:%d\n",
			  ip_vs_proto_name(ct->protocol),
			  NIPQUAD(ct->caddr), ntohs(ct->cport),
			  NIPQUAD(ct->vaddr), ntohs(ct->vport),
			  NIPQUAD(ct->daddr), ntohs(ct->dport));
 
		/*
		 * Invalidate the connection template
		 */
		if (ct->cport) {
			if (ip_vs_conn_unhash(ct)) {
				ct->dport = 65535;
				ct->vport = 65535;
				ct->cport = 0;
				ip_vs_conn_hash(ct);
			}
		}
 
		/*
		 * Simply decrease the refcnt of the template,
		 * don't restart its timer.
		 */
		atomic_dec(&ct->refcnt);
		return 0;
	}
	return 1;
}
 
 
static inline void
ip_vs_timeout_attach(struct ip_vs_conn *cp, struct ip_vs_timeout_table *vstim)
{
	atomic_inc(&vstim->refcnt);
	cp->timeout_table = vstim;
}
 
static inline void ip_vs_timeout_detach(struct ip_vs_conn *cp)
{
	struct ip_vs_timeout_table *vstim = cp->timeout_table;
 
	if (!vstim)
		return;
	cp->timeout_table = NULL;
	atomic_dec(&vstim->refcnt);
}
 
 
static void ip_vs_conn_expire(unsigned long data)
{
	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
 
	if (cp->timeout_table)
		cp->timeout = cp->timeout_table->timeout[IP_VS_S_TIME_WAIT];
	else
		cp->timeout = vs_timeout_table.timeout[IP_VS_S_TIME_WAIT];
 
	/*
	 *	hey, I'm using it
	 */
	atomic_inc(&cp->refcnt);
 
	/*
	 *	do I control anybody?
	 */
	if (atomic_read(&cp->n_control))
		goto expire_later;
 
	/*
	 *	unhash it if it is hashed in the conn table
	 */
	if (!ip_vs_conn_unhash(cp))
		goto expire_later;
 
	/*
	 *	refcnt==1 implies I'm the only referrer
	 */
	if (likely(atomic_read(&cp->refcnt) == 1)) {
		/* make sure that there is no timer on it now */
		if (timer_pending(&cp->timer))
			del_timer(&cp->timer);
 
		/* does anybody control me? */
		if (cp->control)
			ip_vs_control_del(cp);
 
		ip_vs_unbind_dest(cp);
		ip_vs_unbind_app(cp);
		ip_vs_timeout_detach(cp);
		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
			atomic_dec(&ip_vs_conn_no_cport_cnt);
		atomic_dec(&ip_vs_conn_count);
 
		kmem_cache_free(ip_vs_conn_cachep, cp);
		return;
	}
 
	/* hash it back to the table */
	ip_vs_conn_hash(cp);
 
  expire_later:
	IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
		  atomic_read(&cp->refcnt)-1,
		  atomic_read(&cp->n_control));
 
	ip_vs_conn_put(cp);
}
 
 
void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
	cp->timeout = 0;
	mod_timer(&cp->timer, jiffies);
	__ip_vs_conn_put(cp);
}
 
/*
 *  Create a new connection entry and hash it into the ip_vs_conn_tab.
 */
struct ip_vs_conn *
ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
	       __u32 daddr, __u16 dport, unsigned flags,
	       struct ip_vs_dest *dest)
{
	struct ip_vs_conn *cp;
 
	cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
	if (cp == NULL) {
		IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
		return NULL;
	}
 
	memset(cp, 0, sizeof(*cp));
	INIT_LIST_HEAD(&cp->c_list);
	init_timer(&cp->timer);
	cp->timer.data     = (unsigned long)cp;
	cp->timer.function = ip_vs_conn_expire;
	ip_vs_timeout_attach(cp, ip_vs_timeout_table);
	cp->protocol	   = proto;
	cp->caddr	   = caddr;
	cp->cport	   = cport;
	cp->vaddr	   = vaddr;
	cp->vport	   = vport;
	cp->daddr          = daddr;
	cp->dport          = dport;
	cp->flags	   = flags;
	cp->app_data	   = NULL;
	cp->control	   = NULL;
	cp->lock           = SPIN_LOCK_UNLOCKED;
 
	atomic_set(&cp->n_control, 0);
	atomic_set(&cp->in_pkts, 0);
 
	atomic_inc(&ip_vs_conn_count);
	if (flags & IP_VS_CONN_F_NO_CPORT)
		atomic_inc(&ip_vs_conn_no_cport_cnt);
 
	/* Bind its application helper (only for VS/NAT) if any */
	ip_vs_bind_app(cp);
 
	/* Bind the connection with a destination server */
	ip_vs_bind_dest(cp, dest);
 
	/* Set its state and timeout */
	vs_set_state_timeout(cp, IP_VS_S_NONE);
 
	/* Bind its packet transmitter */
	ip_vs_bind_xmit(cp);
 
	/*
	 * Mark the entry as referenced by the current thread before hashing
	 * it into the table, so that another thread running
	 * ip_vs_random_dropentry cannot drop this entry.
	 */
	atomic_set(&cp->refcnt, 1);
 
	/* Hash it in the ip_vs_conn_tab finally */
	ip_vs_conn_hash(cp);
 
	return cp;
}
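
/*
 * Typical call, roughly as issued from the scheduling path when a new
 * session is mapped to a real server (a sketch; the variable names are
 * borrowed from ip_vs_nat_xmit() above and the dest fields are
 * assumptions):
 *
 *	cp = ip_vs_conn_new(iph->protocol,
 *			    iph->saddr, h.portp[0],	client addr/port
 *			    iph->daddr, h.portp[1],	virtual addr/port
 *			    dest->addr, dest->port,	real server
 *			    0, dest);
 */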
 
 
/*
 *	/proc/net/ip_vs_conn entries
 */
static int
ip_vs_conn_getinfo(char *buffer, char **start, off_t offset, int length)
{
	off_t pos=0;
	int idx, len=0;
	char temp[70];
	struct ip_vs_conn *cp;
	struct list_head *l, *e;
 
	pos = 128;
	if (pos > offset) {
		len += sprintf(buffer+len, "%-127s\n",
			       "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires");
	}
 
	for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
		/*
		 *	The lock is actually only needed in the next loop;
		 *	we are called from user space and must stop bottom halves.
		 */
		ct_read_lock_bh(idx);
 
		l = &ip_vs_conn_tab[idx];
		for (e=l->next; e!=l; e=e->next) {
			cp = list_entry(e, struct ip_vs_conn, c_list);
			pos += 128;
			if (pos <= offset)
				continue;
			sprintf(temp,
				"%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu",
				ip_vs_proto_name(cp->protocol),
				ntohl(cp->caddr), ntohs(cp->cport),
				ntohl(cp->vaddr), ntohs(cp->vport),
				ntohl(cp->daddr), ntohs(cp->dport),
				ip_vs_state_name(cp->state),
				(cp->timer.expires-jiffies)/HZ);
			len += sprintf(buffer+len, "%-127s\n", temp);
			if (pos >= offset+length) {
				ct_read_unlock_bh(idx);
				goto done;
			}
		}
		ct_read_unlock_bh(idx);
	}
 
  done:
	*start = buffer+len-(pos-offset);       /* Start of wanted data */
	len = pos-offset;
	if (len > length)
		len = length;
	if (len < 0)
		len = 0;
	return len;
}
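
/*
 * Note on the arithmetic above: every row, header included, is printed
 * padded to exactly 128 bytes ("%-127s\n"), so pos can be advanced in
 * fixed 128-byte steps and compared against the caller's file offset
 * without formatting rows that fall before the requested window.
 */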
 
 
/*
 *      Randomly drop connection entries before running out of memory
 */
static inline int todrop_entry(struct ip_vs_conn *cp)
{
	/*
	 * The drop rate array needs tuning for real environments.
	 * Called from timer bh only => no locking
	 */
	static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
	static char todrop_counter[9] = {0};
	int i;
 
	/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
	   This will leave enough time for normal connections to get
	   through. */
	if (cp->timeout+jiffies-cp->timer.expires < 60*HZ)
		return 0;
 
	/* Don't drop the entry if its number of incoming packets is
	   outside the range [0, 8] */
	i = atomic_read(&cp->in_pkts);
	if (i > 8 || i < 0) return 0;
 
	if (!todrop_rate[i]) return 0;
	if (--todrop_counter[i] > 0) return 0;
 
	todrop_counter[i] = todrop_rate[i];
	return 1;
}
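
/*
 * Quick userspace check of the counting scheme above (self-contained
 * demo, invented values): with a rate of 8, todrop_entry-style counting
 * drops exactly 1 of every 8 calls.
 */
#if 0
#include <stdio.h>

int main(void)
{
	char rate = 8, counter = 0;
	int call, drops = 0;

	for (call = 0; call < 80; call++) {
		if (--counter > 0)
			continue;		/* spare this one */
		counter = rate;
		drops++;
	}
	printf("%d drops in %d calls\n", drops, call);	/* 10 in 80 */
	return 0;
}
#endif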
 
 
void ip_vs_random_dropentry(void)
{
	int idx;
	struct ip_vs_conn *cp;
	struct list_head *l,*e;
	struct ip_vs_conn *ct;
 
	/*
	 * Randomly scan 1/32 of the whole table every second
	 */
	for (idx=0; idx<(IP_VS_CONN_TAB_SIZE>>5); idx++) {
		unsigned hash = net_random()&IP_VS_CONN_TAB_MASK;
 
		/*
		 *  Lock is actually needed in this loop.
		 */
		ct_write_lock(hash);
 
		l = &ip_vs_conn_tab[hash];
		for (e=l->next; e!=l; e=e->next) {
			cp = list_entry(e, struct ip_vs_conn, c_list);
			if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
				/* connection template */
				continue;
			switch(cp->state) {
			case IP_VS_S_SYN_RECV:
			case IP_VS_S_SYNACK:
				break;
 
			case IP_VS_S_ESTABLISHED:
			case IP_VS_S_UDP:
				if (todrop_entry(cp))
					break;
				continue;
 
			default:
				continue;
			}
 
			/*
			 * Drop the entry, and drop its ct if not referenced
			 */
			atomic_inc(&cp->refcnt);
			ct_write_unlock(hash);
 
			if ((ct = cp->control))
				atomic_inc(&ct->refcnt);
			IP_VS_DBG(4, "del connection\n");
			ip_vs_conn_expire_now(cp);
			if (ct) {
				IP_VS_DBG(4, "del conn template\n");
				ip_vs_conn_expire_now(ct);
			}
			ct_write_lock(hash);
		}
		ct_write_unlock(hash);
	}
}
 
 
/*
 *      Flush all the connection entries in the ip_vs_conn_tab
 */
static void ip_vs_conn_flush(void)
{
	int idx;
	struct ip_vs_conn *cp;
	struct list_head *l,*e;
	struct ip_vs_conn *ct;
 
  flush_again:
	for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
		/*
		 *  Lock is actually needed in this loop.
		 */
		ct_write_lock_bh(idx);
 
		l = &ip_vs_conn_tab[idx];
		for (e=l->next; e!=l; e=e->next) {
			cp = list_entry(e, struct ip_vs_conn, c_list);
			atomic_inc(&cp->refcnt);
			ct_write_unlock(idx);
 
			if ((ct = cp->control))
				atomic_inc(&ct->refcnt);
			IP_VS_DBG(4, "del connection\n");
			ip_vs_conn_expire_now(cp);
			if (ct) {
				IP_VS_DBG(4, "del conn template\n");
				ip_vs_conn_expire_now(ct);
			}
			ct_write_lock(idx);
		}
		ct_write_unlock_bh(idx);
	}
 
	/* the counter may be non-zero, because some conn entries may still
	   be run by slow timer handlers, or be unhashed but still referenced */
	if (atomic_read(&ip_vs_conn_count) != 0) {
		schedule();
		goto flush_again;
	}
}
 
 
int ip_vs_conn_init(void)
{
	int idx;
 
	/*
	 * Allocate the connection hash table and initialize its list heads
	 */
	ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
	if (!ip_vs_conn_tab)
		return -ENOMEM;
 
	IP_VS_INFO("Connection hash table configured "
		   "(size=%d, memory=%ldKbytes)\n",
		   IP_VS_CONN_TAB_SIZE,
		   (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
	IP_VS_DBG(0, "Each connection entry needs %d bytes at least\n",
		  (int)sizeof(struct ip_vs_conn));
 
	for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
		INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
	}
 
	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
		__ip_vs_conntbl_lock_array[idx].l = RW_LOCK_UNLOCKED;
	}
 
	/* Allocate ip_vs_conn slab cache */
	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
					      sizeof(struct ip_vs_conn), 0,
					      SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!ip_vs_conn_cachep) {
		vfree(ip_vs_conn_tab);
		return -ENOMEM;
	}
 
	proc_net_create("ip_vs_conn", 0, ip_vs_conn_getinfo);
 
	/* calculate the random value for connection hash */
	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
 
	return 0;
}
 
void ip_vs_conn_cleanup(void)
{
	/* flush all the connection entries first */
	ip_vs_conn_flush();
 
	/* Release the empty cache */
	kmem_cache_destroy(ip_vs_conn_cachep);
	proc_net_remove("ip_vs_conn");
	vfree(ip_vs_conn_tab);
}
 
