OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

Compare Revisions

  • This comparison shows the changes necessary to convert path /or1k/trunk/linux/linux-2.4/kernel from Rev 1275 to Rev 1765.

Rev 1275 → Rev 1765

/time.c
0,0 → 1,411
/*
* linux/kernel/time.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* This file contains the interface functions for the various
* time related system calls: time, stime, gettimeofday, settimeofday,
* adjtime
*/
/*
* Modification history kernel/time.c
*
* 1993-09-02 Philip Gladstone
* Created file with time related functions from sched.c and adjtimex()
* 1993-10-08 Torsten Duwe
* adjtime interface update and CMOS clock write code
* 1995-08-13 Torsten Duwe
* kernel PLL updated to 1994-12-13 specs (rfc-1589)
* 1999-01-16 Ulrich Windl
* Introduced error checking for many cases in adjtimex().
* Updated NTP code according to technical memorandum Jan '96
* "A Kernel Model for Precision Timekeeping" by Dave Mills
* Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10)
* (Even though the technical memorandum forbids it)
*/
 
#include <linux/mm.h>
#include <linux/timex.h>
#include <linux/smp_lock.h>
 
#include <asm/uaccess.h>
 
/*
* The timezone where the local system is located. Used as a default by some
* programs that obtain this value by using gettimeofday.
*/
struct timezone sys_tz;
 
/* The xtime_lock not only serializes the xtime reads/writes, it also
serializes all accesses to the global NTP variables now. */
extern rwlock_t xtime_lock;
 
#if !defined(__alpha__) && !defined(__ia64__)
 
/*
* sys_time() can be implemented in user-level using
* sys_gettimeofday(). Is this for backwards compatibility? If so,
* why not move it into the appropriate arch directory (for those
* architectures that need it).
*
* XXX This function is NOT 64-bit clean!
*/
asmlinkage long sys_time(int * tloc)
{
struct timeval now;
int i;
 
do_gettimeofday(&now);
i = now.tv_sec;
if (tloc) {
if (put_user(i,tloc))
i = -EFAULT;
}
return i;
}
 
/*
* sys_stime() can be implemented in user-level using
* sys_settimeofday(). Is this for backwards compatibility? If so,
* why not move it into the appropriate arch directory (for those
* architectures that need it).
*/
asmlinkage long sys_stime(int * tptr)
{
int value;
 
if (!capable(CAP_SYS_TIME))
return -EPERM;
if (get_user(value, tptr))
return -EFAULT;
write_lock_irq(&xtime_lock);
vxtime_lock();
xtime.tv_sec = value;
xtime.tv_usec = 0;
vxtime_unlock();
time_adjust = 0; /* stop active adjtime() */
time_status |= STA_UNSYNC;
time_maxerror = NTP_PHASE_LIMIT;
time_esterror = NTP_PHASE_LIMIT;
write_unlock_irq(&xtime_lock);
return 0;
}
 
#endif
 
asmlinkage long sys_gettimeofday(struct timeval *tv, struct timezone *tz)
{
if (tv) {
struct timeval ktv;
do_gettimeofday(&ktv);
if (copy_to_user(tv, &ktv, sizeof(ktv)))
return -EFAULT;
}
if (tz) {
if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
return -EFAULT;
}
return 0;
}
 
/*
* Adjust the time obtained from the CMOS to be UTC time instead of
* local time.
*
* This is ugly, but preferable to the alternatives. Otherwise we
* would either need to write a program to do it in /etc/rc (and risk
* confusion if the program gets run more than once; it would also be
* hard to make the program warp the clock precisely n hours) or
* compile the timezone information into the kernel. Bad, bad....
*
* - TYT, 1992-01-01
*
* The best thing to do is to keep the CMOS clock in universal time (UTC)
* as real UNIX machines always do it. This avoids all headaches about
* daylight saving times and warping kernel clocks.
*/
inline static void warp_clock(void)
{
write_lock_irq(&xtime_lock);
vxtime_lock();
xtime.tv_sec += sys_tz.tz_minuteswest * 60;
vxtime_unlock();
write_unlock_irq(&xtime_lock);
}
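/*
 * Worked example (illustrative, not part of the original file): with
 * sys_tz.tz_minuteswest == 300 (five hours west of Greenwich), warp_clock()
 * pushes xtime forward by 300 * 60 == 18000 seconds, converting a CMOS
 * clock that was keeping local time into UTC.
 */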
 
/*
* In case for some reason the CMOS clock has not already been running
* in UTC, but in some local time: The first time we set the timezone,
* we will warp the clock so that it is ticking UTC time instead of
* local time. Presumably, if someone is setting the timezone then we
* are running in an environment where the programs understand about
* timezones. This should be done at boot time in the /etc/rc script,
* as soon as possible, so that the clock can be set right. Otherwise,
* various programs will get confused when the clock gets warped.
*/
 
int do_sys_settimeofday(struct timeval *tv, struct timezone *tz)
{
static int firsttime = 1;
 
if (!capable(CAP_SYS_TIME))
return -EPERM;
if (tz) {
/* SMP safe, global irq locking makes it work. */
sys_tz = *tz;
if (firsttime) {
firsttime = 0;
if (!tv)
warp_clock();
}
}
if (tv)
{
/* SMP safe, again the code in arch/foo/time.c should
* globally block out interrupts when it runs.
*/
do_settimeofday(tv);
}
return 0;
}
 
asmlinkage long sys_settimeofday(struct timeval *tv, struct timezone *tz)
{
struct timeval new_tv;
struct timezone new_tz;
 
if (tv) {
if (copy_from_user(&new_tv, tv, sizeof(*tv)))
return -EFAULT;
}
if (tz) {
if (copy_from_user(&new_tz, tz, sizeof(*tz)))
return -EFAULT;
}
 
return do_sys_settimeofday(tv ? &new_tv : NULL, tz ? &new_tz : NULL);
}
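/*
 * Illustrative userspace sketch (not part of the original file): the first
 * settimeofday() call that passes only a timezone is what triggers the
 * warp_clock() path above. The function name and the 300-minute offset
 * (UTC-5, DST ignored) are hypothetical.
 */
#include <sys/time.h>

int example_set_timezone(void)
{
	struct timezone tz = { 300 /* tz_minuteswest */, 0 /* tz_dsttime */ };

	/* tv == NULL: on the first such call, xtime is warped by tz_minuteswest * 60 */
	return settimeofday(NULL, &tz);
}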
 
long pps_offset; /* pps time offset (us) */
long pps_jitter = MAXTIME; /* time dispersion (jitter) (us) */
 
long pps_freq; /* frequency offset (scaled ppm) */
long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */
 
long pps_valid = PPS_VALID; /* pps signal watchdog counter */
 
int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */
 
long pps_jitcnt; /* jitter limit exceeded */
long pps_calcnt; /* calibration intervals */
long pps_errcnt; /* calibration errors */
long pps_stbcnt; /* stability limit exceeded */
 
/* hook for a loadable hardpps kernel module */
void (*hardpps_ptr)(struct timeval *);
 
/* adjtimex mainly allows reading (and writing, if superuser) of
* kernel time-keeping variables. used by xntpd.
*/
int do_adjtimex(struct timex *txc)
{
long ltemp, mtemp, save_adjust;
int result;
 
/* In order to modify anything, you gotta be super-user! */
if (txc->modes && !capable(CAP_SYS_TIME))
return -EPERM;
/* Now we validate the data before disabling interrupts */
 
if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
/* singleshot must not be used with any other mode bits */
if (txc->modes != ADJ_OFFSET_SINGLESHOT)
return -EINVAL;
 
if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
/* adjustment Offset limited to +- .512 seconds */
if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
return -EINVAL;
 
/* if the quartz is off by more than 10% something is VERY wrong ! */
if (txc->modes & ADJ_TICK)
if (txc->tick < 900000/HZ || txc->tick > 1100000/HZ)
return -EINVAL;
 
write_lock_irq(&xtime_lock);
result = time_state; /* mostly `TIME_OK' */
 
/* Save for later - semantics of adjtime is to return old value */
save_adjust = time_adjust;
 
#if 0 /* STA_CLOCKERR is never set yet */
time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */
#endif
/* If there are input parameters, then process them */
if (txc->modes)
{
if (txc->modes & ADJ_STATUS) /* only set allowed bits */
time_status = (txc->status & ~STA_RONLY) |
(time_status & STA_RONLY);
 
if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */
if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
result = -EINVAL;
goto leave;
}
time_freq = txc->freq - pps_freq;
}
 
if (txc->modes & ADJ_MAXERROR) {
if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
result = -EINVAL;
goto leave;
}
time_maxerror = txc->maxerror;
}
 
if (txc->modes & ADJ_ESTERROR) {
if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
result = -EINVAL;
goto leave;
}
time_esterror = txc->esterror;
}
 
if (txc->modes & ADJ_TIMECONST) { /* p. 24 */
if (txc->constant < 0) { /* NTP v4 uses values > 6 */
result = -EINVAL;
goto leave;
}
time_constant = txc->constant;
}
 
if (txc->modes & ADJ_OFFSET) { /* values checked earlier */
if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
/* adjtime() is independent from ntp_adjtime() */
time_adjust = txc->offset;
}
else if ( time_status & (STA_PLL | STA_PPSTIME) ) {
ltemp = (time_status & (STA_PPSTIME | STA_PPSSIGNAL)) ==
(STA_PPSTIME | STA_PPSSIGNAL) ?
pps_offset : txc->offset;
 
/*
* Scale the phase adjustment and
* clamp to the operating range.
*/
if (ltemp > MAXPHASE)
time_offset = MAXPHASE << SHIFT_UPDATE;
else if (ltemp < -MAXPHASE)
time_offset = -(MAXPHASE << SHIFT_UPDATE);
else
time_offset = ltemp << SHIFT_UPDATE;
 
/*
* Select whether the frequency is to be controlled
* and in which mode (PLL or FLL). Clamp to the operating
* range. Ugly multiply/divide should be replaced someday.
*/
 
if (time_status & STA_FREQHOLD || time_reftime == 0)
time_reftime = xtime.tv_sec;
mtemp = xtime.tv_sec - time_reftime;
time_reftime = xtime.tv_sec;
if (time_status & STA_FLL) {
if (mtemp >= MINSEC) {
ltemp = (time_offset / mtemp) << (SHIFT_USEC -
SHIFT_UPDATE);
if (ltemp < 0)
time_freq -= -ltemp >> SHIFT_KH;
else
time_freq += ltemp >> SHIFT_KH;
} else /* calibration interval too short (p. 12) */
result = TIME_ERROR;
} else { /* PLL mode */
if (mtemp < MAXSEC) {
ltemp *= mtemp;
if (ltemp < 0)
time_freq -= -ltemp >> (time_constant +
time_constant +
SHIFT_KF - SHIFT_USEC);
else
time_freq += ltemp >> (time_constant +
time_constant +
SHIFT_KF - SHIFT_USEC);
} else /* calibration interval too long (p. 12) */
result = TIME_ERROR;
}
if (time_freq > time_tolerance)
time_freq = time_tolerance;
else if (time_freq < -time_tolerance)
time_freq = -time_tolerance;
} /* STA_PLL || STA_PPSTIME */
} /* txc->modes & ADJ_OFFSET */
if (txc->modes & ADJ_TICK) {
/* if the quartz is off by more than 10% something is
VERY wrong ! */
if (txc->tick < 900000/HZ || txc->tick > 1100000/HZ) {
result = -EINVAL;
goto leave;
}
tick = txc->tick;
}
} /* txc->modes */
leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
|| ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0
&& (time_status & STA_PPSSIGNAL) == 0)
/* p. 24, (b) */
|| ((time_status & (STA_PPSTIME|STA_PPSJITTER))
== (STA_PPSTIME|STA_PPSJITTER))
/* p. 24, (c) */
|| ((time_status & STA_PPSFREQ) != 0
&& (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0))
/* p. 24, (d) */
result = TIME_ERROR;
if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
txc->offset = save_adjust;
else {
if (time_offset < 0)
txc->offset = -(-time_offset >> SHIFT_UPDATE);
else
txc->offset = time_offset >> SHIFT_UPDATE;
}
txc->freq = time_freq + pps_freq;
txc->maxerror = time_maxerror;
txc->esterror = time_esterror;
txc->status = time_status;
txc->constant = time_constant;
txc->precision = time_precision;
txc->tolerance = time_tolerance;
txc->tick = tick;
txc->ppsfreq = pps_freq;
txc->jitter = pps_jitter >> PPS_AVG;
txc->shift = pps_shift;
txc->stabil = pps_stabil;
txc->jitcnt = pps_jitcnt;
txc->calcnt = pps_calcnt;
txc->errcnt = pps_errcnt;
txc->stbcnt = pps_stbcnt;
write_unlock_irq(&xtime_lock);
do_gettimeofday(&txc->time);
return(result);
}
 
asmlinkage long sys_adjtimex(struct timex *txc_p)
{
struct timex txc; /* Local copy of parameter */
int ret;
 
/* Copy the user data space into the kernel copy
* structure. But bear in mind that the structures
* may change
*/
if(copy_from_user(&txc, txc_p, sizeof(struct timex)))
return -EFAULT;
ret = do_adjtimex(&txc);
return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
}
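/*
 * Illustrative userspace sketch (not part of the original file): reading the
 * kernel time-keeping variables through adjtimex(2). With txc.modes == 0
 * nothing is modified, so no privilege is needed (see the CAP_SYS_TIME check
 * in do_adjtimex above). The function name is hypothetical.
 */
#include <stdio.h>
#include <sys/timex.h>

int example_read_timex(void)
{
	struct timex txc;
	int state;

	txc.modes = 0;			/* read-only query */
	state = adjtimex(&txc);
	if (state < 0)
		return state;
	printf("freq=%ld offset=%ld status=%#x state=%d\n",
	       txc.freq, txc.offset, txc.status, state);
	return 0;
}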
/dma.c
0,0 → 1,128
/* $Id: dma.c,v 1.1.1.1 2004-04-15 02:30:21 phoenix Exp $
* linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
*
* Written by Hennus Bergman, 1992.
*
* 1994/12/26: Changes by Alex Nash to fix a minor bug in /proc/dma.
* In the previous version the reported device could end up being wrong,
* if a device requested a DMA channel that was already in use.
* [It also happened to remove the sizeof(char *) == sizeof(int)
* assumption introduced because of those /proc/dma patches. -- Hennus]
*/
 
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <asm/dma.h>
#include <asm/system.h>
 
 
/* A note on resource allocation:
*
* All drivers needing DMA channels should allocate and release them
* through the public routines `request_dma()' and `free_dma()'.
*
* In order to avoid problems, all processes should allocate resources in
* the same sequence and release them in the reverse order.
*
* So, when allocating DMAs and IRQs, first allocate the IRQ, then the DMA.
* When releasing them, first release the DMA, then release the IRQ.
* If you don't, you may cause allocation requests to fail unnecessarily.
* This doesn't really matter now, but it will once we get real semaphores
* in the kernel.
*/
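/*
 * Illustrative sketch (not part of the original file): a driver following the
 * ordering rule above -- IRQ first, then DMA, released in reverse. The device
 * name, IRQ line, channel number and handler are hypothetical, and
 * <linux/sched.h> is assumed for the request_irq()/free_irq() declarations.
 */
static void example_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	/* acknowledge the (hypothetical) device here */
}

static int example_attach(void)
{
	int err;

	err = request_irq(5, example_interrupt, 0, "example", NULL);
	if (err)
		return err;
	err = request_dma(3, "example");	/* IRQ first, then DMA */
	if (err) {
		free_irq(5, NULL);
		return err;
	}
	return 0;
}

static void example_detach(void)
{
	free_dma(3);				/* release the DMA first, then the IRQ */
	free_irq(5, NULL);
}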
 
 
spinlock_t dma_spin_lock = SPIN_LOCK_UNLOCKED;
 
/*
* If our port doesn't define this, it has no PC-like DMA.
*/
 
#ifdef MAX_DMA_CHANNELS
 
 
/* Channel n is busy iff dma_chan_busy[n].lock != 0.
* DMA0 used to be reserved for DRAM refresh, but apparently not any more...
* DMA4 is reserved for cascading.
*/
 
struct dma_chan {
int lock;
const char *device_id;
};
 
static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = {
{ 0, 0 },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 },
{ 1, "cascade" },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 }
};
 
int get_dma_list(char *buf)
{
int i, len = 0;
 
for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) {
if (dma_chan_busy[i].lock) {
len += sprintf(buf+len, "%2d: %s\n",
i,
dma_chan_busy[i].device_id);
}
}
return len;
} /* get_dma_list */
 
 
int request_dma(unsigned int dmanr, const char * device_id)
{
if (dmanr >= MAX_DMA_CHANNELS)
return -EINVAL;
 
if (xchg(&dma_chan_busy[dmanr].lock, 1) != 0)
return -EBUSY;
 
dma_chan_busy[dmanr].device_id = device_id;
 
/* old flag was 0, now contains 1 to indicate busy */
return 0;
} /* request_dma */
 
 
void free_dma(unsigned int dmanr)
{
if (dmanr >= MAX_DMA_CHANNELS) {
printk("Trying to free DMA%d\n", dmanr);
return;
}
 
if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) {
printk("Trying to free free DMA%d\n", dmanr);
return;
}
 
} /* free_dma */
 
#else
 
int request_dma(unsigned int dmanr, const char *device_id)
{
return -EINVAL;
}
 
void free_dma(unsigned int dmanr)
{
}
 
int get_dma_list(char *buf)
{
strcpy(buf, "No DMA\n");
return 7;
}
#endif
/fork.c
0,0 → 1,895
/*
* linux/kernel/fork.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
 
/*
* 'fork.c' contains the help-routines for the 'fork' system call
* (see also entry.S and others).
* Fork is rather simple, once you get the hang of it, but the memory
* management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
*/
 
#include <linux/config.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/namespace.h>
#include <linux/personality.h>
#include <linux/compiler.h>
 
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/processor.h>
 
/* The idle threads do not count.. */
int nr_threads;
int nr_running;
 
int max_threads;
unsigned long total_forks; /* Handle normal Linux uptimes. */
int last_pid;
 
struct task_struct *pidhash[PIDHASH_SZ];
 
void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
{
unsigned long flags;
 
wait->flags &= ~WQ_FLAG_EXCLUSIVE;
wq_write_lock_irqsave(&q->lock, flags);
__add_wait_queue(q, wait);
wq_write_unlock_irqrestore(&q->lock, flags);
}
 
void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)
{
unsigned long flags;
 
wait->flags |= WQ_FLAG_EXCLUSIVE;
wq_write_lock_irqsave(&q->lock, flags);
__add_wait_queue_tail(q, wait);
wq_write_unlock_irqrestore(&q->lock, flags);
}
 
void remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
{
unsigned long flags;
 
wq_write_lock_irqsave(&q->lock, flags);
__remove_wait_queue(q, wait);
wq_write_unlock_irqrestore(&q->lock, flags);
}
 
void __init fork_init(unsigned long mempages)
{
/*
* The default maximum number of threads is set to a safe
* value: the thread structures can take up at most half
* of memory.
*/
max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8;
 
init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
}
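/*
 * Worked example (illustrative, not part of the original file): assuming
 * 4 KB pages and an 8 KB THREAD_SIZE, a 128 MB machine has mempages == 32768,
 * so max_threads = 32768 / 2 / 8 = 2048 and the default RLIMIT_NPROC is 1024.
 */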
 
/* Protects next_safe and last_pid. */
spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
 
static int get_pid(unsigned long flags)
{
static int next_safe = PID_MAX;
struct task_struct *p;
int pid, beginpid;
 
if (flags & CLONE_PID)
return current->pid;
 
spin_lock(&lastpid_lock);
beginpid = last_pid;
if((++last_pid) & 0xffff8000) {
last_pid = 300; /* Skip daemons etc. */
goto inside;
}
if(last_pid >= next_safe) {
inside:
next_safe = PID_MAX;
read_lock(&tasklist_lock);
repeat:
for_each_task(p) {
if(p->pid == last_pid ||
p->pgrp == last_pid ||
p->tgid == last_pid ||
p->session == last_pid) {
if(++last_pid >= next_safe) {
if(last_pid & 0xffff8000)
last_pid = 300;
next_safe = PID_MAX;
}
if(unlikely(last_pid == beginpid)) {
next_safe = 0;
goto nomorepids;
}
goto repeat;
}
if(p->pid > last_pid && next_safe > p->pid)
next_safe = p->pid;
if(p->pgrp > last_pid && next_safe > p->pgrp)
next_safe = p->pgrp;
if(p->tgid > last_pid && next_safe > p->tgid)
next_safe = p->tgid;
if(p->session > last_pid && next_safe > p->session)
next_safe = p->session;
}
read_unlock(&tasklist_lock);
}
pid = last_pid;
spin_unlock(&lastpid_lock);
 
return pid;
 
nomorepids:
read_unlock(&tasklist_lock);
spin_unlock(&lastpid_lock);
return 0;
}
 
static inline int dup_mmap(struct mm_struct * mm)
{
struct vm_area_struct * mpnt, *tmp, **pprev;
int retval;
 
flush_cache_mm(current->mm);
mm->locked_vm = 0;
mm->mmap = NULL;
mm->mmap_cache = NULL;
mm->map_count = 0;
mm->rss = 0;
mm->cpu_vm_mask = 0;
mm->swap_address = 0;
pprev = &mm->mmap;
 
/*
* Add it to the mmlist after the parent.
* Doing it this way means that we can order the list,
* and fork() won't mess up the ordering significantly.
* Add it first so that swapoff can see any swap entries.
*/
spin_lock(&mmlist_lock);
list_add(&mm->mmlist, &current->mm->mmlist);
mmlist_nr++;
spin_unlock(&mmlist_lock);
 
for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
struct file *file;
 
retval = -ENOMEM;
if(mpnt->vm_flags & VM_DONTCOPY)
continue;
tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (!tmp)
goto fail_nomem;
*tmp = *mpnt;
tmp->vm_flags &= ~VM_LOCKED;
tmp->vm_mm = mm;
tmp->vm_next = NULL;
file = tmp->vm_file;
if (file) {
struct inode *inode = file->f_dentry->d_inode;
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
/* insert tmp into the share list, just after mpnt */
spin_lock(&inode->i_mapping->i_shared_lock);
if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
mpnt->vm_next_share->vm_pprev_share =
&tmp->vm_next_share;
mpnt->vm_next_share = tmp;
tmp->vm_pprev_share = &mpnt->vm_next_share;
spin_unlock(&inode->i_mapping->i_shared_lock);
}
 
/*
* Link in the new vma and copy the page table entries:
* link in first so that swapoff can see swap entries.
*/
spin_lock(&mm->page_table_lock);
*pprev = tmp;
pprev = &tmp->vm_next;
mm->map_count++;
retval = copy_page_range(mm, current->mm, tmp);
spin_unlock(&mm->page_table_lock);
 
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
 
if (retval)
goto fail_nomem;
}
retval = 0;
build_mmap_rb(mm);
 
fail_nomem:
flush_tlb_mm(current->mm);
return retval;
}
 
spinlock_t mmlist_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;
int mmlist_nr;
 
#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
 
static struct mm_struct * mm_init(struct mm_struct * mm)
{
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
mm->page_table_lock = SPIN_LOCK_UNLOCKED;
mm->pgd = pgd_alloc(mm);
mm->def_flags = 0;
if (mm->pgd)
return mm;
free_mm(mm);
return NULL;
}
 
/*
* Allocate and initialize an mm_struct.
*/
struct mm_struct * mm_alloc(void)
{
struct mm_struct * mm;
 
mm = allocate_mm();
if (mm) {
memset(mm, 0, sizeof(*mm));
return mm_init(mm);
}
return NULL;
}
 
/*
* Called when the last reference to the mm
* is dropped: either by a lazy thread or by
* mmput. Free the page directory and the mm.
*/
inline void __mmdrop(struct mm_struct *mm)
{
BUG_ON(mm == &init_mm);
pgd_free(mm->pgd);
check_pgt_cache();
destroy_context(mm);
free_mm(mm);
}
 
/*
* Decrement the use count and release all resources for an mm.
*/
void mmput(struct mm_struct *mm)
{
if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) {
extern struct mm_struct *swap_mm;
if (swap_mm == mm)
swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
list_del(&mm->mmlist);
mmlist_nr--;
spin_unlock(&mmlist_lock);
exit_mmap(mm);
mmdrop(mm);
}
}
 
/* Please note the differences between mmput and mm_release.
* mmput is called whenever we stop holding onto a mm_struct,
* whether on error, success, or anything else.
*
* mm_release is called after a mm_struct has been removed
* from the current process.
*
* This difference is important for error handling, when we
* only half set up a mm_struct for a new process and need to restore
* the old one. Because we mmput the new mm_struct before
* restoring the old one...
* Eric Biederman 10 January 1998
*/
void mm_release(void)
{
struct task_struct *tsk = current;
struct completion *vfork_done = tsk->vfork_done;
 
/* notify parent sleeping on vfork() */
if (vfork_done) {
tsk->vfork_done = NULL;
complete(vfork_done);
}
}
 
static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
struct mm_struct * mm, *oldmm;
int retval;
 
tsk->min_flt = tsk->maj_flt = 0;
tsk->cmin_flt = tsk->cmaj_flt = 0;
tsk->nswap = tsk->cnswap = 0;
 
tsk->mm = NULL;
tsk->active_mm = NULL;
 
/*
* Are we cloning a kernel thread?
*
* We need to steal an active VM for that.
*/
oldmm = current->mm;
if (!oldmm)
return 0;
 
if (clone_flags & CLONE_VM) {
atomic_inc(&oldmm->mm_users);
mm = oldmm;
goto good_mm;
}
 
retval = -ENOMEM;
mm = allocate_mm();
if (!mm)
goto fail_nomem;
 
/* Copy the current MM stuff.. */
memcpy(mm, oldmm, sizeof(*mm));
if (!mm_init(mm))
goto fail_nomem;
 
if (init_new_context(tsk,mm))
goto free_pt;
 
down_write(&oldmm->mmap_sem);
retval = dup_mmap(mm);
up_write(&oldmm->mmap_sem);
 
if (retval)
goto free_pt;
 
/*
* child gets a private LDT (if there was an LDT in the parent)
*/
copy_segments(tsk, mm);
 
good_mm:
tsk->mm = mm;
tsk->active_mm = mm;
return 0;
 
free_pt:
mmput(mm);
fail_nomem:
return retval;
}
 
static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
{
struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
/* We don't need to lock fs - think why ;-) */
if (fs) {
atomic_set(&fs->count, 1);
fs->lock = RW_LOCK_UNLOCKED;
fs->umask = old->umask;
read_lock(&old->lock);
fs->rootmnt = mntget(old->rootmnt);
fs->root = dget(old->root);
fs->pwdmnt = mntget(old->pwdmnt);
fs->pwd = dget(old->pwd);
if (old->altroot) {
fs->altrootmnt = mntget(old->altrootmnt);
fs->altroot = dget(old->altroot);
} else {
fs->altrootmnt = NULL;
fs->altroot = NULL;
}
read_unlock(&old->lock);
}
return fs;
}
 
struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
return __copy_fs_struct(old);
}
 
static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
if (clone_flags & CLONE_FS) {
atomic_inc(&current->fs->count);
return 0;
}
tsk->fs = __copy_fs_struct(current->fs);
if (!tsk->fs)
return -1;
return 0;
}
 
static int count_open_files(struct files_struct *files, int size)
{
int i;
/* Find the last open fd */
for (i = size/(8*sizeof(long)); i > 0; ) {
if (files->open_fds->fds_bits[--i])
break;
}
i = (i+1) * 8 * sizeof(long);
return i;
}
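/*
 * Worked example (illustrative, not part of the original file): on a 32-bit
 * machine with size == 1024, if the highest open fd is 70 the scan stops at
 * word 2 of the bitmap, so the function returns (2 + 1) * 32 == 96; the count
 * is always rounded up to a whole word of open_fds.
 */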
 
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
struct files_struct *oldf, *newf;
struct file **old_fds, **new_fds;
int open_files, nfds, size, i, error = 0;
 
/*
* A background process may not have any files ...
*/
oldf = current->files;
if (!oldf)
goto out;
 
if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
goto out;
}
 
/*
* Note: we may be using current for both targets (See exec.c)
* This works because we cache current->files (old) as oldf. Don't
* break this.
*/
tsk->files = NULL;
error = -ENOMEM;
newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
if (!newf)
goto out;
 
atomic_set(&newf->count, 1);
 
newf->file_lock = RW_LOCK_UNLOCKED;
newf->next_fd = 0;
newf->max_fds = NR_OPEN_DEFAULT;
newf->max_fdset = __FD_SETSIZE;
newf->close_on_exec = &newf->close_on_exec_init;
newf->open_fds = &newf->open_fds_init;
newf->fd = &newf->fd_array[0];
 
/* We don't yet have the oldf readlock, but even if the old
fdset gets grown now, we'll only copy up to "size" fds */
size = oldf->max_fdset;
if (size > __FD_SETSIZE) {
newf->max_fdset = 0;
write_lock(&newf->file_lock);
error = expand_fdset(newf, size-1);
write_unlock(&newf->file_lock);
if (error)
goto out_release;
}
read_lock(&oldf->file_lock);
 
open_files = count_open_files(oldf, size);
 
/*
* Check whether we need to allocate a larger fd array.
* Note: we're not a clone task, so the open count won't
* change.
*/
nfds = NR_OPEN_DEFAULT;
if (open_files > nfds) {
read_unlock(&oldf->file_lock);
newf->max_fds = 0;
write_lock(&newf->file_lock);
error = expand_fd_array(newf, open_files-1);
write_unlock(&newf->file_lock);
if (error)
goto out_release;
nfds = newf->max_fds;
read_lock(&oldf->file_lock);
}
 
old_fds = oldf->fd;
new_fds = newf->fd;
 
memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
 
for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
if (f)
get_file(f);
*new_fds++ = f;
}
read_unlock(&oldf->file_lock);
 
/* compute the remainder to be cleared */
size = (newf->max_fds - open_files) * sizeof(struct file *);
 
/* This is long-word aligned, thus could use an optimized version */
memset(new_fds, 0, size);
 
if (newf->max_fdset > open_files) {
int left = (newf->max_fdset-open_files)/8;
int start = open_files / (8 * sizeof(unsigned long));
memset(&newf->open_fds->fds_bits[start], 0, left);
memset(&newf->close_on_exec->fds_bits[start], 0, left);
}
 
tsk->files = newf;
error = 0;
out:
return error;
 
out_release:
free_fdset (newf->close_on_exec, newf->max_fdset);
free_fdset (newf->open_fds, newf->max_fdset);
kmem_cache_free(files_cachep, newf);
goto out;
}
 
/*
* Helper to unshare the files of the current task.
* We don't want to expose copy_files internals to
* the exec layer of the kernel.
*/
 
int unshare_files(void)
{
struct files_struct *files = current->files;
int rc;
if(!files)
BUG();
/* This can race but the race causes us to copy when we don't
need to and drop the copy */
if(atomic_read(&files->count) == 1)
{
atomic_inc(&files->count);
return 0;
}
rc = copy_files(0, current);
if(rc)
current->files = files;
return rc;
}
 
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
struct signal_struct *sig;
 
if (clone_flags & CLONE_SIGHAND) {
atomic_inc(&current->sig->count);
return 0;
}
sig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
tsk->sig = sig;
if (!sig)
return -1;
spin_lock_init(&sig->siglock);
atomic_set(&sig->count, 1);
memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
return 0;
}
 
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
unsigned long new_flags = p->flags;
 
new_flags &= ~(PF_SUPERPRIV | PF_USEDFPU);
new_flags |= PF_FORKNOEXEC;
if (!(clone_flags & CLONE_PTRACE))
p->ptrace = 0;
p->flags = new_flags;
}
 
long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
{
struct task_struct *task = current;
unsigned old_task_dumpable;
long ret;
 
/* lock out any potential ptracer */
task_lock(task);
if (task->ptrace) {
task_unlock(task);
return -EPERM;
}
 
old_task_dumpable = task->task_dumpable;
task->task_dumpable = 0;
task_unlock(task);
 
ret = arch_kernel_thread(fn, arg, flags);
 
/* never reached in child process, only in parent */
current->task_dumpable = old_task_dumpable;
 
return ret;
}
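/*
 * Illustrative sketch (not part of the original file): the usual 2.4-era way
 * a driver uses the helper above to start a kernel thread. The thread name,
 * function names and clone flags shown are hypothetical choices.
 */
static struct completion example_exited;

static int example_thread(void *unused)
{
	daemonize();			/* shed user-space resources */
	strcpy(current->comm, "example");
	/* ... do the real work here ... */
	complete_and_exit(&example_exited, 0);
	return 0;			/* not reached */
}

static int example_start(void)
{
	init_completion(&example_exited);
	return kernel_thread(example_thread, NULL,
			     CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
}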
 
/*
* Ok, this is the main fork-routine. It copies the system process
* information (task[nr]) and sets up the necessary registers. It also
* copies the data segment in its entirety. The "stack_start" and
* "stack_top" arguments are simply passed along to the platform
* specific copy_thread() routine. Most platforms ignore stack_top.
* For an example that's using stack_top, see
* arch/ia64/kernel/process.c.
*/
int do_fork(unsigned long clone_flags, unsigned long stack_start,
struct pt_regs *regs, unsigned long stack_size)
{
int retval;
struct task_struct *p;
struct completion vfork;
 
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return -EINVAL;
 
retval = -EPERM;
 
/*
* CLONE_PID is only allowed for the initial SMP swapper
* calls
*/
if (clone_flags & CLONE_PID) {
if (current->pid)
goto fork_out;
}
 
retval = -ENOMEM;
p = alloc_task_struct();
if (!p)
goto fork_out;
 
*p = *current;
 
retval = -EAGAIN;
/*
* Check if we are over our maximum process limit, but be sure to
* exclude root. This is needed to make it possible for login and
* friends to set the per-user process limit to something lower
* than the number of processes root is running. -- Rik
*/
if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur
&& p->user != &root_user
&& !capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE))
goto bad_fork_free;
 
atomic_inc(&p->user->__count);
atomic_inc(&p->user->processes);
 
/*
* Counter increases are protected by
* the kernel lock so nr_threads can't
* increase under us (but it may decrease).
*/
if (nr_threads >= max_threads)
goto bad_fork_cleanup_count;
get_exec_domain(p->exec_domain);
 
if (p->binfmt && p->binfmt->module)
__MOD_INC_USE_COUNT(p->binfmt->module);
 
p->did_exec = 0;
p->swappable = 0;
p->state = TASK_UNINTERRUPTIBLE;
 
copy_flags(clone_flags, p);
p->pid = get_pid(clone_flags);
if (p->pid == 0 && current->pid != 0)
goto bad_fork_cleanup;
 
p->run_list.next = NULL;
p->run_list.prev = NULL;
 
p->p_cptr = NULL;
init_waitqueue_head(&p->wait_chldexit);
p->vfork_done = NULL;
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
}
spin_lock_init(&p->alloc_lock);
 
p->sigpending = 0;
init_sigpending(&p->pending);
 
p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
init_timer(&p->real_timer);
p->real_timer.data = (unsigned long) p;
 
p->leader = 0; /* session leadership doesn't inherit */
p->tty_old_pgrp = 0;
p->times.tms_utime = p->times.tms_stime = 0;
p->times.tms_cutime = p->times.tms_cstime = 0;
#ifdef CONFIG_SMP
{
int i;
p->cpus_runnable = ~0UL;
p->processor = current->processor;
/* ?? should we just memset this ?? */
for(i = 0; i < smp_num_cpus; i++)
p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
spin_lock_init(&p->sigmask_lock);
}
#endif
p->lock_depth = -1; /* -1 = no lock */
p->start_time = jiffies;
 
INIT_LIST_HEAD(&p->local_pages);
 
retval = -ENOMEM;
/* copy all the process information */
if (copy_files(clone_flags, p))
goto bad_fork_cleanup;
if (copy_fs(clone_flags, p))
goto bad_fork_cleanup_files;
if (copy_sighand(clone_flags, p))
goto bad_fork_cleanup_fs;
if (copy_mm(clone_flags, p))
goto bad_fork_cleanup_sighand;
retval = copy_namespace(clone_flags, p);
if (retval)
goto bad_fork_cleanup_mm;
retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
if (retval)
goto bad_fork_cleanup_namespace;
p->semundo = NULL;
/* Our parent execution domain becomes the current domain.
These must match for thread signalling to apply */
p->parent_exec_id = p->self_exec_id;
 
/* ok, now we should be set up.. */
p->swappable = 1;
p->exit_signal = clone_flags & CSIGNAL;
p->pdeath_signal = 0;
 
/*
* "share" dynamic priority between parent and child, thus the
* total amount of dynamic priority in the system doesn't change, giving
* more scheduling fairness. This only matters for the first timeslice;
* in the long run the scheduling behaviour is unchanged.
*/
p->counter = (current->counter + 1) >> 1;
current->counter >>= 1;
if (!current->counter)
current->need_resched = 1;
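/*
 * Worked example (illustrative, not part of the original file): a parent
 * that entered with counter == 11 keeps 11 >> 1 == 5 ticks while the child
 * gets (11 + 1) >> 1 == 6, so the pair still holds the original 11 ticks.
 */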
 
/*
* Ok, add it to the run-queues and make it
* visible to the rest of the system.
*
* Let it rip!
*/
retval = p->pid;
p->tgid = retval;
INIT_LIST_HEAD(&p->thread_group);
 
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
 
/* CLONE_PARENT re-uses the old parent */
p->p_opptr = current->p_opptr;
p->p_pptr = current->p_pptr;
if (!(clone_flags & CLONE_PARENT)) {
p->p_opptr = current;
if (!(p->ptrace & PT_PTRACED))
p->p_pptr = current;
}
 
if (clone_flags & CLONE_THREAD) {
p->tgid = current->tgid;
list_add(&p->thread_group, &current->thread_group);
}
 
SET_LINKS(p);
hash_pid(p);
nr_threads++;
write_unlock_irq(&tasklist_lock);
 
if (p->ptrace & PT_PTRACED)
send_sig(SIGSTOP, p, 1);
 
wake_up_process(p); /* do this last */
++total_forks;
if (clone_flags & CLONE_VFORK)
wait_for_completion(&vfork);
 
fork_out:
return retval;
 
bad_fork_cleanup_namespace:
exit_namespace(p);
bad_fork_cleanup_mm:
exit_mm(p);
if (p->active_mm)
mmdrop(p->active_mm);
bad_fork_cleanup_sighand:
exit_sighand(p);
bad_fork_cleanup_fs:
exit_fs(p); /* blocking */
bad_fork_cleanup_files:
exit_files(p); /* blocking */
bad_fork_cleanup:
put_exec_domain(p->exec_domain);
if (p->binfmt && p->binfmt->module)
__MOD_DEC_USE_COUNT(p->binfmt->module);
bad_fork_cleanup_count:
atomic_dec(&p->user->processes);
free_uid(p->user);
bad_fork_free:
free_task_struct(p);
goto fork_out;
}
 
/* SLAB cache for signal_struct structures (tsk->sig) */
kmem_cache_t *sigact_cachep;
 
/* SLAB cache for files_struct structures (tsk->files) */
kmem_cache_t *files_cachep;
 
/* SLAB cache for fs_struct structures (tsk->fs) */
kmem_cache_t *fs_cachep;
 
/* SLAB cache for vm_area_struct structures */
kmem_cache_t *vm_area_cachep;
 
/* SLAB cache for mm_struct structures (tsk->mm) */
kmem_cache_t *mm_cachep;
 
void __init proc_caches_init(void)
{
sigact_cachep = kmem_cache_create("signal_act",
sizeof(struct signal_struct), 0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
if (!sigact_cachep)
panic("Cannot create signal action SLAB cache");
 
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
if (!files_cachep)
panic("Cannot create files SLAB cache");
 
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
if (!fs_cachep)
panic("Cannot create fs_struct SLAB cache");
vm_area_cachep = kmem_cache_create("vm_area_struct",
sizeof(struct vm_area_struct), 0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
if(!vm_area_cachep)
panic("vma_init: Cannot alloc vm_area_struct SLAB cache");
 
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), 0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
if(!mm_cachep)
panic("vma_init: Cannot alloc mm_struct SLAB cache");
}
/ksyms.c
0,0 → 1,624
/*
* Herein lies all the functions/variables that are "exported" for linkage
* with dynamically loaded kernel modules.
* Jon.
*
* - Stacked module support and unified symbol table added (June 1994)
* - External symbol table support added (December 1994)
* - Versions on symbols added (December 1994)
* by Bjorn Ekwall <bj0rn@blox.se>
*/
 
#include <linux/config.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/cdrom.h>
#include <linux/kernel_stat.h>
#include <linux/vmalloc.h>
#include <linux/sys.h>
#include <linux/utsname.h>
#include <linux/interrupt.h>
#include <linux/ioport.h>
#include <linux/serial.h>
#include <linux/locks.h>
#include <linux/delay.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/pagemap.h>
#include <linux/sysctl.h>
#include <linux/hdreg.h>
#include <linux/skbuff.h>
#include <linux/genhd.h>
#include <linux/blkpg.h>
#include <linux/swap.h>
#include <linux/ctype.h>
#include <linux/file.h>
#include <linux/iobuf.h>
#include <linux/console.h>
#include <linux/poll.h>
#include <linux/mmzone.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/highuid.h>
#include <linux/brlock.h>
#include <linux/fs.h>
#include <linux/tty.h>
#include <linux/in6.h>
#include <linux/completion.h>
#include <linux/seq_file.h>
#include <linux/dnotify.h>
#include <linux/crc32.h>
#include <linux/firmware.h>
#include <asm/checksum.h>
 
#if defined(CONFIG_PROC_FS)
#include <linux/proc_fs.h>
#endif
#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif
 
extern void set_device_ro(kdev_t dev,int flag);
 
extern void *sys_call_table;
 
extern struct timezone sys_tz;
extern int request_dma(unsigned int dmanr, char * deviceID);
extern void free_dma(unsigned int dmanr);
extern spinlock_t dma_spin_lock;
extern int panic_timeout;
 
#ifdef CONFIG_MODVERSIONS
const struct module_symbol __export_Using_Versions
__attribute__((section("__ksymtab"))) = {
1 /* Version version */, "Using_Versions"
};
#endif
 
 
EXPORT_SYMBOL(inter_module_register);
EXPORT_SYMBOL(inter_module_unregister);
EXPORT_SYMBOL(inter_module_get);
EXPORT_SYMBOL(inter_module_get_request);
EXPORT_SYMBOL(inter_module_put);
EXPORT_SYMBOL(try_inc_mod_count);
 
/* process memory management */
EXPORT_SYMBOL(do_mmap_pgoff);
EXPORT_SYMBOL(do_munmap);
EXPORT_SYMBOL(do_brk);
EXPORT_SYMBOL(exit_mm);
EXPORT_SYMBOL(exit_files);
EXPORT_SYMBOL(exit_fs);
EXPORT_SYMBOL(exit_sighand);
 
/* internal kernel memory management */
EXPORT_SYMBOL(_alloc_pages);
EXPORT_SYMBOL(__alloc_pages);
EXPORT_SYMBOL(alloc_pages_node);
EXPORT_SYMBOL(__get_free_pages);
EXPORT_SYMBOL(get_zeroed_page);
EXPORT_SYMBOL(__free_pages);
EXPORT_SYMBOL(free_pages);
EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(kmem_find_general_cachep);
EXPORT_SYMBOL(kmem_cache_create);
EXPORT_SYMBOL(kmem_cache_destroy);
EXPORT_SYMBOL(kmem_cache_shrink);
EXPORT_SYMBOL(kmem_cache_alloc);
EXPORT_SYMBOL(kmem_cache_free);
EXPORT_SYMBOL(kmem_cache_size);
EXPORT_SYMBOL(kmalloc);
EXPORT_SYMBOL(kfree);
EXPORT_SYMBOL(vfree);
EXPORT_SYMBOL(__vmalloc);
EXPORT_SYMBOL(vmap);
EXPORT_SYMBOL(vmalloc_to_page);
EXPORT_SYMBOL(mem_map);
EXPORT_SYMBOL(remap_page_range);
EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(high_memory);
EXPORT_SYMBOL(vmtruncate);
EXPORT_SYMBOL(find_vma);
EXPORT_SYMBOL(get_unmapped_area);
EXPORT_SYMBOL(init_mm);
#ifdef CONFIG_HIGHMEM
EXPORT_SYMBOL(kmap_high);
EXPORT_SYMBOL(kunmap_high);
EXPORT_SYMBOL(highmem_start_page);
EXPORT_SYMBOL(create_bounce);
EXPORT_SYMBOL(kmap_prot);
EXPORT_SYMBOL(kmap_pte);
#endif
 
/* filesystem internal functions */
EXPORT_SYMBOL(def_blk_fops);
EXPORT_SYMBOL(update_atime);
EXPORT_SYMBOL(get_fs_type);
EXPORT_SYMBOL(get_super);
EXPORT_SYMBOL(drop_super);
EXPORT_SYMBOL(getname);
EXPORT_SYMBOL(names_cachep);
EXPORT_SYMBOL(fput);
EXPORT_SYMBOL(fget);
EXPORT_SYMBOL(igrab);
EXPORT_SYMBOL(iunique);
EXPORT_SYMBOL(ilookup);
EXPORT_SYMBOL(iget4_locked);
EXPORT_SYMBOL(unlock_new_inode);
EXPORT_SYMBOL(iput);
EXPORT_SYMBOL(inode_init_once);
EXPORT_SYMBOL(__inode_init_once);
EXPORT_SYMBOL(force_delete);
EXPORT_SYMBOL(follow_up);
EXPORT_SYMBOL(follow_down);
EXPORT_SYMBOL(lookup_mnt);
EXPORT_SYMBOL(path_init);
EXPORT_SYMBOL(path_walk);
EXPORT_SYMBOL(path_lookup);
EXPORT_SYMBOL(path_release);
EXPORT_SYMBOL(__user_walk);
EXPORT_SYMBOL(lookup_one_len);
EXPORT_SYMBOL(lookup_hash);
EXPORT_SYMBOL(sys_close);
EXPORT_SYMBOL(dcache_lock);
EXPORT_SYMBOL(d_alloc_root);
EXPORT_SYMBOL(d_delete);
EXPORT_SYMBOL(dget_locked);
EXPORT_SYMBOL(d_validate);
EXPORT_SYMBOL(d_rehash);
EXPORT_SYMBOL(d_invalidate); /* Maybe it would be better in dcache.h? */
EXPORT_SYMBOL(d_move);
EXPORT_SYMBOL(d_instantiate);
EXPORT_SYMBOL(d_alloc);
EXPORT_SYMBOL(d_lookup);
EXPORT_SYMBOL(__d_path);
EXPORT_SYMBOL(mark_buffer_dirty);
EXPORT_SYMBOL(set_buffer_async_io); /* for reiserfs_writepage */
EXPORT_SYMBOL(end_buffer_io_async);
EXPORT_SYMBOL(__mark_buffer_dirty);
EXPORT_SYMBOL(__mark_inode_dirty);
EXPORT_SYMBOL(fd_install);
EXPORT_SYMBOL(get_empty_filp);
EXPORT_SYMBOL(init_private_file);
EXPORT_SYMBOL(filp_open);
EXPORT_SYMBOL(filp_close);
EXPORT_SYMBOL(put_filp);
EXPORT_SYMBOL(files_lock);
EXPORT_SYMBOL(check_disk_change);
EXPORT_SYMBOL(__invalidate_buffers);
EXPORT_SYMBOL(invalidate_bdev);
EXPORT_SYMBOL(invalidate_inodes);
EXPORT_SYMBOL(invalidate_device);
EXPORT_SYMBOL(invalidate_inode_pages);
EXPORT_SYMBOL(truncate_inode_pages);
EXPORT_SYMBOL(fsync_dev);
EXPORT_SYMBOL(fsync_no_super);
EXPORT_SYMBOL(permission);
EXPORT_SYMBOL(vfs_permission);
EXPORT_SYMBOL(inode_setattr);
EXPORT_SYMBOL(inode_change_ok);
EXPORT_SYMBOL(write_inode_now);
EXPORT_SYMBOL(notify_change);
EXPORT_SYMBOL(set_blocksize);
EXPORT_SYMBOL(sb_set_blocksize);
EXPORT_SYMBOL(sb_min_blocksize);
EXPORT_SYMBOL(getblk);
EXPORT_SYMBOL(cdget);
EXPORT_SYMBOL(cdput);
EXPORT_SYMBOL(bdget);
EXPORT_SYMBOL(bdput);
EXPORT_SYMBOL(bread);
EXPORT_SYMBOL(__brelse);
EXPORT_SYMBOL(__bforget);
EXPORT_SYMBOL(ll_rw_block);
EXPORT_SYMBOL(submit_bh);
EXPORT_SYMBOL(unlock_buffer);
EXPORT_SYMBOL(__wait_on_buffer);
EXPORT_SYMBOL(___wait_on_page);
EXPORT_SYMBOL(generic_direct_IO);
EXPORT_SYMBOL(discard_bh_page);
EXPORT_SYMBOL(block_write_full_page);
EXPORT_SYMBOL(block_read_full_page);
EXPORT_SYMBOL(block_prepare_write);
EXPORT_SYMBOL(block_sync_page);
EXPORT_SYMBOL(generic_cont_expand);
EXPORT_SYMBOL(cont_prepare_write);
EXPORT_SYMBOL(generic_commit_write);
EXPORT_SYMBOL(block_truncate_page);
EXPORT_SYMBOL(generic_block_bmap);
EXPORT_SYMBOL(generic_file_read);
EXPORT_SYMBOL(do_generic_file_read);
EXPORT_SYMBOL(do_generic_file_write);
EXPORT_SYMBOL(do_generic_direct_read);
EXPORT_SYMBOL(do_generic_direct_write);
EXPORT_SYMBOL(generic_file_write);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_ro_fops);
EXPORT_SYMBOL(generic_buffer_fdatasync);
EXPORT_SYMBOL(page_hash_bits);
EXPORT_SYMBOL(page_hash_table);
EXPORT_SYMBOL(file_lock_list);
EXPORT_SYMBOL(locks_init_lock);
EXPORT_SYMBOL(locks_copy_lock);
EXPORT_SYMBOL(posix_lock_file);
EXPORT_SYMBOL(posix_test_lock);
EXPORT_SYMBOL(posix_block_lock);
EXPORT_SYMBOL(posix_unblock_lock);
EXPORT_SYMBOL(posix_locks_deadlock);
EXPORT_SYMBOL(locks_mandatory_area);
EXPORT_SYMBOL(dput);
EXPORT_SYMBOL(have_submounts);
EXPORT_SYMBOL(d_find_alias);
EXPORT_SYMBOL(d_prune_aliases);
EXPORT_SYMBOL(prune_dcache);
EXPORT_SYMBOL(shrink_dcache_sb);
EXPORT_SYMBOL(shrink_dcache_parent);
EXPORT_SYMBOL(find_inode_number);
EXPORT_SYMBOL(is_subdir);
EXPORT_SYMBOL(get_unused_fd);
EXPORT_SYMBOL(put_unused_fd);
EXPORT_SYMBOL(vfs_create);
EXPORT_SYMBOL(vfs_mkdir);
EXPORT_SYMBOL(vfs_mknod);
EXPORT_SYMBOL(vfs_symlink);
EXPORT_SYMBOL(vfs_link);
EXPORT_SYMBOL(vfs_rmdir);
EXPORT_SYMBOL(vfs_unlink);
EXPORT_SYMBOL(vfs_rename);
EXPORT_SYMBOL(vfs_statfs);
EXPORT_SYMBOL(generic_read_dir);
EXPORT_SYMBOL(generic_file_llseek);
EXPORT_SYMBOL(no_llseek);
EXPORT_SYMBOL(__pollwait);
EXPORT_SYMBOL(poll_freewait);
EXPORT_SYMBOL(ROOT_DEV);
EXPORT_SYMBOL(__find_get_page);
EXPORT_SYMBOL(__find_lock_page);
EXPORT_SYMBOL(find_trylock_page);
EXPORT_SYMBOL(find_or_create_page);
EXPORT_SYMBOL(grab_cache_page_nowait);
EXPORT_SYMBOL(read_cache_page);
EXPORT_SYMBOL(set_page_dirty);
EXPORT_SYMBOL(mark_page_accessed);
EXPORT_SYMBOL(vfs_readlink);
EXPORT_SYMBOL(vfs_follow_link);
EXPORT_SYMBOL(page_readlink);
EXPORT_SYMBOL(page_follow_link);
EXPORT_SYMBOL(page_symlink_inode_operations);
EXPORT_SYMBOL(block_symlink);
EXPORT_SYMBOL(vfs_readdir);
EXPORT_SYMBOL(__get_lease);
EXPORT_SYMBOL(lease_get_mtime);
EXPORT_SYMBOL(lock_may_read);
EXPORT_SYMBOL(lock_may_write);
EXPORT_SYMBOL(dcache_dir_open);
EXPORT_SYMBOL(dcache_dir_close);
EXPORT_SYMBOL(dcache_dir_lseek);
EXPORT_SYMBOL(dcache_dir_fsync);
EXPORT_SYMBOL(dcache_readdir);
EXPORT_SYMBOL(dcache_dir_ops);
 
/* for stackable file systems (lofs, wrapfs, cryptfs, etc.) */
EXPORT_SYMBOL(default_llseek);
EXPORT_SYMBOL(dentry_open);
EXPORT_SYMBOL(filemap_nopage);
EXPORT_SYMBOL(filemap_sync);
EXPORT_SYMBOL(filemap_fdatawrite);
EXPORT_SYMBOL(filemap_fdatasync);
EXPORT_SYMBOL(filemap_fdatawait);
EXPORT_SYMBOL(lock_page);
EXPORT_SYMBOL(unlock_page);
EXPORT_SYMBOL(wakeup_page_waiters);
 
/* device registration */
EXPORT_SYMBOL(register_chrdev);
EXPORT_SYMBOL(unregister_chrdev);
EXPORT_SYMBOL(register_blkdev);
EXPORT_SYMBOL(unregister_blkdev);
EXPORT_SYMBOL(tty_register_driver);
EXPORT_SYMBOL(tty_unregister_driver);
EXPORT_SYMBOL(tty_std_termios);
 
/* block device driver support */
EXPORT_SYMBOL(blksize_size);
EXPORT_SYMBOL(hardsect_size);
EXPORT_SYMBOL(blk_size);
EXPORT_SYMBOL(blk_dev);
EXPORT_SYMBOL(is_read_only);
EXPORT_SYMBOL(set_device_ro);
EXPORT_SYMBOL(bmap);
EXPORT_SYMBOL(sync_dev);
EXPORT_SYMBOL(devfs_register_partitions);
EXPORT_SYMBOL(blkdev_open);
EXPORT_SYMBOL(blkdev_get);
EXPORT_SYMBOL(blkdev_put);
EXPORT_SYMBOL(ioctl_by_bdev);
EXPORT_SYMBOL(grok_partitions);
EXPORT_SYMBOL(register_disk);
EXPORT_SYMBOL(tq_disk);
EXPORT_SYMBOL(init_buffer);
EXPORT_SYMBOL(refile_buffer);
EXPORT_SYMBOL(max_sectors);
EXPORT_SYMBOL(max_readahead);
 
/* tty routines */
EXPORT_SYMBOL(tty_hangup);
EXPORT_SYMBOL(tty_wait_until_sent);
EXPORT_SYMBOL(tty_check_change);
EXPORT_SYMBOL(tty_hung_up_p);
EXPORT_SYMBOL(tty_flip_buffer_push);
EXPORT_SYMBOL(tty_get_baud_rate);
EXPORT_SYMBOL(do_SAK);
 
/* filesystem registration */
EXPORT_SYMBOL(register_filesystem);
EXPORT_SYMBOL(unregister_filesystem);
EXPORT_SYMBOL(kern_mount);
EXPORT_SYMBOL(__mntput);
EXPORT_SYMBOL(may_umount);
 
/* executable format registration */
EXPORT_SYMBOL(register_binfmt);
EXPORT_SYMBOL(unregister_binfmt);
EXPORT_SYMBOL(search_binary_handler);
EXPORT_SYMBOL(prepare_binprm);
EXPORT_SYMBOL(compute_creds);
EXPORT_SYMBOL(remove_arg_zero);
EXPORT_SYMBOL(set_binfmt);
 
/* sysctl table registration */
EXPORT_SYMBOL(register_sysctl_table);
EXPORT_SYMBOL(unregister_sysctl_table);
EXPORT_SYMBOL(sysctl_string);
EXPORT_SYMBOL(sysctl_intvec);
EXPORT_SYMBOL(sysctl_jiffies);
EXPORT_SYMBOL(proc_dostring);
EXPORT_SYMBOL(proc_dointvec);
EXPORT_SYMBOL(proc_dointvec_jiffies);
EXPORT_SYMBOL(proc_dointvec_minmax);
EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
EXPORT_SYMBOL(proc_doulongvec_minmax);
 
/* interrupt handling */
EXPORT_SYMBOL(add_timer);
EXPORT_SYMBOL(del_timer);
EXPORT_SYMBOL(request_irq);
EXPORT_SYMBOL(free_irq);
#if !defined(CONFIG_IA64) /* irq_stat is part of struct cpuinfo_ia64 */
EXPORT_SYMBOL(irq_stat);
#endif
 
/* waitqueue handling */
EXPORT_SYMBOL(add_wait_queue);
EXPORT_SYMBOL(add_wait_queue_exclusive);
EXPORT_SYMBOL(remove_wait_queue);
 
/* completion handling */
EXPORT_SYMBOL(wait_for_completion);
EXPORT_SYMBOL(complete);
 
/* The notion of irq probe/assignment is foreign to S/390 */
 
#if !defined(CONFIG_ARCH_S390)
EXPORT_SYMBOL(probe_irq_on);
EXPORT_SYMBOL(probe_irq_off);
#endif
 
#ifdef CONFIG_SMP
EXPORT_SYMBOL(del_timer_sync);
#endif
EXPORT_SYMBOL(mod_timer);
EXPORT_SYMBOL(tq_timer);
EXPORT_SYMBOL(tq_immediate);
 
#ifdef CONFIG_SMP
/* Various random spinlocks we want to export */
EXPORT_SYMBOL(tqueue_lock);
 
/* Big-Reader lock implementation */
EXPORT_SYMBOL(__brlock_array);
#ifndef __BRLOCK_USE_ATOMICS
EXPORT_SYMBOL(__br_write_locks);
#endif
EXPORT_SYMBOL(__br_write_lock);
EXPORT_SYMBOL(__br_write_unlock);
#endif
 
/* Kiobufs */
EXPORT_SYMBOL(alloc_kiovec);
EXPORT_SYMBOL(free_kiovec);
EXPORT_SYMBOL(expand_kiobuf);
 
EXPORT_SYMBOL(map_user_kiobuf);
EXPORT_SYMBOL(unmap_kiobuf);
EXPORT_SYMBOL(lock_kiovec);
EXPORT_SYMBOL(unlock_kiovec);
EXPORT_SYMBOL(brw_kiovec);
EXPORT_SYMBOL(kiobuf_wait_for_io);
 
/* dma handling */
EXPORT_SYMBOL(request_dma);
EXPORT_SYMBOL(free_dma);
EXPORT_SYMBOL(dma_spin_lock);
#ifdef HAVE_DISABLE_HLT
EXPORT_SYMBOL(disable_hlt);
EXPORT_SYMBOL(enable_hlt);
#endif
 
/* resource handling */
EXPORT_SYMBOL(request_resource);
EXPORT_SYMBOL(release_resource);
EXPORT_SYMBOL(allocate_resource);
EXPORT_SYMBOL(check_resource);
EXPORT_SYMBOL(__request_region);
EXPORT_SYMBOL(__check_region);
EXPORT_SYMBOL(__release_region);
EXPORT_SYMBOL(ioport_resource);
EXPORT_SYMBOL(iomem_resource);
 
/* process management */
EXPORT_SYMBOL(complete_and_exit);
EXPORT_SYMBOL(__wake_up);
EXPORT_SYMBOL(__wake_up_sync);
EXPORT_SYMBOL(wake_up_process);
EXPORT_SYMBOL(sleep_on);
EXPORT_SYMBOL(sleep_on_timeout);
EXPORT_SYMBOL(interruptible_sleep_on);
EXPORT_SYMBOL(interruptible_sleep_on_timeout);
EXPORT_SYMBOL(schedule);
EXPORT_SYMBOL(schedule_timeout);
#if CONFIG_SMP
EXPORT_SYMBOL(set_cpus_allowed);
#endif
EXPORT_SYMBOL(yield);
EXPORT_SYMBOL(__cond_resched);
EXPORT_SYMBOL(jiffies);
EXPORT_SYMBOL(xtime);
EXPORT_SYMBOL(do_gettimeofday);
EXPORT_SYMBOL(do_settimeofday);
 
#if !defined(__ia64__)
EXPORT_SYMBOL(loops_per_jiffy);
#endif
 
EXPORT_SYMBOL(kstat);
EXPORT_SYMBOL(nr_running);
 
/* misc */
EXPORT_SYMBOL(panic);
EXPORT_SYMBOL(panic_notifier_list);
EXPORT_SYMBOL(panic_timeout);
EXPORT_SYMBOL(__out_of_line_bug);
EXPORT_SYMBOL(sprintf);
EXPORT_SYMBOL(snprintf);
EXPORT_SYMBOL(sscanf);
EXPORT_SYMBOL(vsprintf);
EXPORT_SYMBOL(vsnprintf);
EXPORT_SYMBOL(vsscanf);
EXPORT_SYMBOL(kdevname);
EXPORT_SYMBOL(bdevname);
EXPORT_SYMBOL(cdevname);
EXPORT_SYMBOL(simple_strtol);
EXPORT_SYMBOL(simple_strtoul);
EXPORT_SYMBOL(simple_strtoull);
EXPORT_SYMBOL(system_utsname); /* UTS data */
EXPORT_SYMBOL(uts_sem); /* UTS semaphore */
#ifndef __mips__
EXPORT_SYMBOL(sys_call_table);
#endif
EXPORT_SYMBOL(machine_restart);
EXPORT_SYMBOL(machine_halt);
EXPORT_SYMBOL(machine_power_off);
EXPORT_SYMBOL(_ctype);
EXPORT_SYMBOL(secure_tcp_sequence_number);
EXPORT_SYMBOL(get_random_bytes);
EXPORT_SYMBOL(securebits);
EXPORT_SYMBOL(cap_bset);
EXPORT_SYMBOL(reparent_to_init);
EXPORT_SYMBOL(daemonize);
EXPORT_SYMBOL(csum_partial); /* for networking and md */
EXPORT_SYMBOL(seq_escape);
EXPORT_SYMBOL(seq_printf);
EXPORT_SYMBOL(seq_open);
EXPORT_SYMBOL(seq_release);
EXPORT_SYMBOL(seq_read);
EXPORT_SYMBOL(seq_lseek);
EXPORT_SYMBOL(single_open);
EXPORT_SYMBOL(single_release);
EXPORT_SYMBOL(seq_release_private);
 
/* Program loader interfaces */
EXPORT_SYMBOL(setup_arg_pages);
EXPORT_SYMBOL(copy_strings_kernel);
EXPORT_SYMBOL(do_execve);
EXPORT_SYMBOL(flush_old_exec);
EXPORT_SYMBOL(kernel_read);
EXPORT_SYMBOL(open_exec);
 
/* Miscellaneous access points */
EXPORT_SYMBOL(si_meminfo);
 
/* Added to make file system as module */
EXPORT_SYMBOL(sys_tz);
EXPORT_SYMBOL(file_fsync);
EXPORT_SYMBOL(fsync_buffers_list);
EXPORT_SYMBOL(clear_inode);
EXPORT_SYMBOL(___strtok);
EXPORT_SYMBOL(init_special_inode);
EXPORT_SYMBOL(read_ahead);
EXPORT_SYMBOL(get_hash_table);
EXPORT_SYMBOL(new_inode);
EXPORT_SYMBOL(insert_inode_hash);
EXPORT_SYMBOL(remove_inode_hash);
EXPORT_SYMBOL(buffer_insert_list);
EXPORT_SYMBOL(make_bad_inode);
EXPORT_SYMBOL(is_bad_inode);
EXPORT_SYMBOL(event);
EXPORT_SYMBOL(brw_page);
EXPORT_SYMBOL(__inode_dir_notify);
 
#ifdef CONFIG_UID16
EXPORT_SYMBOL(overflowuid);
EXPORT_SYMBOL(overflowgid);
#endif
EXPORT_SYMBOL(fs_overflowuid);
EXPORT_SYMBOL(fs_overflowgid);
 
/* all busmice */
EXPORT_SYMBOL(fasync_helper);
EXPORT_SYMBOL(kill_fasync);
 
EXPORT_SYMBOL(disk_name); /* for md.c */
 
/* binfmt_aout */
EXPORT_SYMBOL(get_write_access);
 
/* library functions */
EXPORT_SYMBOL(strnicmp);
EXPORT_SYMBOL(strspn);
EXPORT_SYMBOL(strsep);
 
#ifdef CONFIG_CRC32
EXPORT_SYMBOL(crc32_le);
EXPORT_SYMBOL(crc32_be);
EXPORT_SYMBOL(bitreverse);
#endif
 
#ifdef CONFIG_FW_LOADER
EXPORT_SYMBOL(release_firmware);
EXPORT_SYMBOL(request_firmware);
EXPORT_SYMBOL(request_firmware_nowait);
EXPORT_SYMBOL(register_firmware);
#endif
 
/* software interrupts */
EXPORT_SYMBOL(tasklet_hi_vec);
EXPORT_SYMBOL(tasklet_vec);
EXPORT_SYMBOL(bh_task_vec);
EXPORT_SYMBOL(init_bh);
EXPORT_SYMBOL(remove_bh);
EXPORT_SYMBOL(tasklet_init);
EXPORT_SYMBOL(tasklet_kill);
EXPORT_SYMBOL(__run_task_queue);
EXPORT_SYMBOL(do_softirq);
EXPORT_SYMBOL(raise_softirq);
EXPORT_SYMBOL(cpu_raise_softirq);
EXPORT_SYMBOL(__tasklet_schedule);
EXPORT_SYMBOL(__tasklet_hi_schedule);
 
/* init task, for moving kthread roots - ought to export a function ?? */
 
EXPORT_SYMBOL(init_task_union);
 
EXPORT_SYMBOL(tasklist_lock);
EXPORT_SYMBOL(pidhash);
EXPORT_SYMBOL(unshare_files);
 
/* debug */
EXPORT_SYMBOL(dump_stack);
 
/* To match ksyms with System.map */
extern const char _end[];
EXPORT_SYMBOL(_end);
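/*
 * Illustrative sketch (not part of the original file): a minimal loadable
 * module that consumes symbols exported above (printk, jiffies) and exports
 * one of its own for later modules. All "example_*" names are hypothetical.
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>	/* jiffies */

static int example_count;

int example_get_count(void)
{
	return example_count;
}
EXPORT_SYMBOL(example_get_count);

static int __init example_init(void)
{
	example_count++;
	printk(KERN_INFO "example: loaded at jiffies=%lu\n", jiffies);
	return 0;
}

static void __exit example_exit(void)
{
	printk(KERN_INFO "example: unloaded\n");
}

module_init(example_init);
module_exit(example_exit);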
/printk.c
0,0 → 1,698
/*
* linux/kernel/printk.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* Modified to make sys_syslog() more flexible: added commands to
* return the last 4k of kernel messages, regardless of whether
* they've been read or not. Added option to suppress kernel printk's
* to the console. Added hook for sending the console messages
* elsewhere, in preparation for a serial line console (someday).
* Ted Ts'o, 2/11/93.
* Modified for sysctl support, 1/8/97, Chris Horn.
* Fixed SMP synchronization, 08/08/99, Manfred Spraul
* manfreds@colorfullife.com
* Rewrote bits to get rid of console_lock
* 01Mar01 Andrew Morton <andrewm@uow.edu.au>
*/
 
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/tty.h>
#include <linux/tty_driver.h>
#include <linux/smp_lock.h>
#include <linux/console.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/interrupt.h> /* For in_interrupt() */
#include <linux/config.h>
 
#include <asm/uaccess.h>
 
#if !defined(CONFIG_LOG_BUF_SHIFT) || (CONFIG_LOG_BUF_SHIFT == 0)
#if defined(CONFIG_MULTIQUAD) || defined(CONFIG_IA64)
#define LOG_BUF_LEN (65536)
#elif defined(CONFIG_ARCH_S390)
#define LOG_BUF_LEN (131072)
#elif defined(CONFIG_SMP)
#define LOG_BUF_LEN (32768)
#else
#define LOG_BUF_LEN (16384) /* This must be a power of two */
#endif
#else /* CONFIG_LOG_BUF_SHIFT */
#define LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
#endif
 
#define LOG_BUF_MASK (LOG_BUF_LEN-1)
 
#ifndef arch_consoles_callable
#define arch_consoles_callable() (1)
#endif
 
/* printk's without a loglevel use this.. */
#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */
 
/* We show everything that is MORE important than this.. */
#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
 
DECLARE_WAIT_QUEUE_HEAD(log_wait);
 
int console_printk[4] = {
DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */
DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */
DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
};
 
int oops_in_progress;
 
/*
* console_sem protects the console_drivers list, and also
* provides serialisation for access to the entire console
* driver system.
*/
static DECLARE_MUTEX(console_sem);
struct console *console_drivers;
 
/*
* logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
* It is also used in interesting ways to provide interlocking in
* release_console_sem().
*/
static spinlock_t logbuf_lock = SPIN_LOCK_UNLOCKED;
 
static char log_buf[LOG_BUF_LEN];
#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
 
/*
* The indices into log_buf are not constrained to LOG_BUF_LEN - they
* must be masked before subscripting
*/
static unsigned long log_start; /* Index into log_buf: next char to be read by syslog() */
static unsigned long con_start; /* Index into log_buf: next char to be sent to consoles */
static unsigned long log_end; /* Index into log_buf: most-recently-written-char + 1 */
static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */
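/*
 * Worked example (illustrative, not part of the original file): with
 * LOG_BUF_LEN == 16384 the indices above grow without bound; index 20000
 * addresses LOG_BUF(20000) == log_buf[20000 & 16383] == log_buf[3616], and
 * "log_end - log_start" stays equal to the number of unread characters even
 * after the buffer wraps.
 */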
 
struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
static int preferred_console = -1;
 
/* Flag: console code may call schedule() */
static int console_may_schedule;
 
/*
* Setup a list of consoles. Called from init/main.c
*/
static int __init console_setup(char *str)
{
struct console_cmdline *c;
char name[sizeof(c->name)];
char *s, *options;
int i, idx;
 
/*
* Decode str into name, index, options.
*/
if (str[0] >= '0' && str[0] <= '9') {
strcpy(name, "ttyS");
strncpy(name + 4, str, sizeof(name) - 5);
} else
strncpy(name, str, sizeof(name) - 1);
name[sizeof(name) - 1] = 0;
if ((options = strchr(str, ',')) != NULL)
*(options++) = 0;
#ifdef __sparc__
if (!strcmp(str, "ttya"))
strcpy(name, "ttyS0");
if (!strcmp(str, "ttyb"))
strcpy(name, "ttyS1");
#endif
for(s = name; *s; s++)
if (*s >= '0' && *s <= '9')
break;
idx = simple_strtoul(s, NULL, 10);
*s = 0;
 
/*
* See if this tty is not yet registered, and
* if we have a slot free.
*/
for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
if (strcmp(console_cmdline[i].name, name) == 0 &&
console_cmdline[i].index == idx) {
preferred_console = i;
return 1;
}
if (i == MAX_CMDLINECONSOLES)
return 1;
preferred_console = i;
c = &console_cmdline[i];
memcpy(c->name, name, sizeof(c->name));
c->options = options;
c->index = idx;
return 1;
}
 
__setup("console=", console_setup);
 
/*
* Commands to do_syslog:
*
* 0 -- Close the log. Currently a NOP.
* 1 -- Open the log. Currently a NOP.
* 2 -- Read from the log.
* 3 -- Read all messages remaining in the ring buffer.
* 4 -- Read and clear all messages remaining in the ring buffer
* 5 -- Clear ring buffer.
* 6 -- Disable printk's to console
* 7 -- Enable printk's to console
* 8 -- Set level of messages printed to console
* 9 -- Return number of unread characters in the log buffer
*/
int do_syslog(int type, char * buf, int len)
{
unsigned long i, j, limit, count;
int do_clear = 0;
char c;
int error = 0;
 
switch (type) {
case 0: /* Close log */
break;
case 1: /* Open log */
break;
case 2: /* Read from log */
error = -EINVAL;
if (!buf || len < 0)
goto out;
error = 0;
if (!len)
goto out;
error = verify_area(VERIFY_WRITE,buf,len);
if (error)
goto out;
error = wait_event_interruptible(log_wait, (log_start - log_end));
if (error)
goto out;
i = 0;
spin_lock_irq(&logbuf_lock);
while ((log_start != log_end) && i < len) {
c = LOG_BUF(log_start);
log_start++;
spin_unlock_irq(&logbuf_lock);
__put_user(c,buf);
buf++;
i++;
spin_lock_irq(&logbuf_lock);
}
spin_unlock_irq(&logbuf_lock);
error = i;
break;
case 4: /* Read/clear last kernel messages */
do_clear = 1;
/* FALL THRU */
case 3: /* Read last kernel messages */
error = -EINVAL;
if (!buf || len < 0)
goto out;
error = 0;
if (!len)
goto out;
error = verify_area(VERIFY_WRITE,buf,len);
if (error)
goto out;
count = len;
if (count > LOG_BUF_LEN)
count = LOG_BUF_LEN;
spin_lock_irq(&logbuf_lock);
if (count > logged_chars)
count = logged_chars;
if (do_clear)
logged_chars = 0;
limit = log_end;
/*
* __put_user() could sleep, and while we sleep
* printk() could overwrite the messages
* we try to copy to user space. Therefore
* the messages are copied in reverse. <manfreds>
*/
for(i=0;i < count;i++) {
j = limit-1-i;
if (j+LOG_BUF_LEN < log_end)
break;
c = LOG_BUF(j);
spin_unlock_irq(&logbuf_lock);
__put_user(c,&buf[count-1-i]);
spin_lock_irq(&logbuf_lock);
}
spin_unlock_irq(&logbuf_lock);
error = i;
if(i != count) {
int offset = count-error;
/* buffer overflow during copy, correct user buffer. */
for(i=0;i<error;i++) {
__get_user(c,&buf[i+offset]);
__put_user(c,&buf[i]);
}
}
 
break;
case 5: /* Clear ring buffer */
spin_lock_irq(&logbuf_lock);
logged_chars = 0;
spin_unlock_irq(&logbuf_lock);
break;
case 6: /* Disable logging to console */
spin_lock_irq(&logbuf_lock);
console_loglevel = minimum_console_loglevel;
spin_unlock_irq(&logbuf_lock);
break;
case 7: /* Enable logging to console */
spin_lock_irq(&logbuf_lock);
console_loglevel = default_console_loglevel;
spin_unlock_irq(&logbuf_lock);
break;
case 8: /* Set level of messages printed to console */
error = -EINVAL;
if (len < 1 || len > 8)
goto out;
if (len < minimum_console_loglevel)
len = minimum_console_loglevel;
spin_lock_irq(&logbuf_lock);
console_loglevel = len;
spin_unlock_irq(&logbuf_lock);
error = 0;
break;
case 9: /* Number of chars in the log buffer */
spin_lock_irq(&logbuf_lock);
error = log_end - log_start;
spin_unlock_irq(&logbuf_lock);
break;
default:
error = -EINVAL;
break;
}
out:
return error;
}
 
asmlinkage long sys_syslog(int type, char * buf, int len)
{
if ((type != 3) && !capable(CAP_SYS_ADMIN))
return -EPERM;
return do_syslog(type, buf, len);
}
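
/*
* Illustrative user-space sketch (not part of this file): the command
* numbers documented above as seen through glibc's klogctl() wrapper
* for syslog(2). The 4 KB buffer is an arbitrary choice for the example.
*/
#include <stdio.h>
#include <sys/klog.h>

int dump_ring_buffer(void)
{
	char buf[4096];
	int n;

	n = klogctl(3, buf, sizeof(buf));	/* type 3: read all messages in the ring */
	if (n < 0)
		return -1;
	fwrite(buf, 1, n, stdout);
	klogctl(8, NULL, 7);			/* type 8: set console loglevel to 7 */
	return 0;
}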
 
/*
* Call the console drivers on a range of log_buf
*/
static void __call_console_drivers(unsigned long start, unsigned long end)
{
struct console *con;
 
for (con = console_drivers; con; con = con->next) {
if ((con->flags & CON_ENABLED) && con->write)
con->write(con, &LOG_BUF(start), end - start);
}
}
 
/*
* Write out chars from start to end - 1 inclusive
*/
static void _call_console_drivers(unsigned long start, unsigned long end, int msg_log_level)
{
if (msg_log_level < console_loglevel && console_drivers && start != end) {
if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
/* wrapped write */
__call_console_drivers(start & LOG_BUF_MASK, LOG_BUF_LEN);
__call_console_drivers(0, end & LOG_BUF_MASK);
} else {
__call_console_drivers(start, end);
}
}
}
 
/*
* Call the console drivers, asking them to write out
* log_buf[start] to log_buf[end - 1].
* The console_sem must be held.
*/
static void call_console_drivers(unsigned long start, unsigned long end)
{
unsigned long cur_index, start_print;
static int msg_level = -1;
 
if (((long)(start - end)) > 0)
BUG();
 
cur_index = start;
start_print = start;
while (cur_index != end) {
if ( msg_level < 0 &&
((end - cur_index) > 2) &&
LOG_BUF(cur_index + 0) == '<' &&
LOG_BUF(cur_index + 1) >= '0' &&
LOG_BUF(cur_index + 1) <= '7' &&
LOG_BUF(cur_index + 2) == '>')
{
msg_level = LOG_BUF(cur_index + 1) - '0';
cur_index += 3;
start_print = cur_index;
}
while (cur_index != end) {
char c = LOG_BUF(cur_index);
cur_index++;
 
if (c == '\n') {
if (msg_level < 0) {
/*
* printk() has already given us loglevel tags in
* the buffer. This code is here in case the
* log buffer has wrapped right round and scribbled
* on those tags
*/
msg_level = default_message_loglevel;
}
_call_console_drivers(start_print, cur_index, msg_level);
msg_level = -1;
start_print = cur_index;
break;
}
}
}
_call_console_drivers(start_print, end, msg_level);
}
 
static void emit_log_char(char c)
{
LOG_BUF(log_end) = c;
log_end++;
if (log_end - log_start > LOG_BUF_LEN)
log_start = log_end - LOG_BUF_LEN;
if (log_end - con_start > LOG_BUF_LEN)
con_start = log_end - LOG_BUF_LEN;
if (logged_chars < LOG_BUF_LEN)
logged_chars++;
}
 
/*
* This is printk. It can be called from any context. We want it to work.
*
* We try to grab the console_sem. If we succeed, it's easy - we log the output and
* call the console drivers. If we fail to get the semaphore we place the output
* into the log buffer and return. The current holder of the console_sem will
* notice the new output in release_console_sem() and will send it to the
* consoles before releasing the semaphore.
*
* One effect of this deferred printing is that code which calls printk() and
* then changes console_loglevel may break. This is because console_loglevel
* is inspected when the actual printing occurs.
*/
asmlinkage int printk(const char *fmt, ...)
{
va_list args;
unsigned long flags;
int printed_len;
char *p;
static char printk_buf[1024];
static int log_level_unknown = 1;
 
if (oops_in_progress) {
/* If a crash is occurring, make sure we can't deadlock */
spin_lock_init(&logbuf_lock);
/* And make sure that we print immediately */
init_MUTEX(&console_sem);
}
 
/* This stops the holder of console_sem just where we want him */
spin_lock_irqsave(&logbuf_lock, flags);
 
/* Emit the output into the temporary buffer */
va_start(args, fmt);
printed_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args);
va_end(args);
 
/*
* Copy the output into log_buf. If the caller didn't provide
* appropriate log level tags, we insert them here
*/
for (p = printk_buf; *p; p++) {
if (log_level_unknown) {
if (p[0] != '<' || p[1] < '0' || p[1] > '7' || p[2] != '>') {
emit_log_char('<');
emit_log_char(default_message_loglevel + '0');
emit_log_char('>');
}
log_level_unknown = 0;
}
emit_log_char(*p);
if (*p == '\n')
log_level_unknown = 1;
}
 
if (!arch_consoles_callable()) {
/*
* On some architectures, the consoles are not usable
* on secondary CPUs early in the boot process.
*/
spin_unlock_irqrestore(&logbuf_lock, flags);
goto out;
}
if (!down_trylock(&console_sem)) {
/*
* We own the drivers. We can drop the spinlock and let
* release_console_sem() print the text
*/
spin_unlock_irqrestore(&logbuf_lock, flags);
console_may_schedule = 0;
release_console_sem();
} else {
/*
* Someone else owns the drivers. We drop the spinlock, which
* allows the semaphore holder to proceed and to call the
* console drivers with the output which we just produced.
*/
spin_unlock_irqrestore(&logbuf_lock, flags);
}
out:
return printed_len;
}
EXPORT_SYMBOL(printk);
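
/*
* Illustrative only (not from the original source): how callers normally
* tag their messages. KERN_WARNING and friends expand to "<N>" prefixes,
* which the copy loop above would otherwise insert using
* default_message_loglevel.
*/
static void __attribute__((unused)) printk_tag_example(void)
{
	printk(KERN_WARNING "example: retrying sector %lu\n", 12345UL);
	/* untagged: a "<4>" (default_message_loglevel) prefix gets inserted */
	printk("example: untagged message, default level applied\n");
}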
 
/**
* acquire_console_sem - lock the console system for exclusive use.
*
* Acquires a semaphore which guarantees that the caller has
* exclusive access to the console system and the console_drivers list.
*
* Can sleep, returns nothing.
*/
void acquire_console_sem(void)
{
if (in_interrupt())
BUG();
down(&console_sem);
console_may_schedule = 1;
}
EXPORT_SYMBOL(acquire_console_sem);
 
/**
* release_console_sem - unlock the console system
*
* Releases the semaphore which the caller holds on the console system
* and the console driver list.
*
* While the semaphore was held, console output may have been buffered
* by printk(). If this is the case, release_console_sem() emits
* the output prior to releasing the semaphore.
*
* If there is output waiting for klogd, we wake it up.
*
* release_console_sem() may be called from any context.
*/
void release_console_sem(void)
{
unsigned long flags;
unsigned long _con_start, _log_end;
unsigned long must_wake_klogd = 0;
 
for ( ; ; ) {
spin_lock_irqsave(&logbuf_lock, flags);
must_wake_klogd |= log_start - log_end;
if (con_start == log_end)
break; /* Nothing to print */
_con_start = con_start;
_log_end = log_end;
con_start = log_end; /* Flush */
spin_unlock_irqrestore(&logbuf_lock, flags);
call_console_drivers(_con_start, _log_end);
}
console_may_schedule = 0;
up(&console_sem);
spin_unlock_irqrestore(&logbuf_lock, flags);
if (must_wake_klogd && !oops_in_progress)
wake_up_interruptible(&log_wait);
}
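
/*
* Illustrative only (not from the original source): the pattern the two
* helpers above are meant for - take the semaphore, walk or modify
* console state, then let release_console_sem() flush anything that
* printk() buffered in the meantime.
*/
static void __attribute__((unused)) console_walk_example(void)
{
	struct console *c;

	acquire_console_sem();
	for (c = console_drivers; c; c = c->next)
		printk(KERN_INFO "console %s%d enabled=%d\n",
			c->name, c->index, (c->flags & CON_ENABLED) != 0);
	release_console_sem();
}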
 
/** console_conditional_schedule - yield the CPU if required
*
* If the console code is currently allowed to sleep, and
* if this CPU should yield the CPU to another task, do
* so here.
*
* Must be called within acquire_console_sem().
*/
void console_conditional_schedule(void)
{
if (console_may_schedule && current->need_resched) {
set_current_state(TASK_RUNNING);
schedule();
}
}
 
void console_print(const char *s)
{
printk(KERN_EMERG "%s", s);
}
EXPORT_SYMBOL(console_print);
 
void console_unblank(void)
{
struct console *c;
 
/*
* Try to get the console semaphore. If someone else owns it
* we have to return without unblanking because console_unblank
* may be called in interrupt context.
*/
if (down_trylock(&console_sem) != 0)
return;
console_may_schedule = 0;
for (c = console_drivers; c != NULL; c = c->next)
if ((c->flags & CON_ENABLED) && c->unblank)
c->unblank();
release_console_sem();
}
EXPORT_SYMBOL(console_unblank);
 
/*
* The console driver calls this routine during kernel initialization
* to register the console printing procedure with printk() and to
* print any messages that were printed by the kernel before the
* console driver was initialized.
*/
void register_console(struct console * console)
{
int i;
unsigned long flags;
 
/*
* See if we want to use this console driver. If we
* didn't select a console we take the first one
* that registers here.
*/
if (preferred_console < 0) {
if (console->index < 0)
console->index = 0;
if (console->setup == NULL ||
console->setup(console, NULL) == 0) {
console->flags |= CON_ENABLED | CON_CONSDEV;
preferred_console = 0;
}
}
 
/*
* See if this console matches one we selected on
* the command line.
*/
for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) {
if (strcmp(console_cmdline[i].name, console->name) != 0)
continue;
if (console->index >= 0 &&
console->index != console_cmdline[i].index)
continue;
if (console->index < 0)
console->index = console_cmdline[i].index;
if (console->setup &&
console->setup(console, console_cmdline[i].options) != 0)
break;
console->flags |= CON_ENABLED;
console->index = console_cmdline[i].index;
if (i == preferred_console)
console->flags |= CON_CONSDEV;
break;
}
 
if (!(console->flags & CON_ENABLED))
return;
 
/*
* Put this console in the list - keep the
* preferred driver at the head of the list.
*/
acquire_console_sem();
if ((console->flags & CON_CONSDEV) || console_drivers == NULL) {
console->next = console_drivers;
console_drivers = console;
} else {
console->next = console_drivers->next;
console_drivers->next = console;
}
if (console->flags & CON_PRINTBUFFER) {
/*
* release_console_sem() will print out the buffered messages for us.
*/
spin_lock_irqsave(&logbuf_lock, flags);
con_start = log_start;
spin_unlock_irqrestore(&logbuf_lock, flags);
}
release_console_sem();
}
EXPORT_SYMBOL(register_console);
 
int unregister_console(struct console * console)
{
struct console *a,*b;
int res = 1;
 
acquire_console_sem();
if (console_drivers == console) {
console_drivers=console->next;
res = 0;
} else {
for (a=console_drivers->next, b=console_drivers ;
a; b=a, a=b->next) {
if (a == console) {
b->next = a->next;
res = 0;
break;
}
}
}
/* If last console is removed, we re-enable picking the first
* one that gets registered. Without that, pmac early boot console
* would prevent fbcon from taking over.
*/
if (console_drivers == NULL)
preferred_console = -1;
 
release_console_sem();
return res;
}
EXPORT_SYMBOL(unregister_console);
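
/*
* Illustrative only (not from the original source): the shape of a
* minimal console a driver might hand to register_console(). The name
* and the empty write hook are placeholders for real driver code.
*/
static void example_console_write(struct console *con, const char *s,
	unsigned count)
{
	/* a real driver would push 'count' bytes of 's' to its hardware */
}

static struct console example_console = {
	name:	"examp",
	write:	example_console_write,
	flags:	CON_PRINTBUFFER,
	index:	-1,
};
/* register_console(&example_console); -- typically from the driver's init code */
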
/**
* tty_write_message - write a message to a certain tty, not just the console.
*
* This is used for messages that need to be redirected to a specific tty.
* We don't put it into the syslog queue right now; maybe in the future if
* really needed.
*/
void tty_write_message(struct tty_struct *tty, char *msg)
{
if (tty && tty->driver.write)
tty->driver.write(tty, 0, msg, strlen(msg));
return;
}
/exit.c
0,0 → 1,601
/*
* linux/kernel/exit.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
 
#include <linux/config.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
#include <linux/namespace.h>
#ifdef CONFIG_BSD_PROCESS_ACCT
#include <linux/acct.h>
#endif
 
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
 
extern void sem_exit (void);
extern struct task_struct *child_reaper;
 
int getrusage(struct task_struct *, int, struct rusage *);
 
static void release_task(struct task_struct * p)
{
if (p != current) {
#ifdef CONFIG_SMP
/*
* Wait to make sure the process isn't on the
* runqueue (active on some other CPU still)
*/
for (;;) {
task_lock(p);
if (!task_has_cpu(p))
break;
task_unlock(p);
do {
cpu_relax();
barrier();
} while (task_has_cpu(p));
}
task_unlock(p);
#endif
atomic_dec(&p->user->processes);
free_uid(p->user);
unhash_process(p);
 
release_thread(p);
current->cmin_flt += p->min_flt + p->cmin_flt;
current->cmaj_flt += p->maj_flt + p->cmaj_flt;
current->cnswap += p->nswap + p->cnswap;
/*
* Potentially available timeslices are retrieved
* here - this way the parent does not get penalized
* for creating too many processes.
*
* (this cannot be used to artificially 'generate'
* timeslices, because any timeslice recovered here
* was given away by the parent in the first place.)
*/
current->counter += p->counter;
if (current->counter >= MAX_COUNTER)
current->counter = MAX_COUNTER;
p->pid = 0;
free_task_struct(p);
} else {
printk("task releasing itself\n");
}
}
 
/*
* This checks not only the pgrp, but falls back on the pid if no
* satisfactory pgrp is found. I dunno - gdb doesn't work correctly
* without this...
*/
int session_of_pgrp(int pgrp)
{
struct task_struct *p;
int fallback;
 
fallback = -1;
read_lock(&tasklist_lock);
for_each_task(p) {
if (p->session <= 0)
continue;
if (p->pgrp == pgrp) {
fallback = p->session;
break;
}
if (p->pid == pgrp)
fallback = p->session;
}
read_unlock(&tasklist_lock);
return fallback;
}
 
/*
* Determine if a process group is "orphaned", according to the POSIX
* definition in 2.2.2.52. Orphaned process groups are not to be affected
* by terminal-generated stop signals. Newly orphaned process groups are
* to receive a SIGHUP and a SIGCONT.
*
* "I ask you, have you ever known what it is to be an orphan?"
*/
static int will_become_orphaned_pgrp(int pgrp, struct task_struct * ignored_task)
{
struct task_struct *p;
 
read_lock(&tasklist_lock);
for_each_task(p) {
if ((p == ignored_task) || (p->pgrp != pgrp) ||
(p->state == TASK_ZOMBIE) ||
(p->p_pptr->pid == 1))
continue;
if ((p->p_pptr->pgrp != pgrp) &&
(p->p_pptr->session == p->session)) {
read_unlock(&tasklist_lock);
return 0;
}
}
read_unlock(&tasklist_lock);
return 1; /* (sighing) "Often!" */
}
 
int is_orphaned_pgrp(int pgrp)
{
return will_become_orphaned_pgrp(pgrp, 0);
}
 
static inline int has_stopped_jobs(int pgrp)
{
int retval = 0;
struct task_struct * p;
 
read_lock(&tasklist_lock);
for_each_task(p) {
if (p->pgrp != pgrp)
continue;
if (p->state != TASK_STOPPED)
continue;
retval = 1;
break;
}
read_unlock(&tasklist_lock);
return retval;
}
 
/*
* When we die, we re-parent all our children.
* Try to give them to another thread in our thread
* group, and if no such member exists, give it to
* the global child reaper process (ie "init")
*/
static inline void forget_original_parent(struct task_struct * father)
{
struct task_struct * p;
 
read_lock(&tasklist_lock);
 
for_each_task(p) {
if (p->p_opptr == father) {
/* We don't want people slaying init */
p->exit_signal = SIGCHLD;
p->self_exec_id++;
 
/* Make sure we're not reparenting to ourselves */
p->p_opptr = child_reaper;
 
if (p->pdeath_signal) send_sig(p->pdeath_signal, p, 0);
}
}
read_unlock(&tasklist_lock);
}
 
static inline void close_files(struct files_struct * files)
{
int i, j;
 
j = 0;
for (;;) {
unsigned long set;
i = j * __NFDBITS;
if (i >= files->max_fdset || i >= files->max_fds)
break;
set = files->open_fds->fds_bits[j++];
while (set) {
if (set & 1) {
struct file * file = xchg(&files->fd[i], NULL);
if (file)
filp_close(file, files);
}
i++;
set >>= 1;
}
}
}
 
void put_files_struct(struct files_struct *files)
{
if (atomic_dec_and_test(&files->count)) {
close_files(files);
/*
* Free the fd and fdset arrays if we expanded them.
*/
if (files->fd != &files->fd_array[0])
free_fd_array(files->fd, files->max_fds);
if (files->max_fdset > __FD_SETSIZE) {
free_fdset(files->open_fds, files->max_fdset);
free_fdset(files->close_on_exec, files->max_fdset);
}
kmem_cache_free(files_cachep, files);
}
}
 
static inline void __exit_files(struct task_struct *tsk)
{
struct files_struct * files = tsk->files;
 
if (files) {
task_lock(tsk);
tsk->files = NULL;
task_unlock(tsk);
put_files_struct(files);
}
}
 
void exit_files(struct task_struct *tsk)
{
__exit_files(tsk);
}
 
static inline void __put_fs_struct(struct fs_struct *fs)
{
/* No need to hold fs->lock if we are killing it */
if (atomic_dec_and_test(&fs->count)) {
dput(fs->root);
mntput(fs->rootmnt);
dput(fs->pwd);
mntput(fs->pwdmnt);
if (fs->altroot) {
dput(fs->altroot);
mntput(fs->altrootmnt);
}
kmem_cache_free(fs_cachep, fs);
}
}
 
void put_fs_struct(struct fs_struct *fs)
{
__put_fs_struct(fs);
}
 
static inline void __exit_fs(struct task_struct *tsk)
{
struct fs_struct * fs = tsk->fs;
 
if (fs) {
task_lock(tsk);
tsk->fs = NULL;
task_unlock(tsk);
__put_fs_struct(fs);
}
}
 
void exit_fs(struct task_struct *tsk)
{
__exit_fs(tsk);
}
 
/*
* We can use these to temporarily drop into
* "lazy TLB" mode and back.
*/
struct mm_struct * start_lazy_tlb(void)
{
struct mm_struct *mm = current->mm;
current->mm = NULL;
/* active_mm is still 'mm' */
atomic_inc(&mm->mm_count);
enter_lazy_tlb(mm, current, smp_processor_id());
return mm;
}
 
void end_lazy_tlb(struct mm_struct *mm)
{
struct mm_struct *active_mm = current->active_mm;
 
current->mm = mm;
if (mm != active_mm) {
current->active_mm = mm;
activate_mm(active_mm, mm);
}
mmdrop(active_mm);
}
 
/*
* Turn us into a lazy TLB process if we
* aren't already..
*/
static inline void __exit_mm(struct task_struct * tsk)
{
struct mm_struct * mm = tsk->mm;
 
mm_release();
if (mm) {
atomic_inc(&mm->mm_count);
BUG_ON(mm != tsk->active_mm);
/* more a memory barrier than a real lock */
task_lock(tsk);
tsk->mm = NULL;
task_unlock(tsk);
enter_lazy_tlb(mm, current, smp_processor_id());
mmput(mm);
}
}
 
void exit_mm(struct task_struct *tsk)
{
__exit_mm(tsk);
}
 
/*
* Send signals to all our closest relatives so that they know
* to properly mourn us..
*/
static void exit_notify(void)
{
struct task_struct * p, *t;
 
forget_original_parent(current);
/*
* Check to see if any process groups have become orphaned
* as a result of our exiting, and if they have any stopped
* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
*
* Case i: Our father is in a different pgrp than we are
* and we were the only connection outside, so our pgrp
* is about to become orphaned.
*/
t = current->p_pptr;
if ((t->pgrp != current->pgrp) &&
(t->session == current->session) &&
will_become_orphaned_pgrp(current->pgrp, current) &&
has_stopped_jobs(current->pgrp)) {
kill_pg(current->pgrp,SIGHUP,1);
kill_pg(current->pgrp,SIGCONT,1);
}
 
/* Let father know we died
*
* Thread signals are configurable, but you aren't going to use
* that to send signals to arbitrary processes.
* That stops right now.
*
* If the parent exec id doesn't match the exec id we saved
* when we started then we know the parent has changed security
* domain.
*
* If our self_exec id doesn't match our parent_exec_id then
* we have changed execution domain as these two values started
* the same after a fork.
*
*/
if(current->exit_signal != SIGCHLD &&
( current->parent_exec_id != t->self_exec_id ||
current->self_exec_id != current->parent_exec_id)
&& !capable(CAP_KILL))
current->exit_signal = SIGCHLD;
 
 
/*
* This loop does two things:
*
* A. Make init inherit all the child processes
* B. Check to see if any process groups have become orphaned
* as a result of our exiting, and if they have any stopped
* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
*/
 
write_lock_irq(&tasklist_lock);
current->state = TASK_ZOMBIE;
do_notify_parent(current, current->exit_signal);
while (current->p_cptr != NULL) {
p = current->p_cptr;
current->p_cptr = p->p_osptr;
p->p_ysptr = NULL;
p->ptrace = 0;
 
p->p_pptr = p->p_opptr;
p->p_osptr = p->p_pptr->p_cptr;
if (p->p_osptr)
p->p_osptr->p_ysptr = p;
p->p_pptr->p_cptr = p;
if (p->state == TASK_ZOMBIE)
do_notify_parent(p, p->exit_signal);
/*
* process group orphan check
* Case ii: Our child is in a different pgrp
* than we are, and it was the only connection
* outside, so the child pgrp is now orphaned.
*/
if ((p->pgrp != current->pgrp) &&
(p->session == current->session)) {
int pgrp = p->pgrp;
 
write_unlock_irq(&tasklist_lock);
if (is_orphaned_pgrp(pgrp) && has_stopped_jobs(pgrp)) {
kill_pg(pgrp,SIGHUP,1);
kill_pg(pgrp,SIGCONT,1);
}
write_lock_irq(&tasklist_lock);
}
}
write_unlock_irq(&tasklist_lock);
}
 
NORET_TYPE void do_exit(long code)
{
struct task_struct *tsk = current;
 
if (in_interrupt())
panic("Aiee, killing interrupt handler!");
if (!tsk->pid)
panic("Attempted to kill the idle task!");
if (tsk->pid == 1)
panic("Attempted to kill init!");
tsk->flags |= PF_EXITING;
del_timer_sync(&tsk->real_timer);
 
fake_volatile:
#ifdef CONFIG_BSD_PROCESS_ACCT
acct_process(code);
#endif
__exit_mm(tsk);
 
lock_kernel();
sem_exit();
__exit_files(tsk);
__exit_fs(tsk);
exit_namespace(tsk);
exit_sighand(tsk);
exit_thread();
 
if (current->leader)
disassociate_ctty(1);
 
put_exec_domain(tsk->exec_domain);
if (tsk->binfmt && tsk->binfmt->module)
__MOD_DEC_USE_COUNT(tsk->binfmt->module);
 
tsk->exit_code = code;
exit_notify();
schedule();
BUG();
/*
* In order to get rid of the "volatile function does return" message
* I did this little loop that confuses gcc to think do_exit really
* is volatile. In fact it's schedule() that is volatile in some
* circumstances: when current->state = ZOMBIE, schedule() never
* returns.
*
* In fact the natural way to do all this is to have the label and the
* goto right after each other, but I put the fake_volatile label at
* the start of the function just in case something /really/ bad
* happens, and the schedule returns. This way we can try again. I'm
* not paranoid: it's just that everybody is out to get me.
*/
goto fake_volatile;
}
 
NORET_TYPE void complete_and_exit(struct completion *comp, long code)
{
if (comp)
complete(comp);
do_exit(code);
}
 
asmlinkage long sys_exit(int error_code)
{
do_exit((error_code&0xff)<<8);
}
 
asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru)
{
int flag, retval;
DECLARE_WAITQUEUE(wait, current);
struct task_struct *tsk;
 
if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL))
return -EINVAL;
 
add_wait_queue(&current->wait_chldexit,&wait);
repeat:
flag = 0;
current->state = TASK_INTERRUPTIBLE;
read_lock(&tasklist_lock);
tsk = current;
do {
struct task_struct *p;
for (p = tsk->p_cptr ; p ; p = p->p_osptr) {
if (pid>0) {
if (p->pid != pid)
continue;
} else if (!pid) {
if (p->pgrp != current->pgrp)
continue;
} else if (pid != -1) {
if (p->pgrp != -pid)
continue;
}
/* Wait for all children (clone and not) if __WALL is set;
* otherwise, wait for clone children *only* if __WCLONE is
* set; otherwise, wait for non-clone children *only*. (Note:
* A "clone" child here is one that reports to its parent
* using a signal other than SIGCHLD.) */
if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
&& !(options & __WALL))
continue;
flag = 1;
switch (p->state) {
case TASK_STOPPED:
if (!p->exit_code)
continue;
if (!(options & WUNTRACED) && !(p->ptrace & PT_PTRACED))
continue;
read_unlock(&tasklist_lock);
retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
if (!retval && stat_addr)
retval = put_user((p->exit_code << 8) | 0x7f, stat_addr);
if (!retval) {
p->exit_code = 0;
retval = p->pid;
}
goto end_wait4;
case TASK_ZOMBIE:
current->times.tms_cutime += p->times.tms_utime + p->times.tms_cutime;
current->times.tms_cstime += p->times.tms_stime + p->times.tms_cstime;
read_unlock(&tasklist_lock);
retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
if (!retval && stat_addr)
retval = put_user(p->exit_code, stat_addr);
if (retval)
goto end_wait4;
retval = p->pid;
if (p->p_opptr != p->p_pptr) {
write_lock_irq(&tasklist_lock);
REMOVE_LINKS(p);
p->p_pptr = p->p_opptr;
SET_LINKS(p);
do_notify_parent(p, SIGCHLD);
write_unlock_irq(&tasklist_lock);
} else
release_task(p);
goto end_wait4;
default:
continue;
}
}
if (options & __WNOTHREAD)
break;
tsk = next_thread(tsk);
} while (tsk != current);
read_unlock(&tasklist_lock);
if (flag) {
retval = 0;
if (options & WNOHANG)
goto end_wait4;
retval = -ERESTARTSYS;
if (signal_pending(current))
goto end_wait4;
schedule();
goto repeat;
}
retval = -ECHILD;
end_wait4:
current->state = TASK_RUNNING;
remove_wait_queue(&current->wait_chldexit,&wait);
return retval;
}
 
#if !defined(__alpha__) && !defined(__ia64__)
 
/*
* sys_waitpid() remains for compatibility. waitpid() should be
* implemented by calling sys_wait4() from libc.a.
*/
asmlinkage long sys_waitpid(pid_t pid,unsigned int * stat_addr, int options)
{
return sys_wait4(pid, stat_addr, options, NULL);
}
 
#endif
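
/*
* Illustrative user-space sketch (not part of this file): the option
* handling above as seen by a caller - WNOHANG makes waitpid() poll
* instead of sleeping, WUNTRACED also reports stopped children.
*/
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>

static void reap_children(void)
{
	int status;
	pid_t pid;

	while ((pid = waitpid(-1, &status, WNOHANG | WUNTRACED)) > 0) {
		if (WIFEXITED(status))
			printf("child %d exited with %d\n", pid, WEXITSTATUS(status));
		else if (WIFSTOPPED(status))
			printf("child %d stopped by signal %d\n", pid, WSTOPSIG(status));
	}
}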
/acct.c
0,0 → 1,381
/*
* linux/kernel/acct.c
*
* BSD Process Accounting for Linux
*
* Author: Marco van Wieringen <mvw@planets.elm.net>
*
* Some code based on ideas and code from:
* Thomas K. Dyas <tdyas@eden.rutgers.edu>
*
* This file implements BSD-style process accounting. Whenever any
* process exits, an accounting record of type "struct acct" is
* written to the file specified with the acct() system call. It is
* up to user-level programs to do useful things with the accounting
* log. The kernel just provides the raw accounting information.
*
* (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
*
* Plugged two leaks. 1) It didn't return acct_file into the free_filps if
* the file happened to be read-only. 2) If the accounting was suspended
* due to the lack of space it happily allowed to reopen it and completely
* lost the old acct_file. 3/10/98, Al Viro.
*
* Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
* XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
*
* Fixed a nasty interaction with sys_umount(). If the accounting
* was suspended we failed to stop it on umount(). Messy.
* Another one: remount to readonly didn't stop accounting.
* Question: what should we do if we have CAP_SYS_ADMIN but not
* CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
* unless we are messing with the root. In that case we are getting a
* real mess with do_remount_sb(). 9/11/98, AV.
*
* Fixed a bunch of races (and pair of leaks). Probably not the best way,
* but this one obviously doesn't introduce deadlocks. Later. BTW, found
* one race (and leak) in BSD implementation.
* OK, that's better. ANOTHER race and leak in BSD variant. There always
* is one more bug... 10/11/98, AV.
*
* Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
* ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
* a struct file opened for write. Fixed. 2/6/2000, AV.
*/
 
#include <linux/config.h>
#include <linux/errno.h>
#include <linux/kernel.h>
 
#ifdef CONFIG_BSD_PROCESS_ACCT
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/acct.h>
#include <linux/smp_lock.h>
#include <linux/file.h>
#include <linux/tty.h>
 
#include <asm/uaccess.h>
 
/*
* These constants control the amount of free space that suspends and
* resumes the process accounting system, and the time delay between
* each check.
* Turned into sysctl-controllable parameters. AV, 12/11/98
*/
 
int acct_parm[3] = {4, 2, 30};
#define RESUME (acct_parm[0]) /* >foo% free space - resume */
#define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */
#define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */
 
/*
* External references and all of the globals.
*/
 
static volatile int acct_active;
static volatile int acct_needcheck;
static struct file *acct_file;
static struct timer_list acct_timer;
static void do_acct_process(long, struct file *);
 
/*
* Called whenever the timer says to check the free space.
*/
static void acct_timeout(unsigned long unused)
{
acct_needcheck = 1;
}
 
/*
* Check the amount of free space and suspend/resume accordingly.
*/
static int check_free_space(struct file *file)
{
struct statfs sbuf;
int res;
int act;
 
lock_kernel();
res = acct_active;
if (!file || !acct_needcheck)
goto out;
unlock_kernel();
 
/* May block */
if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf))
return res;
 
if (sbuf.f_bavail <= SUSPEND * sbuf.f_blocks / 100)
act = -1;
else if (sbuf.f_bavail >= RESUME * sbuf.f_blocks / 100)
act = 1;
else
act = 0;
 
/*
* If some joker switched acct_file under us we'd better be
* silent and _not_ touch anything.
*/
lock_kernel();
if (file != acct_file) {
if (act)
res = act>0;
goto out;
}
 
if (acct_active) {
if (act < 0) {
acct_active = 0;
printk(KERN_INFO "Process accounting paused\n");
}
} else {
if (act > 0) {
acct_active = 1;
printk(KERN_INFO "Process accounting resumed\n");
}
}
 
del_timer(&acct_timer);
acct_needcheck = 0;
acct_timer.expires = jiffies + ACCT_TIMEOUT*HZ;
add_timer(&acct_timer);
res = acct_active;
out:
unlock_kernel();
return res;
}
 
/*
* sys_acct() is the only system call needed to implement process
* accounting. It takes the name of the file where accounting records
* should be written. If the filename is NULL, accounting will be
* shut down.
*/
asmlinkage long sys_acct(const char *name)
{
struct file *file = NULL, *old_acct = NULL;
char *tmp;
int error;
 
if (!capable(CAP_SYS_PACCT))
return -EPERM;
 
if (name) {
tmp = getname(name);
error = PTR_ERR(tmp);
if (IS_ERR(tmp))
goto out;
/* Difference from BSD - they don't do O_APPEND */
file = filp_open(tmp, O_WRONLY|O_APPEND, 0);
putname(tmp);
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto out;
}
error = -EACCES;
if (!S_ISREG(file->f_dentry->d_inode->i_mode))
goto out_err;
 
error = -EIO;
if (!file->f_op->write)
goto out_err;
}
 
error = 0;
lock_kernel();
if (acct_file) {
old_acct = acct_file;
del_timer(&acct_timer);
acct_active = 0;
acct_needcheck = 0;
acct_file = NULL;
}
if (name) {
acct_file = file;
acct_needcheck = 0;
acct_active = 1;
/* It's been deleted if it was used before so this is safe */
init_timer(&acct_timer);
acct_timer.function = acct_timeout;
acct_timer.expires = jiffies + ACCT_TIMEOUT*HZ;
add_timer(&acct_timer);
}
unlock_kernel();
if (old_acct) {
do_acct_process(0,old_acct);
filp_close(old_acct, NULL);
}
out:
return error;
out_err:
filp_close(file, NULL);
goto out;
}
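
/*
* Illustrative user-space sketch (not part of this file): driving the
* system call above in the spirit of accton(8). The log path is an
* arbitrary example, not a path this code requires.
*/
#include <unistd.h>
#include <stdio.h>

int enable_accounting_example(void)
{
	if (acct("/var/log/pacct") != 0) {	/* start appending records here */
		perror("acct");
		return -1;
	}
	/* ... later ... */
	acct(NULL);				/* a NULL name shuts accounting down */
	return 0;
}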
 
void acct_auto_close(kdev_t dev)
{
lock_kernel();
if (acct_file && acct_file->f_dentry->d_inode->i_dev == dev)
sys_acct(NULL);
unlock_kernel();
}
 
/*
* encode an unsigned long into a comp_t
*
* This routine has been adapted from the encode_comp_t() function in
* the kern_acct.c file of the FreeBSD operating system. The encoding
* is a 13-bit fraction with a 3-bit (base 8) exponent.
*/
 
#define MANTSIZE 13 /* 13 bit mantissa. */
#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
 
static comp_t encode_comp_t(unsigned long value)
{
int exp, rnd;
 
exp = rnd = 0;
while (value > MAXFRACT) {
rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */
value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
exp++;
}
 
/*
* If we need to round up, do it (and handle overflow correctly).
*/
if (rnd && (++value > MAXFRACT)) {
value >>= EXPSIZE;
exp++;
}
 
/*
* Clean it up and polish it off.
*/
exp <<= MANTSIZE; /* Shift the exponent into place */
exp += value; /* and add on the mantissa. */
return exp;
}
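
/*
* Illustrative only (not from the original source): decoding reverses
* the 3-bit base-8 exponent / 13-bit mantissa split built above. For
* example, encode_comp_t(100000) stores exponent 2 and mantissa 1563,
* which decodes back to 1563 << 6 = 100032 - close to, but not exactly,
* the input.
*/
static unsigned long __attribute__((unused)) decode_comp_t(comp_t c)
{
	unsigned long value = c & MAXFRACT;			/* low 13 bits: mantissa */
	int exp = (c >> MANTSIZE) & ((1 << EXPSIZE) - 1);	/* top 3 bits: exponent */

	return value << (exp * EXPSIZE);			/* each exponent step is << 3 */
}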
 
/*
* Write an accounting entry for an exiting process
*
* The acct_process() call is the workhorse of the process
* accounting system. The struct acct is built here and then written
* into the accounting file. This function should only be called from
* do_exit().
*/
 
/*
* do_acct_process does all actual work. Caller holds the reference to file.
*/
static void do_acct_process(long exitcode, struct file *file)
{
struct acct ac;
mm_segment_t fs;
unsigned long vsize;
unsigned long flim;
 
/*
* First check to see if there is enough free_space to continue
* the process accounting system.
*/
if (!check_free_space(file))
return;
 
/*
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
*/
memset((caddr_t)&ac, 0, sizeof(struct acct));
 
strncpy(ac.ac_comm, current->comm, ACCT_COMM);
ac.ac_comm[ACCT_COMM - 1] = '\0';
 
ac.ac_btime = CT_TO_SECS(current->start_time) + (xtime.tv_sec - (jiffies / HZ));
ac.ac_etime = encode_comp_t(jiffies - current->start_time);
ac.ac_utime = encode_comp_t(current->times.tms_utime);
ac.ac_stime = encode_comp_t(current->times.tms_stime);
ac.ac_uid = current->uid;
ac.ac_gid = current->gid;
ac.ac_tty = (current->tty) ? kdev_t_to_nr(current->tty->device) : 0;
 
ac.ac_flag = 0;
if (current->flags & PF_FORKNOEXEC)
ac.ac_flag |= AFORK;
if (current->flags & PF_SUPERPRIV)
ac.ac_flag |= ASU;
if (current->flags & PF_DUMPCORE)
ac.ac_flag |= ACORE;
if (current->flags & PF_SIGNALED)
ac.ac_flag |= AXSIG;
 
vsize = 0;
if (current->mm) {
struct vm_area_struct *vma;
down_read(&current->mm->mmap_sem);
vma = current->mm->mmap;
while (vma) {
vsize += vma->vm_end - vma->vm_start;
vma = vma->vm_next;
}
up_read(&current->mm->mmap_sem);
}
vsize = vsize / 1024;
ac.ac_mem = encode_comp_t(vsize);
ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
ac.ac_minflt = encode_comp_t(current->min_flt);
ac.ac_majflt = encode_comp_t(current->maj_flt);
ac.ac_swaps = encode_comp_t(current->nswap);
ac.ac_exitcode = exitcode;
 
/*
* Kernel segment override to datasegment and write it
* to the accounting file.
*/
fs = get_fs();
set_fs(KERNEL_DS);
/*
* Accounting records are not subject to resource limits.
*/
flim = current->rlim[RLIMIT_FSIZE].rlim_cur;
current->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
file->f_op->write(file, (char *)&ac,
sizeof(struct acct), &file->f_pos);
current->rlim[RLIMIT_FSIZE].rlim_cur = flim;
set_fs(fs);
}
 
/*
* acct_process - now just a wrapper around do_acct_process
*/
int acct_process(long exitcode)
{
struct file *file = NULL;
lock_kernel();
if (acct_file) {
file = acct_file;
get_file(file);
unlock_kernel();
do_acct_process(exitcode, file);
fput(file);
} else
unlock_kernel();
return 0;
}
 
#else
/*
* Dummy system call when BSD process accounting is not configured
* into the kernel.
*/
 
asmlinkage long sys_acct(const char * filename)
{
return -ENOSYS;
}
#endif
/exec_domain.c
0,0 → 1,290
/*
* Handling of different ABIs (personalities).
*
* We group personalities into execution domains which have their
* own handlers for kernel entry points, signal mapping, etc...
*
* 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org)
*/
 
#include <linux/config.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/personality.h>
#include <linux/sched.h>
#include <linux/sysctl.h>
#include <linux/types.h>
 
 
static void default_handler(int, struct pt_regs *);
 
static struct exec_domain *exec_domains = &default_exec_domain;
static rwlock_t exec_domains_lock = RW_LOCK_UNLOCKED;
 
 
static u_long ident_map[32] = {
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31
};
 
struct exec_domain default_exec_domain = {
"Linux", /* name */
default_handler, /* lcall7 causes a seg fault. */
0, 0, /* PER_LINUX personality. */
ident_map, /* Identity map signals. */
ident_map, /* - both ways. */
};
 
 
static void
default_handler(int segment, struct pt_regs *regp)
{
u_long pers = 0;
 
/*
* This may have been a static linked SVr4 binary, so we would
* have the personality set incorrectly. Or it might have been
* a Solaris/x86 binary. We can tell which because the former
* uses lcall7, while the latter used lcall 0x27.
* Try to find or load the appropriate personality, and fall back
* to just forcing a SEGV.
*
* XXX: this is IA32-specific and should be moved to the MD-tree.
*/
switch (segment) {
#ifdef __i386__
case 0x07:
pers = abi_defhandler_lcall7;
break;
case 0x27:
pers = PER_SOLARIS;
break;
#endif
}
set_personality(pers);
 
if (current->exec_domain->handler != default_handler)
current->exec_domain->handler(segment, regp);
else
send_sig(SIGSEGV, current, 1);
}
 
static struct exec_domain *
lookup_exec_domain(u_long personality)
{
struct exec_domain * ep;
u_long pers = personality(personality);
read_lock(&exec_domains_lock);
for (ep = exec_domains; ep; ep = ep->next) {
if (pers >= ep->pers_low && pers <= ep->pers_high)
if (try_inc_mod_count(ep->module))
goto out;
}
 
#ifdef CONFIG_KMOD
read_unlock(&exec_domains_lock);
{
char buffer[30];
sprintf(buffer, "personality-%ld", pers);
request_module(buffer);
}
read_lock(&exec_domains_lock);
 
for (ep = exec_domains; ep; ep = ep->next) {
if (pers >= ep->pers_low && pers <= ep->pers_high)
if (try_inc_mod_count(ep->module))
goto out;
}
#endif
 
ep = &default_exec_domain;
out:
read_unlock(&exec_domains_lock);
return (ep);
}
 
int
register_exec_domain(struct exec_domain *ep)
{
struct exec_domain *tmp;
int err = -EBUSY;
 
if (ep == NULL)
return -EINVAL;
 
if (ep->next != NULL)
return -EBUSY;
 
write_lock(&exec_domains_lock);
for (tmp = exec_domains; tmp; tmp = tmp->next) {
if (tmp == ep)
goto out;
}
 
ep->next = exec_domains;
exec_domains = ep;
err = 0;
 
out:
write_unlock(&exec_domains_lock);
return (err);
}
 
int
unregister_exec_domain(struct exec_domain *ep)
{
struct exec_domain **epp;
 
epp = &exec_domains;
write_lock(&exec_domains_lock);
for (epp = &exec_domains; *epp; epp = &(*epp)->next) {
if (ep == *epp)
goto unregister;
}
write_unlock(&exec_domains_lock);
return -EINVAL;
 
unregister:
*epp = ep->next;
ep->next = NULL;
write_unlock(&exec_domains_lock);
return 0;
}
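
/*
* Illustrative only (not from the original source): the shape of an
* execution domain a module would pass to register_exec_domain(). The
* name and personality range here are placeholders, not a real ABI.
*/
static struct exec_domain example_exec_domain = {
	"Example ABI",		/* name */
	default_handler,	/* lcall7 handler */
	13, 13,			/* personality range (placeholder value) */
	ident_map,		/* identity-map signals */
	ident_map,		/* - both ways */
};
/*
* A module would call register_exec_domain(&example_exec_domain) on load
* and unregister_exec_domain(&example_exec_domain) on unload.
*/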
 
int
__set_personality(u_long personality)
{
struct exec_domain *ep, *oep;
 
ep = lookup_exec_domain(personality);
if (ep == current->exec_domain) {
current->personality = personality;
return 0;
}
 
if (atomic_read(&current->fs->count) != 1) {
struct fs_struct *fsp, *ofsp;
 
fsp = copy_fs_struct(current->fs);
if (fsp == NULL) {
put_exec_domain(ep);
return -ENOMEM;
}
 
task_lock(current);
ofsp = current->fs;
current->fs = fsp;
task_unlock(current);
 
put_fs_struct(ofsp);
}
 
/*
* At that point we are guaranteed to be the sole owner of
* current->fs.
*/
 
current->personality = personality;
oep = current->exec_domain;
current->exec_domain = ep;
set_fs_altroot();
 
put_exec_domain(oep);
 
return 0;
}
 
int
get_exec_domain_list(char *page)
{
struct exec_domain *ep;
int len = 0;
 
read_lock(&exec_domains_lock);
for (ep = exec_domains; ep && len < PAGE_SIZE - 80; ep = ep->next)
len += sprintf(page + len, "%d-%d\t%-16s\t[%s]\n",
ep->pers_low, ep->pers_high, ep->name,
ep->module ? ep->module->name : "kernel");
read_unlock(&exec_domains_lock);
return (len);
}
 
asmlinkage long
sys_personality(u_long personality)
{
u_long old = current->personality;
 
if (personality != 0xffffffff) {
set_personality(personality);
if (current->personality != personality)
return -EINVAL;
}
 
return (long)old;
}
 
 
EXPORT_SYMBOL(register_exec_domain);
EXPORT_SYMBOL(unregister_exec_domain);
EXPORT_SYMBOL(__set_personality);
 
/*
* We have to have all sysctl handling for the Linux-ABI
* in one place as the dynamic registration of sysctls is
* horribly crufty in Linux <= 2.4.
*
* I hope the new sysctl schemes discussed for future versions
* will obsolete this.
*
* --hch
*/
 
u_long abi_defhandler_coff = PER_SCOSVR3;
u_long abi_defhandler_elf = PER_LINUX;
u_long abi_defhandler_lcall7 = PER_SVR4;
u_long abi_defhandler_libcso = PER_SVR4;
u_int abi_traceflg;
int abi_fake_utsname;
 
static struct ctl_table abi_table[] = {
{ABI_DEFHANDLER_COFF, "defhandler_coff", &abi_defhandler_coff,
sizeof(int), 0644, NULL, &proc_doulongvec_minmax},
{ABI_DEFHANDLER_ELF, "defhandler_elf", &abi_defhandler_elf,
sizeof(int), 0644, NULL, &proc_doulongvec_minmax},
{ABI_DEFHANDLER_LCALL7, "defhandler_lcall7", &abi_defhandler_lcall7,
sizeof(int), 0644, NULL, &proc_doulongvec_minmax},
{ABI_DEFHANDLER_LIBCSO, "defhandler_libcso", &abi_defhandler_libcso,
sizeof(int), 0644, NULL, &proc_doulongvec_minmax},
{ABI_TRACE, "trace", &abi_traceflg,
sizeof(u_int), 0644, NULL, &proc_dointvec},
{ABI_FAKE_UTSNAME, "fake_utsname", &abi_fake_utsname,
sizeof(int), 0644, NULL, &proc_dointvec},
{0}
};
 
static struct ctl_table abi_root_table[] = {
{CTL_ABI, "abi", NULL, 0, 0555, abi_table},
{0}
};
 
static int __init
abi_register_sysctl(void)
{
register_sysctl_table(abi_root_table, 1);
return 0;
}
 
__initcall(abi_register_sysctl);
 
 
EXPORT_SYMBOL(abi_defhandler_coff);
EXPORT_SYMBOL(abi_defhandler_elf);
EXPORT_SYMBOL(abi_defhandler_lcall7);
EXPORT_SYMBOL(abi_defhandler_libcso);
EXPORT_SYMBOL(abi_traceflg);
EXPORT_SYMBOL(abi_fake_utsname);
/pm.c
0,0 → 1,293
/*
* pm.c - Power management interface
*
* Copyright (C) 2000 Andrew Henroid
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
 
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/pm.h>
#include <linux/interrupt.h>
 
int pm_active;
 
/*
* Locking notes:
* pm_devs_lock can be a semaphore providing pm ops are not called
* from an interrupt handler (already a bad idea so no change here). Each
* change must be protected so that an unlink of an entry doesn't clash
* with a pm send - which is permitted to sleep in the current architecture
*
* Module unloads clashing with pm events now work out safely, the module
* unload path will block until the event has been sent. It may well block
* until a resume but that will be fine.
*/
static DECLARE_MUTEX(pm_devs_lock);
static LIST_HEAD(pm_devs);
 
/**
* pm_register - register a device with power management
* @type: device type
* @id: device ID
* @callback: callback function
*
* Add a device to the list of devices that wish to be notified about
* power management events. A &pm_dev structure is returned on success,
* on failure the return is %NULL.
*
* The callback function will be called in process context and
* it may sleep.
*/
struct pm_dev *pm_register(pm_dev_t type,
unsigned long id,
pm_callback callback)
{
struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL);
if (dev) {
memset(dev, 0, sizeof(*dev));
dev->type = type;
dev->id = id;
dev->callback = callback;
 
down(&pm_devs_lock);
list_add(&dev->entry, &pm_devs);
up(&pm_devs_lock);
}
return dev;
}
 
/**
* pm_unregister - unregister a device with power management
* @dev: device to unregister
*
* Remove a device from the power management notification lists. The
* dev passed must be a handle previously returned by pm_register.
*/
void pm_unregister(struct pm_dev *dev)
{
if (dev) {
down(&pm_devs_lock);
list_del(&dev->entry);
up(&pm_devs_lock);
 
kfree(dev);
}
}
 
static void __pm_unregister(struct pm_dev *dev)
{
if (dev) {
list_del(&dev->entry);
kfree(dev);
}
}
 
/**
* pm_unregister_all - unregister all devices with matching callback
* @callback: callback function pointer
*
* Unregister every device that would call the callback passed. This
* is primarily meant as a helper function for loadable modules. It
* enables a module to give up all its managed devices without keeping
* its own private list.
*/
void pm_unregister_all(pm_callback callback)
{
struct list_head *entry;
 
if (!callback)
return;
 
down(&pm_devs_lock);
entry = pm_devs.next;
while (entry != &pm_devs) {
struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
entry = entry->next;
if (dev->callback == callback)
__pm_unregister(dev);
}
up(&pm_devs_lock);
}
 
/**
* pm_send - send request to a single device
* @dev: device to send to
* @rqst: power management request
* @data: data for the callback
*
* Issue a power management request to a given device. The
* %PM_SUSPEND and %PM_RESUME events are handled specially. The
* data field must hold the intended next state. No call is made
* if the state matches.
*
* BUGS: what stops two power management requests occurring in parallel
* and conflicting?
*
* WARNING: Calling pm_send directly is not generally recommended, in
* particular there is no locking against the pm_dev going away. The
* caller must maintain all needed locking or have 'inside knowledge'
* on the safety. Also remember that this function is not locked against
* pm_unregister. This means that you must handle SMP races on callback
* execution and unload yourself.
*/
int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data)
{
int status = 0;
int prev_state, next_state;
 
if (in_interrupt())
BUG();
 
switch (rqst) {
case PM_SUSPEND:
case PM_RESUME:
prev_state = dev->state;
next_state = (unsigned long) data;
if (prev_state != next_state) {
if (dev->callback)
status = (*dev->callback)(dev, rqst, data);
if (!status) {
dev->state = next_state;
dev->prev_state = prev_state;
}
}
else {
dev->prev_state = prev_state;
}
break;
default:
if (dev->callback)
status = (*dev->callback)(dev, rqst, data);
break;
}
return status;
}
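
/*
* Illustrative only (not from the original source): how a driver would
* typically sit on the other end of pm_send(). The device type/id and
* the save/restore steps are placeholders.
*/
static int __attribute__((unused)) example_pm_callback(struct pm_dev *dev,
	pm_request_t rqst, void *data)
{
	switch (rqst) {
	case PM_SUSPEND:
		/* save hardware state; a non-zero return vetoes the suspend */
		break;
	case PM_RESUME:
		/* restore hardware state */
		break;
	}
	return 0;
}
/*
* Typical registration from a driver's init code (PM_SYS_DEV and id 0 are
* example values):
*	struct pm_dev *dev = pm_register(PM_SYS_DEV, 0, example_pm_callback);
* and pm_unregister(dev) - or pm_unregister_all(example_pm_callback) - on exit.
*/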
 
/*
* Undo incomplete request
*/
static void pm_undo_all(struct pm_dev *last)
{
struct list_head *entry = last->entry.prev;
while (entry != &pm_devs) {
struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
if (dev->state != dev->prev_state) {
/* previous state was zero (running): resume;
* previous state was non-zero (suspended): suspend
*/
pm_request_t undo = (dev->prev_state
? PM_SUSPEND:PM_RESUME);
pm_send(dev, undo, (void*) dev->prev_state);
}
entry = entry->prev;
}
}
 
/**
* pm_send_all - send request to all managed devices
* @rqst: power management request
* @data: data for the callback
*
* Issue a power management request to all devices. The
* %PM_SUSPEND events are handled specially. Any device is
* permitted to fail a suspend by returning a non zero (error)
* value from its callback function. If any device vetoes a
* suspend request then all other devices that have suspended
* during the processing of this request are restored to their
* previous state.
*
* WARNING: This function takes the pm_devs_lock. The lock is not dropped until
* the callbacks have completed. This prevents races against pm locking
* functions, races against module unload pm_unregister code. It does
* mean however that you must not issue pm_ functions within the callback
* or you will deadlock and users will hate you.
*
* Zero is returned on success. If a suspend fails then the status
* from the device that vetoes the suspend is returned.
*
* BUGS: what stops two power management requests occurring in parallel
* and conflicting?
*/
int pm_send_all(pm_request_t rqst, void *data)
{
struct list_head *entry;
down(&pm_devs_lock);
entry = pm_devs.next;
while (entry != &pm_devs) {
struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
if (dev->callback) {
int status = pm_send(dev, rqst, data);
if (status) {
/* return devices to previous state on
* failed suspend request
*/
if (rqst == PM_SUSPEND)
pm_undo_all(dev);
up(&pm_devs_lock);
return status;
}
}
entry = entry->next;
}
up(&pm_devs_lock);
return 0;
}
 
/**
* pm_find - find a device
* @type: type of device
* @from: where to start looking
*
* Scan the power management list for devices of a specific type. The
* return value for a matching device may be passed to further calls
* to this function to find further matches. A %NULL indicates the end
* of the list.
*
* To search from the beginning pass %NULL as the @from value.
*
* The caller MUST hold the pm_devs_lock lock when calling this
* function. The instant that the lock is dropped all pointers returned
* may become invalid.
*/
struct pm_dev *pm_find(pm_dev_t type, struct pm_dev *from)
{
struct list_head *entry = from ? from->entry.next:pm_devs.next;
while (entry != &pm_devs) {
struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
if (type == PM_UNKNOWN_DEV || dev->type == type)
return dev;
entry = entry->next;
}
return 0;
}
 
EXPORT_SYMBOL(pm_register);
EXPORT_SYMBOL(pm_unregister);
EXPORT_SYMBOL(pm_unregister_all);
EXPORT_SYMBOL(pm_send);
EXPORT_SYMBOL(pm_send_all);
EXPORT_SYMBOL(pm_find);
EXPORT_SYMBOL(pm_active);
/signal.c
0,0 → 1,1325
/*
* linux/kernel/signal.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* 1997-11-02 Modified for POSIX.1b signals by Richard Henderson
*/
 
#include <linux/config.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/unistd.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/sched.h>
 
#include <asm/uaccess.h>
 
/*
* SLAB caches for signal bits.
*/
 
#define DEBUG_SIG 0
 
#if DEBUG_SIG
#define SIG_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */)
#else
#define SIG_SLAB_DEBUG 0
#endif
 
static kmem_cache_t *sigqueue_cachep;
 
atomic_t nr_queued_signals;
int max_queued_signals = 1024;
 
void __init signals_init(void)
{
sigqueue_cachep =
kmem_cache_create("sigqueue",
sizeof(struct sigqueue),
__alignof__(struct sigqueue),
SIG_SLAB_DEBUG, NULL, NULL);
if (!sigqueue_cachep)
panic("signals_init(): cannot create sigqueue SLAB cache");
}
 
 
/* Given the mask, find the first available signal that should be serviced. */
 
static int
next_signal(struct task_struct *tsk, sigset_t *mask)
{
unsigned long i, *s, *m, x;
int sig = 0;
s = tsk->pending.signal.sig;
m = mask->sig;
switch (_NSIG_WORDS) {
default:
for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m)
if ((x = *s &~ *m) != 0) {
sig = ffz(~x) + i*_NSIG_BPW + 1;
break;
}
break;
 
case 2: if ((x = s[0] &~ m[0]) != 0)
sig = 1;
else if ((x = s[1] &~ m[1]) != 0)
sig = _NSIG_BPW + 1;
else
break;
sig += ffz(~x);
break;
 
case 1: if ((x = *s &~ *m) != 0)
sig = ffz(~x) + 1;
break;
}
return sig;
}
 
static void flush_sigqueue(struct sigpending *queue)
{
struct sigqueue *q, *n;
 
sigemptyset(&queue->signal);
q = queue->head;
queue->head = NULL;
queue->tail = &queue->head;
 
while (q) {
n = q->next;
kmem_cache_free(sigqueue_cachep, q);
atomic_dec(&nr_queued_signals);
q = n;
}
}
 
/*
* Flush all pending signals for a task.
*/
 
void
flush_signals(struct task_struct *t)
{
t->sigpending = 0;
flush_sigqueue(&t->pending);
}
 
void exit_sighand(struct task_struct *tsk)
{
struct signal_struct * sig = tsk->sig;
 
spin_lock_irq(&tsk->sigmask_lock);
if (sig) {
tsk->sig = NULL;
if (atomic_dec_and_test(&sig->count))
kmem_cache_free(sigact_cachep, sig);
}
tsk->sigpending = 0;
flush_sigqueue(&tsk->pending);
spin_unlock_irq(&tsk->sigmask_lock);
}
 
/*
* Flush all handlers for a task.
*/
 
void
flush_signal_handlers(struct task_struct *t)
{
int i;
struct k_sigaction *ka = &t->sig->action[0];
for (i = _NSIG ; i != 0 ; i--) {
if (ka->sa.sa_handler != SIG_IGN)
ka->sa.sa_handler = SIG_DFL;
ka->sa.sa_flags = 0;
sigemptyset(&ka->sa.sa_mask);
ka++;
}
}
 
/*
* sig_exit - cause the current task to exit due to a signal.
*/
 
void
sig_exit(int sig, int exit_code, struct siginfo *info)
{
struct task_struct *t;
 
sigaddset(&current->pending.signal, sig);
recalc_sigpending(current);
current->flags |= PF_SIGNALED;
 
/* Propagate the signal to all the tasks in
* our thread group
*/
if (info && (unsigned long)info != 1
&& info->si_code != SI_TKILL) {
read_lock(&tasklist_lock);
for_each_thread(t) {
force_sig_info(sig, info, t);
}
read_unlock(&tasklist_lock);
}
 
do_exit(exit_code);
/* NOTREACHED */
}
 
/* Notify the system that a driver wants to block all signals for this
* process, and wants to be notified if any signals at all were to be
* sent/acted upon. If the notifier routine returns non-zero, then the
* signal will be acted upon after all. If the notifier routine returns 0,
* then the signal will be blocked. Only one block per process is
* allowed. priv is a pointer to private data that the notifier routine
* can use to determine if the signal should be blocked or not. */
 
void
block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
{
unsigned long flags;
 
spin_lock_irqsave(&current->sigmask_lock, flags);
current->notifier_mask = mask;
current->notifier_data = priv;
current->notifier = notifier;
spin_unlock_irqrestore(&current->sigmask_lock, flags);
}
 
/* Notify the system that blocking has ended. */
 
void
unblock_all_signals(void)
{
unsigned long flags;
 
spin_lock_irqsave(&current->sigmask_lock, flags);
current->notifier = NULL;
current->notifier_data = NULL;
recalc_sigpending(current);
spin_unlock_irqrestore(&current->sigmask_lock, flags);
}
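
/*
* Illustrative only (not from the original source): how a driver would
* use the notifier pair above to hold off all signals around a critical
* region. The notifier policy shown is a placeholder.
*/
static int __attribute__((unused)) example_sig_notifier(void *priv)
{
	/* return non-zero to let the signal be acted upon, 0 to keep it blocked */
	return 0;
}
/*
* Typical use, with 'priv' pointing at whatever the notifier needs:
*	sigset_t all;
*	sigfillset(&all);
*	block_all_signals(example_sig_notifier, priv, &all);
*	... critical region ...
*	unblock_all_signals();
*/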
 
static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
{
if (sigismember(&list->signal, sig)) {
/* Collect the siginfo appropriate to this signal. */
struct sigqueue *q, **pp;
pp = &list->head;
while ((q = *pp) != NULL) {
if (q->info.si_signo == sig)
goto found_it;
pp = &q->next;
}
 
/* Ok, it wasn't in the queue. We must have
been out of queue space. So zero out the
info. */
sigdelset(&list->signal, sig);
info->si_signo = sig;
info->si_errno = 0;
info->si_code = 0;
info->si_pid = 0;
info->si_uid = 0;
return 1;
 
found_it:
if ((*pp = q->next) == NULL)
list->tail = pp;
 
/* Copy the sigqueue information and free the queue entry */
copy_siginfo(info, &q->info);
kmem_cache_free(sigqueue_cachep,q);
atomic_dec(&nr_queued_signals);
 
/* Non-RT signals can exist multiple times.. */
if (sig >= SIGRTMIN) {
while ((q = *pp) != NULL) {
if (q->info.si_signo == sig)
goto found_another;
pp = &q->next;
}
}
 
sigdelset(&list->signal, sig);
found_another:
return 1;
}
return 0;
}
 
/*
* Dequeue a signal and return the element to the caller, which is
* expected to free it.
*
* All callers must be holding current->sigmask_lock.
*/
 
int
dequeue_signal(sigset_t *mask, siginfo_t *info)
{
int sig = 0;
 
#if DEBUG_SIG
printk("SIG dequeue (%s:%d): %d ", current->comm, current->pid,
signal_pending(current));
#endif
 
sig = next_signal(current, mask);
if (sig) {
if (current->notifier) {
if (sigismember(current->notifier_mask, sig)) {
if (!(current->notifier)(current->notifier_data)) {
current->sigpending = 0;
return 0;
}
}
}
 
if (!collect_signal(sig, &current->pending, info))
sig = 0;
/* XXX: Once POSIX.1b timers are in, if si_code == SI_TIMER,
we need to xchg out the timer overrun values. */
}
recalc_sigpending(current);
 
#if DEBUG_SIG
printk(" %d -> %d\n", signal_pending(current), sig);
#endif
 
return sig;
}
 
static int rm_from_queue(int sig, struct sigpending *s)
{
struct sigqueue *q, **pp;
 
if (!sigismember(&s->signal, sig))
return 0;
 
sigdelset(&s->signal, sig);
 
pp = &s->head;
 
while ((q = *pp) != NULL) {
if (q->info.si_signo == sig) {
if ((*pp = q->next) == NULL)
s->tail = pp;
kmem_cache_free(sigqueue_cachep,q);
atomic_dec(&nr_queued_signals);
continue;
}
pp = &q->next;
}
return 1;
}
 
/*
* Remove signal sig from t->pending.
* Returns 1 if sig was found.
*
* All callers must be holding t->sigmask_lock.
*/
static int rm_sig_from_queue(int sig, struct task_struct *t)
{
return rm_from_queue(sig, &t->pending);
}
 
/*
* Bad permissions for sending the signal
*/
int bad_signal(int sig, struct siginfo *info, struct task_struct *t)
{
return (!info || ((unsigned long)info != 1 && SI_FROMUSER(info)))
&& ((sig != SIGCONT) || (current->session != t->session))
&& (current->euid ^ t->suid) && (current->euid ^ t->uid)
&& (current->uid ^ t->suid) && (current->uid ^ t->uid)
&& !capable(CAP_KILL);
}
 
/*
* Signal type:
* < 0 : global action (kill - spread to all non-blocked threads)
* = 0 : ignored
* > 0 : wake up.
*/
static int signal_type(int sig, struct signal_struct *signals)
{
unsigned long handler;
 
if (!signals)
return 0;
handler = (unsigned long) signals->action[sig-1].sa.sa_handler;
if (handler > 1)
return 1;
 
/* "Ignore" handler.. Illogical, but that has an implicit handler for SIGCHLD */
if (handler == 1)
return sig == SIGCHLD;
 
/* Default handler. Normally lethal, but.. */
switch (sig) {
 
/* Ignored */
case SIGCONT: case SIGWINCH:
case SIGCHLD: case SIGURG:
return 0;
 
/* Implicit behaviour */
case SIGTSTP: case SIGTTIN: case SIGTTOU:
return 1;
 
/* Implicit actions (kill or do special stuff) */
default:
return -1;
}
}
 
/*
* Determine whether a signal should be posted or not.
*
* Signals with SIG_IGN can be ignored, except for the
* special case of a SIGCHLD.
*
* Some signals with SIG_DFL default to a non-action.
*/
static int ignored_signal(int sig, struct task_struct *t)
{
/* Don't ignore traced or blocked signals */
if ((t->ptrace & PT_PTRACED) || sigismember(&t->blocked, sig))
return 0;
 
return signal_type(sig, t->sig) == 0;
}
 
/*
* Handle TASK_STOPPED cases etc implicit behaviour
* of certain magical signals.
*
* SIGKILL gets spread out to every thread.
*/
static void handle_stop_signal(int sig, struct task_struct *t)
{
switch (sig) {
case SIGKILL: case SIGCONT:
/* Wake up the process if stopped. */
if (t->state == TASK_STOPPED)
wake_up_process(t);
t->exit_code = 0;
rm_sig_from_queue(SIGSTOP, t);
rm_sig_from_queue(SIGTSTP, t);
rm_sig_from_queue(SIGTTOU, t);
rm_sig_from_queue(SIGTTIN, t);
break;
 
case SIGSTOP: case SIGTSTP:
case SIGTTIN: case SIGTTOU:
/* If we're stopping again, cancel SIGCONT */
rm_sig_from_queue(SIGCONT, t);
break;
}
}
 
static int send_signal(int sig, struct siginfo *info, struct sigpending *signals)
{
struct sigqueue * q = NULL;
 
/* Real-time signals must be queued if sent by sigqueue, or
some other real-time mechanism. It is implementation
defined whether kill() does so. We attempt to do so, on
the principle of least surprise, but since kill is not
allowed to fail with EAGAIN when low on memory we just
make sure at least one signal gets delivered and don't
pass on the info struct. */
 
if (atomic_read(&nr_queued_signals) < max_queued_signals) {
q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC);
}
 
if (q) {
atomic_inc(&nr_queued_signals);
q->next = NULL;
*signals->tail = q;
signals->tail = &q->next;
switch ((unsigned long) info) {
case 0:
q->info.si_signo = sig;
q->info.si_errno = 0;
q->info.si_code = SI_USER;
q->info.si_pid = current->pid;
q->info.si_uid = current->uid;
break;
case 1:
q->info.si_signo = sig;
q->info.si_errno = 0;
q->info.si_code = SI_KERNEL;
q->info.si_pid = 0;
q->info.si_uid = 0;
break;
default:
copy_siginfo(&q->info, info);
break;
}
} else if (sig >= SIGRTMIN && info && (unsigned long)info != 1
&& info->si_code != SI_USER) {
/*
* Queue overflow, abort. We may abort if the signal was rt
* and sent by user using something other than kill().
*/
return -EAGAIN;
}
 
sigaddset(&signals->signal, sig);
return 0;
}
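 
/*
 * Illustrative user-space sketch (not part of this file): the queue
 * overflow handled above is what sigqueue() reports as EAGAIN, while a
 * plain kill() falls back to delivering the bare signal. The helper
 * name and the payload value below are examples only.
 */
#include <sys/types.h>
#include <signal.h>
#include <stdio.h>
 
int queue_rt_value(pid_t pid, int value)
{
	union sigval sv;
 
	sv.sival_int = value;			/* carried to the receiver in si_value */
	if (sigqueue(pid, SIGRTMIN, sv) < 0) {
		perror("sigqueue");		/* EAGAIN: the kernel's signal queue is full */
		return -1;
	}
	return 0;
}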
 
/*
* Tell a process that it has a new active signal..
*
* NOTE! we rely on the previous spin_lock to
* lock interrupts for us! We can only be called with
* "sigmask_lock" held, and the local interrupt must
* have been disabled when that got acquired!
*
* No need to set need_resched since signal event passing
* goes through ->blocked
*/
static inline void signal_wake_up(struct task_struct *t)
{
t->sigpending = 1;
 
#ifdef CONFIG_SMP
/*
* If the task is running on a different CPU
* force a reschedule on the other CPU to make
* it notice the new signal quickly.
*
* The code below is a tad loose and might occasionally
* kick the wrong CPU if we catch the process in the
* process of changing - but no harm is done by that
* other than doing an extra (lightweight) IPI interrupt.
*/
spin_lock(&runqueue_lock);
if (task_has_cpu(t) && t->processor != smp_processor_id())
smp_send_reschedule(t->processor);
spin_unlock(&runqueue_lock);
#endif /* CONFIG_SMP */
 
if (t->state & TASK_INTERRUPTIBLE) {
wake_up_process(t);
return;
}
}
 
static int deliver_signal(int sig, struct siginfo *info, struct task_struct *t)
{
int retval = send_signal(sig, info, &t->pending);
 
if (!retval && !sigismember(&t->blocked, sig))
signal_wake_up(t);
 
return retval;
}
 
int
send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
{
unsigned long flags;
int ret;
 
 
#if DEBUG_SIG
printk("SIG queue (%s:%d): %d ", t->comm, t->pid, sig);
#endif
 
ret = -EINVAL;
if (sig < 0 || sig > _NSIG)
goto out_nolock;
/* The somewhat baroque permissions check... */
ret = -EPERM;
if (bad_signal(sig, info, t))
goto out_nolock;
 
/* The null signal is a permissions and process existence probe.
No signal is actually delivered. Same goes for zombies. */
ret = 0;
if (!sig || !t->sig)
goto out_nolock;
 
spin_lock_irqsave(&t->sigmask_lock, flags);
handle_stop_signal(sig, t);
 
/* Optimize away the signal, if it's a signal that can be
handled immediately (ie non-blocked and untraced) and
that is ignored (either explicitly or by default). */
 
if (ignored_signal(sig, t))
goto out;
 
/* Support queueing exactly one non-rt signal, so that we
can get more detailed information about the cause of
the signal. */
if (sig < SIGRTMIN && sigismember(&t->pending.signal, sig))
goto out;
 
ret = deliver_signal(sig, info, t);
out:
spin_unlock_irqrestore(&t->sigmask_lock, flags);
out_nolock:
#if DEBUG_SIG
printk(" %d -> %d\n", signal_pending(t), ret);
#endif
 
return ret;
}
 
/*
* Force a signal that the process can't ignore: if necessary
* we unblock the signal and change any SIG_IGN to SIG_DFL.
*/
 
int
force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
{
unsigned long int flags;
 
spin_lock_irqsave(&t->sigmask_lock, flags);
if (t->sig == NULL) {
spin_unlock_irqrestore(&t->sigmask_lock, flags);
return -ESRCH;
}
 
if (t->sig->action[sig-1].sa.sa_handler == SIG_IGN)
t->sig->action[sig-1].sa.sa_handler = SIG_DFL;
sigdelset(&t->blocked, sig);
recalc_sigpending(t);
spin_unlock_irqrestore(&t->sigmask_lock, flags);
 
return send_sig_info(sig, info, t);
}
 
/*
* kill_pg_info() sends a signal to a process group: this is what the tty
* control characters do (^C, ^Z etc)
*/
 
int
kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
{
int retval = -EINVAL;
if (pgrp > 0) {
struct task_struct *p;
 
retval = -ESRCH;
read_lock(&tasklist_lock);
for_each_task(p) {
if (p->pgrp == pgrp && thread_group_leader(p)) {
int err = send_sig_info(sig, info, p);
if (retval)
retval = err;
}
}
read_unlock(&tasklist_lock);
}
return retval;
}
 
/*
* kill_sl_info() sends a signal to the session leader: this is used
* to send SIGHUP to the controlling process of a terminal when
* the connection is lost.
*/
 
int
kill_sl_info(int sig, struct siginfo *info, pid_t sess)
{
int retval = -EINVAL;
if (sess > 0) {
struct task_struct *p;
 
retval = -ESRCH;
read_lock(&tasklist_lock);
for_each_task(p) {
if (p->leader && p->session == sess) {
int err = send_sig_info(sig, info, p);
if (retval)
retval = err;
}
}
read_unlock(&tasklist_lock);
}
return retval;
}
 
inline int
kill_proc_info(int sig, struct siginfo *info, pid_t pid)
{
int error;
struct task_struct *p;
 
read_lock(&tasklist_lock);
p = find_task_by_pid(pid);
error = -ESRCH;
if (p) {
if (!thread_group_leader(p)) {
struct task_struct *tg;
tg = find_task_by_pid(p->tgid);
if (tg)
p = tg;
}
error = send_sig_info(sig, info, p);
}
read_unlock(&tasklist_lock);
return error;
}
 
 
/*
* kill_something_info() interprets pid in interesting ways just like kill(2).
*
* POSIX specifies that kill(-1,sig) is unspecified, but what we have
* is probably wrong. Should make it like BSD or SYSV.
*/
 
static int kill_something_info(int sig, struct siginfo *info, int pid)
{
if (!pid) {
return kill_pg_info(sig, info, current->pgrp);
} else if (pid == -1) {
int retval = 0, count = 0;
struct task_struct * p;
 
read_lock(&tasklist_lock);
for_each_task(p) {
if (p->pid > 1 && p != current && thread_group_leader(p)) {
int err = send_sig_info(sig, info, p);
++count;
if (err != -EPERM)
retval = err;
}
}
read_unlock(&tasklist_lock);
return count ? retval : -ESRCH;
} else if (pid < 0) {
return kill_pg_info(sig, info, -pid);
} else {
return kill_proc_info(sig, info, pid);
}
}
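 
/*
 * A minimal user-space sketch of the pid conventions handled above
 * (standard kill(2) calls; pid and pgrp are illustrative values
 * supplied by the caller).
 */
#include <sys/types.h>
#include <signal.h>
 
void kill_examples(pid_t pid, pid_t pgrp)
{
	kill(pid, SIGTERM);	/* pid > 0: that process (thread group)         */
	kill(0, SIGTERM);	/* pid == 0: the caller's own process group     */
	kill(-pgrp, SIGTERM);	/* pid < -1: process group |pid|                */
	kill(-1, SIGTERM);	/* pid == -1: every process except init and us  */
}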
 
/*
* These are for backward compatibility with the rest of the kernel source.
*/
 
int
send_sig(int sig, struct task_struct *p, int priv)
{
return send_sig_info(sig, (void*)(long)(priv != 0), p);
}
 
void
force_sig(int sig, struct task_struct *p)
{
force_sig_info(sig, (void*)1L, p);
}
 
int
kill_pg(pid_t pgrp, int sig, int priv)
{
return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp);
}
 
int
kill_sl(pid_t sess, int sig, int priv)
{
return kill_sl_info(sig, (void *)(long)(priv != 0), sess);
}
 
int
kill_proc(pid_t pid, int sig, int priv)
{
return kill_proc_info(sig, (void *)(long)(priv != 0), pid);
}
 
/*
* Joy. Or not. Pthread wants us to wake up every thread
* in our parent group.
*/
static void wake_up_parent(struct task_struct *parent)
{
struct task_struct *tsk = parent;
 
do {
wake_up_interruptible(&tsk->wait_chldexit);
tsk = next_thread(tsk);
} while (tsk != parent);
}
 
/*
* Let a parent know about a status change of a child.
*/
 
void do_notify_parent(struct task_struct *tsk, int sig)
{
struct siginfo info;
int why, status;
 
info.si_signo = sig;
info.si_errno = 0;
info.si_pid = tsk->pid;
info.si_uid = tsk->uid;
 
/* FIXME: find out whether or not this is supposed to be c*time. */
info.si_utime = tsk->times.tms_utime;
info.si_stime = tsk->times.tms_stime;
 
status = tsk->exit_code & 0x7f;
why = SI_KERNEL; /* shouldn't happen */
switch (tsk->state) {
case TASK_STOPPED:
/* FIXME -- can we deduce CLD_TRAPPED or CLD_CONTINUED? */
if (tsk->ptrace & PT_PTRACED)
why = CLD_TRAPPED;
else
why = CLD_STOPPED;
break;
 
default:
if (tsk->exit_code & 0x80)
why = CLD_DUMPED;
else if (tsk->exit_code & 0x7f)
why = CLD_KILLED;
else {
why = CLD_EXITED;
status = tsk->exit_code >> 8;
}
break;
}
info.si_code = why;
info.si_status = status;
 
send_sig_info(sig, &info, tsk->p_pptr);
wake_up_parent(tsk->p_pptr);
}
 
 
/*
* We need the tasklist lock because it's the only
* thing that protects our "parent" pointer.
*
* exit.c calls "do_notify_parent()" directly, because
* it already has the tasklist lock.
*/
void
notify_parent(struct task_struct *tsk, int sig)
{
read_lock(&tasklist_lock);
do_notify_parent(tsk, sig);
read_unlock(&tasklist_lock);
}
 
EXPORT_SYMBOL(dequeue_signal);
EXPORT_SYMBOL(flush_signals);
EXPORT_SYMBOL(force_sig);
EXPORT_SYMBOL(force_sig_info);
EXPORT_SYMBOL(kill_pg);
EXPORT_SYMBOL(kill_pg_info);
EXPORT_SYMBOL(kill_proc);
EXPORT_SYMBOL(kill_proc_info);
EXPORT_SYMBOL(kill_sl);
EXPORT_SYMBOL(kill_sl_info);
EXPORT_SYMBOL(notify_parent);
EXPORT_SYMBOL(recalc_sigpending);
EXPORT_SYMBOL(send_sig);
EXPORT_SYMBOL(send_sig_info);
EXPORT_SYMBOL(block_all_signals);
EXPORT_SYMBOL(unblock_all_signals);
 
 
/*
* System call entry points.
*/
 
/*
* We don't need to get the kernel lock - this is all local to this
* particular thread.. (and that's good, because this is _heavily_
* used by various programs)
*/
 
asmlinkage long
sys_rt_sigprocmask(int how, sigset_t *set, sigset_t *oset, size_t sigsetsize)
{
int error = -EINVAL;
sigset_t old_set, new_set;
 
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
goto out;
 
if (set) {
error = -EFAULT;
if (copy_from_user(&new_set, set, sizeof(*set)))
goto out;
sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
 
spin_lock_irq(&current->sigmask_lock);
old_set = current->blocked;
 
error = 0;
switch (how) {
default:
error = -EINVAL;
break;
case SIG_BLOCK:
sigorsets(&current->blocked, &old_set, &new_set);
break;
case SIG_UNBLOCK:
signandsets(&current->blocked, &old_set, &new_set);
break;
case SIG_SETMASK:
current->blocked = new_set;
break;
}
 
recalc_sigpending(current);
spin_unlock_irq(&current->sigmask_lock);
if (error)
goto out;
if (oset)
goto set_old;
} else if (oset) {
spin_lock_irq(&current->sigmask_lock);
old_set = current->blocked;
spin_unlock_irq(&current->sigmask_lock);
 
set_old:
error = -EFAULT;
if (copy_to_user(oset, &old_set, sizeof(*oset)))
goto out;
}
error = 0;
out:
return error;
}
 
long do_sigpending(void *set, unsigned long sigsetsize)
{
long error = -EINVAL;
sigset_t pending;
 
if (sigsetsize > sizeof(sigset_t))
goto out;
 
spin_lock_irq(&current->sigmask_lock);
sigandsets(&pending, &current->blocked, &current->pending.signal);
spin_unlock_irq(&current->sigmask_lock);
 
error = -EFAULT;
if (!copy_to_user(set, &pending, sigsetsize))
error = 0;
out:
return error;
}
 
asmlinkage long
sys_rt_sigpending(sigset_t *set, size_t sigsetsize)
{
return do_sigpending(set, sigsetsize);
}
 
asmlinkage long
sys_rt_sigtimedwait(const sigset_t *uthese, siginfo_t *uinfo,
const struct timespec *uts, size_t sigsetsize)
{
int ret, sig;
sigset_t these;
struct timespec ts;
siginfo_t info;
long timeout = 0;
 
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
 
if (copy_from_user(&these, uthese, sizeof(these)))
return -EFAULT;
/*
* Invert the set of allowed signals to get those we
* want to block.
*/
sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP));
signotset(&these);
 
if (uts) {
if (copy_from_user(&ts, uts, sizeof(ts)))
return -EFAULT;
if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0
|| ts.tv_sec < 0)
return -EINVAL;
}
 
spin_lock_irq(&current->sigmask_lock);
sig = dequeue_signal(&these, &info);
if (!sig) {
timeout = MAX_SCHEDULE_TIMEOUT;
if (uts)
timeout = (timespec_to_jiffies(&ts)
+ (ts.tv_sec || ts.tv_nsec));
 
if (timeout) {
/* None ready -- temporarily unblock those we're
* interested in while we are sleeping, so that we'll
* be awakened when they arrive. */
sigset_t oldblocked = current->blocked;
sigandsets(&current->blocked, &current->blocked, &these);
recalc_sigpending(current);
spin_unlock_irq(&current->sigmask_lock);
 
current->state = TASK_INTERRUPTIBLE;
timeout = schedule_timeout(timeout);
 
spin_lock_irq(&current->sigmask_lock);
sig = dequeue_signal(&these, &info);
current->blocked = oldblocked;
recalc_sigpending(current);
}
}
spin_unlock_irq(&current->sigmask_lock);
 
if (sig) {
ret = sig;
if (uinfo) {
if (copy_siginfo_to_user(uinfo, &info))
ret = -EFAULT;
}
} else {
ret = -EAGAIN;
if (timeout)
ret = -EINTR;
}
 
return ret;
}
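 
/*
 * Seen from user space, the return values above map onto the usual
 * sigtimedwait() idiom. A small sketch, assuming SIGUSR1 has already
 * been blocked by the caller so it queues instead of being handled;
 * the five-second timeout is arbitrary.
 */
#include <signal.h>
#include <time.h>
 
int wait_for_usr1(siginfo_t *info)
{
	sigset_t set;
	struct timespec ts = { 5, 0 };
 
	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	/* Returns the signal number, or -1 with errno EAGAIN (timeout)
	   or EINTR (interrupted by some other, unblocked signal). */
	return sigtimedwait(&set, info, &ts);
}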
 
asmlinkage long
sys_kill(int pid, int sig)
{
struct siginfo info;
 
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_USER;
info.si_pid = current->pid;
info.si_uid = current->uid;
 
return kill_something_info(sig, &info, pid);
}
 
/*
* Kill only one task, even if it's a CLONE_THREAD task.
*/
asmlinkage long
sys_tkill(int pid, int sig)
{
struct siginfo info;
int error;
struct task_struct *p;
 
/* This is only valid for single tasks */
if (pid <= 0)
return -EINVAL;
 
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_TKILL;
info.si_pid = current->pid;
info.si_uid = current->uid;
 
read_lock(&tasklist_lock);
p = find_task_by_pid(pid);
error = -ESRCH;
if (p) {
error = send_sig_info(sig, &info, p);
}
read_unlock(&tasklist_lock);
return error;
}
 
asmlinkage long
sys_rt_sigqueueinfo(int pid, int sig, siginfo_t *uinfo)
{
siginfo_t info;
 
if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
return -EFAULT;
 
/* Not even root can pretend to send signals from the kernel.
Nor can they impersonate a kill(), which adds source info. */
if (info.si_code >= 0)
return -EPERM;
info.si_signo = sig;
 
/* POSIX.1b doesn't mention process groups. */
return kill_proc_info(sig, &info, pid);
}
 
int
do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
{
struct k_sigaction *k;
 
if (sig < 1 || sig > _NSIG ||
(act && (sig == SIGKILL || sig == SIGSTOP)))
return -EINVAL;
 
k = &current->sig->action[sig-1];
 
spin_lock(&current->sig->siglock);
 
if (oact)
*oact = *k;
 
if (act) {
*k = *act;
sigdelsetmask(&k->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
 
/*
* POSIX 3.3.1.3:
* "Setting a signal action to SIG_IGN for a signal that is
* pending shall cause the pending signal to be discarded,
* whether or not it is blocked."
*
* "Setting a signal action to SIG_DFL for a signal that is
* pending and whose default action is to ignore the signal
* (for example, SIGCHLD), shall cause the pending signal to
* be discarded, whether or not it is blocked"
*
* Note the silly behaviour of SIGCHLD: SIG_IGN means that the
* signal isn't actually ignored, but does automatic child
* reaping, while SIG_DFL is explicitly said by POSIX to force
* the signal to be ignored.
*/
 
if (k->sa.sa_handler == SIG_IGN
|| (k->sa.sa_handler == SIG_DFL
&& (sig == SIGCONT ||
sig == SIGCHLD ||
sig == SIGURG ||
sig == SIGWINCH))) {
spin_lock_irq(&current->sigmask_lock);
if (rm_sig_from_queue(sig, current))
recalc_sigpending(current);
spin_unlock_irq(&current->sigmask_lock);
}
}
 
spin_unlock(&current->sig->siglock);
return 0;
}
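 
/*
 * The POSIX rule quoted above is visible from user space: installing
 * SIG_IGN throws away a pending SIGCHLD even while it is blocked.
 * A minimal sketch using the standard sigaction() wrapper; the helper
 * name is illustrative.
 */
#include <signal.h>
#include <string.h>
 
void discard_pending_sigchld(void)
{
	struct sigaction sa;
 
	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = SIG_IGN;	/* also enables automatic child reaping */
	sigemptyset(&sa.sa_mask);
	sigaction(SIGCHLD, &sa, NULL);
}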
 
int
do_sigaltstack (const stack_t *uss, stack_t *uoss, unsigned long sp)
{
stack_t oss;
int error;
 
if (uoss) {
oss.ss_sp = (void *) current->sas_ss_sp;
oss.ss_size = current->sas_ss_size;
oss.ss_flags = sas_ss_flags(sp);
}
 
if (uss) {
void *ss_sp;
size_t ss_size;
int ss_flags;
 
error = -EFAULT;
if (verify_area(VERIFY_READ, uss, sizeof(*uss))
|| __get_user(ss_sp, &uss->ss_sp)
|| __get_user(ss_flags, &uss->ss_flags)
|| __get_user(ss_size, &uss->ss_size))
goto out;
 
error = -EPERM;
if (on_sig_stack (sp))
goto out;
 
error = -EINVAL;
/*
* Note: this code used to test ss_flags incorrectly.
* Old code may have been written using ss_flags==0
* to mean ss_flags==SS_ONSTACK (as this was the only
* way that worked), so this check preserves that older
* mechanism.
*/
if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
goto out;
 
if (ss_flags == SS_DISABLE) {
ss_size = 0;
ss_sp = NULL;
} else {
error = -ENOMEM;
if (ss_size < MINSIGSTKSZ)
goto out;
}
 
current->sas_ss_sp = (unsigned long) ss_sp;
current->sas_ss_size = ss_size;
}
 
if (uoss) {
error = -EFAULT;
if (copy_to_user(uoss, &oss, sizeof(oss)))
goto out;
}
 
error = 0;
out:
return error;
}
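 
/*
 * User-space sketch of the corresponding sigaltstack() call. The size
 * check above is why anything below MINSIGSTKSZ is rejected with
 * -ENOMEM, and an ss_flags of 0 is accepted for the historical reason
 * noted in the comment; the helper name is illustrative.
 */
#include <signal.h>
#include <stdlib.h>
 
int install_alt_stack(void)
{
	stack_t ss;
 
	ss.ss_sp = malloc(SIGSTKSZ);	/* SIGSTKSZ >= MINSIGSTKSZ */
	if (ss.ss_sp == NULL)
		return -1;
	ss.ss_size = SIGSTKSZ;
	ss.ss_flags = 0;		/* 0 or SS_ONSTACK both pass the check */
	return sigaltstack(&ss, NULL);
}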
 
asmlinkage long
sys_sigpending(old_sigset_t *set)
{
return do_sigpending(set, sizeof(*set));
}
 
#if !defined(__alpha__)
/* Alpha has its own versions with special arguments. */
 
asmlinkage long
sys_sigprocmask(int how, old_sigset_t *set, old_sigset_t *oset)
{
int error;
old_sigset_t old_set, new_set;
 
if (set) {
error = -EFAULT;
if (copy_from_user(&new_set, set, sizeof(*set)))
goto out;
new_set &= ~(sigmask(SIGKILL)|sigmask(SIGSTOP));
 
spin_lock_irq(&current->sigmask_lock);
old_set = current->blocked.sig[0];
 
error = 0;
switch (how) {
default:
error = -EINVAL;
break;
case SIG_BLOCK:
sigaddsetmask(&current->blocked, new_set);
break;
case SIG_UNBLOCK:
sigdelsetmask(&current->blocked, new_set);
break;
case SIG_SETMASK:
current->blocked.sig[0] = new_set;
break;
}
 
recalc_sigpending(current);
spin_unlock_irq(&current->sigmask_lock);
if (error)
goto out;
if (oset)
goto set_old;
} else if (oset) {
old_set = current->blocked.sig[0];
set_old:
error = -EFAULT;
if (copy_to_user(oset, &old_set, sizeof(*oset)))
goto out;
}
error = 0;
out:
return error;
}
 
#ifndef __sparc__
asmlinkage long
sys_rt_sigaction(int sig, const struct sigaction *act, struct sigaction *oact,
size_t sigsetsize)
{
struct k_sigaction new_sa, old_sa;
int ret = -EINVAL;
 
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
goto out;
 
if (act) {
if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
return -EFAULT;
}
 
ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL);
 
if (!ret && oact) {
if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)))
return -EFAULT;
}
out:
return ret;
}
#endif /* __sparc__ */
#endif
 
#if !defined(__alpha__) && !defined(__ia64__)
/*
* For backwards compatibility. Functionality superseded by sigprocmask.
*/
asmlinkage long
sys_sgetmask(void)
{
/* SMP safe */
return current->blocked.sig[0];
}
 
asmlinkage long
sys_ssetmask(int newmask)
{
int old;
 
spin_lock_irq(&current->sigmask_lock);
old = current->blocked.sig[0];
 
siginitset(&current->blocked, newmask & ~(sigmask(SIGKILL)|
sigmask(SIGSTOP)));
recalc_sigpending(current);
spin_unlock_irq(&current->sigmask_lock);
 
return old;
}
#endif /* !defined(__alpha__) */
 
#if !defined(__alpha__) && !defined(__ia64__) && !defined(__mips__)
/*
* For backwards compatibility. Functionality superseded by sigaction.
*/
asmlinkage unsigned long
sys_signal(int sig, __sighandler_t handler)
{
struct k_sigaction new_sa, old_sa;
int ret;
 
new_sa.sa.sa_handler = handler;
new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK;
 
ret = do_sigaction(sig, &new_sa, &old_sa);
 
return ret ? ret : (unsigned long)old_sa.sa.sa_handler;
}
#endif /* !alpha && !__ia64__ && !defined(__mips__) */
/sys.c
0,0 → 1,1292
/*
* linux/kernel/sys.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
 
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/utsname.h>
#include <linux/mman.h>
#include <linux/smp_lock.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/prctl.h>
#include <linux/init.h>
#include <linux/highuid.h>
 
#include <asm/uaccess.h>
#include <asm/io.h>
 
#ifndef SET_UNALIGN_CTL
# define SET_UNALIGN_CTL(a,b) (-EINVAL)
#endif
#ifndef GET_UNALIGN_CTL
# define GET_UNALIGN_CTL(a,b) (-EINVAL)
#endif
#ifndef SET_FPEMU_CTL
# define SET_FPEMU_CTL(a,b) (-EINVAL)
#endif
#ifndef GET_FPEMU_CTL
# define GET_FPEMU_CTL(a,b) (-EINVAL)
#endif
#ifndef SET_FPEXC_CTL
# define SET_FPEXC_CTL(a,b) (-EINVAL)
#endif
#ifndef GET_FPEXC_CTL
# define GET_FPEXC_CTL(a,b) (-EINVAL)
#endif
 
/*
* this is where the system-wide overflow UID and GID are defined, for
* architectures that now have 32-bit UID/GID but didn't in the past
*/
 
int overflowuid = DEFAULT_OVERFLOWUID;
int overflowgid = DEFAULT_OVERFLOWGID;
 
/*
* the same as above, but for filesystems which can only store a 16-bit
* UID and GID. as such, this is needed on all architectures
*/
 
int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
int fs_overflowgid = DEFAULT_FS_OVERFLOWGID;
 
/*
* this indicates whether you can reboot with ctrl-alt-del: the default is yes
*/
 
int C_A_D = 1;
int cad_pid = 1;
 
 
/*
* Notifier list for kernel code which wants to be called
* at shutdown. This is used to stop any idling DMA operations
* and the like.
*/
 
static struct notifier_block *reboot_notifier_list;
rwlock_t notifier_lock = RW_LOCK_UNLOCKED;
 
/**
* notifier_chain_register - Add notifier to a notifier chain
* @list: Pointer to root list pointer
* @n: New entry in notifier chain
*
* Adds a notifier to a notifier chain.
*
* Currently always returns zero.
*/
int notifier_chain_register(struct notifier_block **list, struct notifier_block *n)
{
write_lock(&notifier_lock);
while(*list)
{
if(n->priority > (*list)->priority)
break;
list= &((*list)->next);
}
n->next = *list;
*list=n;
write_unlock(&notifier_lock);
return 0;
}
 
/**
* notifier_chain_unregister - Remove notifier from a notifier chain
* @nl: Pointer to root list pointer
* @n: Entry to remove from the notifier chain
*
* Removes a notifier from a notifier chain.
*
* Returns zero on success, or %-ENOENT on failure.
*/
int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n)
{
write_lock(&notifier_lock);
while((*nl)!=NULL)
{
if((*nl)==n)
{
*nl=n->next;
write_unlock(&notifier_lock);
return 0;
}
nl=&((*nl)->next);
}
write_unlock(&notifier_lock);
return -ENOENT;
}
 
/**
* notifier_call_chain - Call functions in a notifier chain
* @n: Pointer to root pointer of notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
*
* Calls each function in a notifier chain in turn.
*
* If a notifier's return value has %NOTIFY_STOP_MASK
* bits set, notifier_call_chain returns immediately
* with the return value of the notifier function
* which halted execution. Otherwise, the return value
* is the return value of the last notifier function
* called.
*/
int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v)
{
int ret=NOTIFY_DONE;
struct notifier_block *nb = *n;
 
while(nb)
{
ret=nb->notifier_call(nb,val,v);
if(ret&NOTIFY_STOP_MASK)
{
return ret;
}
nb=nb->next;
}
return ret;
}
 
/**
* register_reboot_notifier - Register function to be called at reboot time
* @nb: Info about notifier function to be called
*
* Registers a function with the list of functions
* to be called at reboot time.
*
* Currently always returns zero, as notifier_chain_register
* always returns zero.
*/
int register_reboot_notifier(struct notifier_block * nb)
{
return notifier_chain_register(&reboot_notifier_list, nb);
}
 
/**
* unregister_reboot_notifier - Unregister previously registered reboot notifier
* @nb: Hook to be unregistered
*
* Unregisters a previously registered reboot
* notifier function.
*
* Returns zero on success, or %-ENOENT on failure.
*/
int unregister_reboot_notifier(struct notifier_block * nb)
{
return notifier_chain_unregister(&reboot_notifier_list, nb);
}
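 
/*
 * A sketch of how a driver would hook into this chain; the my_driver_*
 * names are purely illustrative, and the sketch relies only on
 * <linux/notifier.h> and <linux/reboot.h>, which are already included
 * above.
 */
static int my_driver_reboot_event(struct notifier_block *nb,
				  unsigned long event, void *ptr)
{
	/* event is SYS_RESTART, SYS_HALT or SYS_POWER_OFF; ptr is the
	   RESTART2 command string or NULL. Quiesce the hardware here. */
	return NOTIFY_DONE;
}
 
static struct notifier_block my_driver_reboot_nb = {
	notifier_call:	my_driver_reboot_event,
};
 
/* module init:  register_reboot_notifier(&my_driver_reboot_nb);   */
/* module exit:  unregister_reboot_notifier(&my_driver_reboot_nb); */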
 
asmlinkage long sys_ni_syscall(void)
{
return -ENOSYS;
}
 
static int proc_sel(struct task_struct *p, int which, int who)
{
if(p->pid)
{
switch (which) {
case PRIO_PROCESS:
if (!who && p == current)
return 1;
return(p->pid == who);
case PRIO_PGRP:
if (!who)
who = current->pgrp;
return(p->pgrp == who);
case PRIO_USER:
if (!who)
who = current->uid;
return(p->uid == who);
}
}
return 0;
}
 
asmlinkage long sys_setpriority(int which, int who, int niceval)
{
struct task_struct *p;
int error;
 
if (which > 2 || which < 0)
return -EINVAL;
 
/* normalize: avoid signed division (rounding problems) */
error = -ESRCH;
if (niceval < -20)
niceval = -20;
if (niceval > 19)
niceval = 19;
 
read_lock(&tasklist_lock);
for_each_task(p) {
if (!proc_sel(p, which, who))
continue;
if (p->uid != current->euid &&
p->uid != current->uid && !capable(CAP_SYS_NICE)) {
error = -EPERM;
continue;
}
if (error == -ESRCH)
error = 0;
if (niceval < p->nice && !capable(CAP_SYS_NICE))
error = -EACCES;
else
p->nice = niceval;
}
read_unlock(&tasklist_lock);
 
return error;
}
 
/*
* Ugh. To avoid negative return values, "getpriority()" will
* not return the normal nice-value, but a negated value that
* has been offset by 20 (ie it returns 40..1 instead of -20..19)
* to stay compatible.
*/
asmlinkage long sys_getpriority(int which, int who)
{
struct task_struct *p;
long retval = -ESRCH;
 
if (which > 2 || which < 0)
return -EINVAL;
 
read_lock(&tasklist_lock);
for_each_task (p) {
long niceval;
if (!proc_sel(p, which, who))
continue;
niceval = 20 - p->nice;
if (niceval > retval)
retval = niceval;
}
read_unlock(&tasklist_lock);
 
return retval;
}
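 
/*
 * Sketch only: how a caller of the raw system call (the C library
 * wrapper typically performs this conversion itself) recovers the
 * conventional nice value from the 40..1 encoding returned above.
 * The helper name is illustrative.
 */
static inline int prio_return_to_nice(long ret)
{
	return 20 - (int) ret;		/* 40..1  ->  -20..19 */
}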
 
 
/*
* Reboot system call: for obvious reasons only root may call it,
* and even root needs to set up some magic numbers in the registers
* so that some mistake won't make this reboot the whole machine.
* You can also set the meaning of the ctrl-alt-del-key here.
*
* reboot doesn't sync: do that yourself before calling this.
*/
asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void * arg)
{
char buffer[256];
 
/* We only trust the superuser with rebooting the system. */
if (!capable(CAP_SYS_BOOT))
return -EPERM;
 
/* For safety, we require "magic" arguments. */
if (magic1 != LINUX_REBOOT_MAGIC1 ||
(magic2 != LINUX_REBOOT_MAGIC2 && magic2 != LINUX_REBOOT_MAGIC2A &&
magic2 != LINUX_REBOOT_MAGIC2B))
return -EINVAL;
 
lock_kernel();
switch (cmd) {
case LINUX_REBOOT_CMD_RESTART:
notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
printk(KERN_EMERG "Restarting system.\n");
machine_restart(NULL);
break;
 
case LINUX_REBOOT_CMD_CAD_ON:
C_A_D = 1;
break;
 
case LINUX_REBOOT_CMD_CAD_OFF:
C_A_D = 0;
break;
 
case LINUX_REBOOT_CMD_HALT:
notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
printk(KERN_EMERG "System halted.\n");
machine_halt();
do_exit(0);
break;
 
case LINUX_REBOOT_CMD_POWER_OFF:
notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
printk(KERN_EMERG "Power down.\n");
machine_power_off();
do_exit(0);
break;
 
case LINUX_REBOOT_CMD_RESTART2:
if (strncpy_from_user(&buffer[0], (char *)arg, sizeof(buffer) - 1) < 0) {
unlock_kernel();
return -EFAULT;
}
buffer[sizeof(buffer) - 1] = '\0';
 
notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
machine_restart(buffer);
break;
 
default:
unlock_kernel();
return -EINVAL;
}
unlock_kernel();
return 0;
}
 
static void deferred_cad(void *dummy)
{
notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
machine_restart(NULL);
}
 
/*
* This function gets called by ctrl-alt-del - ie the keyboard interrupt.
* As it's called within an interrupt, it may NOT sync: the only choice
* is whether to reboot at once, or just ignore the ctrl-alt-del.
*/
void ctrl_alt_del(void)
{
static struct tq_struct cad_tq = {
routine: deferred_cad,
};
 
if (C_A_D)
schedule_task(&cad_tq);
else
kill_proc(cad_pid, SIGINT, 1);
}
 
/*
* Unprivileged users may change the real gid to the effective gid
* or vice versa. (BSD-style)
*
* If you set the real gid at all, or set the effective gid to a value not
* equal to the real gid, then the saved gid is set to the new effective gid.
*
* This makes it possible for a setgid program to completely drop its
* privileges, which is often a useful assertion to make when you are doing
* a security audit over a program.
*
* The general idea is that a program which uses just setregid() will be
* 100% compatible with BSD. A program which uses just setgid() will be
* 100% compatible with POSIX with saved IDs.
*
* SMP: There are no races; the GIDs are checked only by filesystem
* operations (as far as semantic preservation is concerned).
*/
asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
{
int old_rgid = current->gid;
int old_egid = current->egid;
int new_rgid = old_rgid;
int new_egid = old_egid;
 
if (rgid != (gid_t) -1) {
if ((old_rgid == rgid) ||
(current->egid==rgid) ||
capable(CAP_SETGID))
new_rgid = rgid;
else
return -EPERM;
}
if (egid != (gid_t) -1) {
if ((old_rgid == egid) ||
(current->egid == egid) ||
(current->sgid == egid) ||
capable(CAP_SETGID))
new_egid = egid;
else {
return -EPERM;
}
}
if (new_egid != old_egid)
{
current->mm->dumpable = 0;
wmb();
}
if (rgid != (gid_t) -1 ||
(egid != (gid_t) -1 && egid != old_rgid))
current->sgid = new_egid;
current->fsgid = new_egid;
current->egid = new_egid;
current->gid = new_rgid;
return 0;
}
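 
/*
 * User-space sketch of the privilege drop described above: a setgid
 * program that sets both the real and effective gid also rewrites the
 * saved gid, so the extra group cannot be regained later. The helper
 * name is illustrative.
 */
#include <sys/types.h>
#include <unistd.h>
 
int drop_setgid_privs(void)
{
	gid_t rgid = getgid();
 
	/* rgid is being set, so the saved gid becomes rgid as well */
	return setregid(rgid, rgid);
}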
 
/*
* setgid() is implemented like SysV w/ SAVED_IDS
*
* SMP: Same implicit races as above.
*/
asmlinkage long sys_setgid(gid_t gid)
{
int old_egid = current->egid;
 
if (capable(CAP_SETGID))
{
if(old_egid != gid)
{
current->mm->dumpable=0;
wmb();
}
current->gid = current->egid = current->sgid = current->fsgid = gid;
}
else if ((gid == current->gid) || (gid == current->sgid))
{
if(old_egid != gid)
{
current->mm->dumpable=0;
wmb();
}
current->egid = current->fsgid = gid;
}
else
return -EPERM;
return 0;
}
/*
* cap_emulate_setxuid() fixes the effective / permitted capabilities of
* a process after a call to setuid, setreuid, or setresuid.
*
* 1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of
* {r,e,s}uid != 0, the permitted and effective capabilities are
* cleared.
*
* 2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective
* capabilities of the process are cleared.
*
* 3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
* capabilities are set to the permitted capabilities.
*
* fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should
* never happen.
*
* -astor
*
* cevans - New behaviour, Oct '99
* A process may, via prctl(), elect to keep its capabilities when it
* calls setuid() and switches away from uid==0. Both permitted and
* effective sets will be retained.
* Without this change, it was impossible for a daemon to drop only some
* of its privilege. The call to setuid(!=0) would drop all privileges!
* Keeping uid 0 is not an option because uid 0 owns too many vital
* files..
* Thanks to Olaf Kirch and Peter Benie for spotting this.
*/
static inline void cap_emulate_setxuid(int old_ruid, int old_euid,
int old_suid)
{
if ((old_ruid == 0 || old_euid == 0 || old_suid == 0) &&
(current->uid != 0 && current->euid != 0 && current->suid != 0) &&
!current->keep_capabilities) {
cap_clear(current->cap_permitted);
cap_clear(current->cap_effective);
}
if (old_euid == 0 && current->euid != 0) {
cap_clear(current->cap_effective);
}
if (old_euid != 0 && current->euid == 0) {
current->cap_effective = current->cap_permitted;
}
}
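 
/*
 * The prctl() escape hatch described above, seen from a daemon's point
 * of view: a minimal user-space sketch that assumes the process later
 * trims its capability sets itself (e.g. via capset()). The helper
 * name is illustrative.
 */
#include <sys/types.h>
#include <sys/prctl.h>
#include <unistd.h>
 
int drop_root_keep_caps(uid_t uid)
{
	if (prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) < 0)
		return -1;
	return setuid(uid);	/* permitted/effective sets survive the switch */
}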
 
static int set_user(uid_t new_ruid, int dumpclear)
{
struct user_struct *new_user;
 
new_user = alloc_uid(new_ruid);
if (!new_user)
return -EAGAIN;
switch_uid(new_user);
 
if(dumpclear)
{
current->mm->dumpable = 0;
wmb();
}
current->uid = new_ruid;
return 0;
}
 
/*
* Unprivileged users may change the real uid to the effective uid
* or vice versa. (BSD-style)
*
* If you set the real uid at all, or set the effective uid to a value not
* equal to the real uid, then the saved uid is set to the new effective uid.
*
* This makes it possible for a setuid program to completely drop its
* privileges, which is often a useful assertion to make when you are doing
* a security audit over a program.
*
* The general idea is that a program which uses just setreuid() will be
* 100% compatible with BSD. A program which uses just setuid() will be
* 100% compatible with POSIX with saved IDs.
*/
asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
{
int old_ruid, old_euid, old_suid, new_ruid, new_euid;
 
new_ruid = old_ruid = current->uid;
new_euid = old_euid = current->euid;
old_suid = current->suid;
 
if (ruid != (uid_t) -1) {
new_ruid = ruid;
if ((old_ruid != ruid) &&
(current->euid != ruid) &&
!capable(CAP_SETUID))
return -EPERM;
}
 
if (euid != (uid_t) -1) {
new_euid = euid;
if ((old_ruid != euid) &&
(current->euid != euid) &&
(current->suid != euid) &&
!capable(CAP_SETUID))
return -EPERM;
}
 
if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0)
return -EAGAIN;
 
if (new_euid != old_euid)
{
current->mm->dumpable=0;
wmb();
}
current->fsuid = current->euid = new_euid;
if (ruid != (uid_t) -1 ||
(euid != (uid_t) -1 && euid != old_ruid))
current->suid = current->euid;
current->fsuid = current->euid;
 
if (!issecure(SECURE_NO_SETUID_FIXUP)) {
cap_emulate_setxuid(old_ruid, old_euid, old_suid);
}
 
return 0;
}
 
 
/*
* setuid() is implemented like SysV with SAVED_IDS
*
* Note that SAVED_IDS is deficient in that a setuid root program
* like sendmail, for example, cannot set its uid to be a normal
* user and then switch back, because if you're root, setuid() sets
* the saved uid too. If you don't like this, blame the bright people
* in the POSIX committee and/or USG. Note that the BSD-style setreuid()
* will allow a root program to temporarily drop privileges and be able to
* regain them by swapping the real and effective uid.
*/
asmlinkage long sys_setuid(uid_t uid)
{
int old_euid = current->euid;
int old_ruid, old_suid, new_ruid, new_suid;
 
old_ruid = new_ruid = current->uid;
old_suid = current->suid;
new_suid = old_suid;
if (capable(CAP_SETUID)) {
if (uid != old_ruid && set_user(uid, old_euid != uid) < 0)
return -EAGAIN;
new_suid = uid;
} else if ((uid != current->uid) && (uid != new_suid))
return -EPERM;
 
if (old_euid != uid)
{
current->mm->dumpable = 0;
wmb();
}
current->fsuid = current->euid = uid;
current->suid = new_suid;
 
if (!issecure(SECURE_NO_SETUID_FIXUP)) {
cap_emulate_setxuid(old_ruid, old_euid, old_suid);
}
 
return 0;
}
 
 
/*
* This function implements a generic ability to update ruid, euid,
* and suid. This allows you to implement the 4.4BSD-compatible seteuid().
*/
asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
{
int old_ruid = current->uid;
int old_euid = current->euid;
int old_suid = current->suid;
 
if (!capable(CAP_SETUID)) {
if ((ruid != (uid_t) -1) && (ruid != current->uid) &&
(ruid != current->euid) && (ruid != current->suid))
return -EPERM;
if ((euid != (uid_t) -1) && (euid != current->uid) &&
(euid != current->euid) && (euid != current->suid))
return -EPERM;
if ((suid != (uid_t) -1) && (suid != current->uid) &&
(suid != current->euid) && (suid != current->suid))
return -EPERM;
}
if (ruid != (uid_t) -1) {
if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0)
return -EAGAIN;
}
if (euid != (uid_t) -1) {
if (euid != current->euid)
{
current->mm->dumpable = 0;
wmb();
}
current->euid = euid;
}
current->fsuid = current->euid;
if (suid != (uid_t) -1)
current->suid = suid;
 
if (!issecure(SECURE_NO_SETUID_FIXUP)) {
cap_emulate_setxuid(old_ruid, old_euid, old_suid);
}
 
return 0;
}
 
asmlinkage long sys_getresuid(uid_t *ruid, uid_t *euid, uid_t *suid)
{
int retval;
 
if (!(retval = put_user(current->uid, ruid)) &&
!(retval = put_user(current->euid, euid)))
retval = put_user(current->suid, suid);
 
return retval;
}
 
/*
* Same as above, but for rgid, egid, sgid.
*/
asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
{
if (!capable(CAP_SETGID)) {
if ((rgid != (gid_t) -1) && (rgid != current->gid) &&
(rgid != current->egid) && (rgid != current->sgid))
return -EPERM;
if ((egid != (gid_t) -1) && (egid != current->gid) &&
(egid != current->egid) && (egid != current->sgid))
return -EPERM;
if ((sgid != (gid_t) -1) && (sgid != current->gid) &&
(sgid != current->egid) && (sgid != current->sgid))
return -EPERM;
}
if (egid != (gid_t) -1) {
if (egid != current->egid)
{
current->mm->dumpable = 0;
wmb();
}
current->egid = egid;
}
current->fsgid = current->egid;
if (rgid != (gid_t) -1)
current->gid = rgid;
if (sgid != (gid_t) -1)
current->sgid = sgid;
return 0;
}
 
asmlinkage long sys_getresgid(gid_t *rgid, gid_t *egid, gid_t *sgid)
{
int retval;
 
if (!(retval = put_user(current->gid, rgid)) &&
!(retval = put_user(current->egid, egid)))
retval = put_user(current->sgid, sgid);
 
return retval;
}
 
 
/*
* "setfsuid()" sets the fsuid - the uid used for filesystem checks. This
* is used for "access()" and for the NFS daemon (letting nfsd stay at
* whatever uid it wants to). It normally shadows "euid", except when
* explicitly set by setfsuid() or for access..
*/
asmlinkage long sys_setfsuid(uid_t uid)
{
int old_fsuid;
 
old_fsuid = current->fsuid;
if (uid == current->uid || uid == current->euid ||
uid == current->suid || uid == current->fsuid ||
capable(CAP_SETUID))
{
if (uid != old_fsuid)
{
current->mm->dumpable = 0;
wmb();
}
current->fsuid = uid;
}
 
/* We emulate fsuid by essentially doing a scaled-down version
* of what we did in setresuid and friends. However, we only
* operate on the fs-specific bits of the process' effective
* capabilities
*
* FIXME - is fsuser used for all CAP_FS_MASK capabilities?
* if not, we might be a bit too harsh here.
*/
if (!issecure(SECURE_NO_SETUID_FIXUP)) {
if (old_fsuid == 0 && current->fsuid != 0) {
cap_t(current->cap_effective) &= ~CAP_FS_MASK;
}
if (old_fsuid != 0 && current->fsuid == 0) {
cap_t(current->cap_effective) |=
(cap_t(current->cap_permitted) & CAP_FS_MASK);
}
}
 
return old_fsuid;
}
 
/*
* "setfsgid()" sets the fsgid - the gid used for filesystem checks;
* the group counterpart of setfsuid() above.
*/
asmlinkage long sys_setfsgid(gid_t gid)
{
int old_fsgid;
 
old_fsgid = current->fsgid;
if (gid == current->gid || gid == current->egid ||
gid == current->sgid || gid == current->fsgid ||
capable(CAP_SETGID))
{
if (gid != old_fsgid)
{
current->mm->dumpable = 0;
wmb();
}
current->fsgid = gid;
}
return old_fsgid;
}
 
asmlinkage long sys_times(struct tms * tbuf)
{
/*
* In the SMP world we might just be unlucky and have one of
* the times increment as we use it. Since the value is an
* atomically safe type this is just fine. Conceptually it's
* as if the syscall took an instant longer to occur.
*/
if (tbuf)
if (copy_to_user(tbuf, &current->times, sizeof(struct tms)))
return -EFAULT;
return jiffies;
}
 
/*
* This needs some heavy checking ...
* I just haven't the stomach for it. I also don't fully
* understand sessions/pgrp etc. Let somebody who does explain it.
*
* OK, I think I have the protection semantics right.... this is really
* only important on a multi-user system anyway, to make sure one user
* can't send a signal to a process owned by another. -TYT, 12/12/91
*
* Auch. Had to add the 'did_exec' flag to conform completely to POSIX.
* LBT 04.03.94
*/
 
asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
{
struct task_struct * p;
int err = -EINVAL;
 
if (!pid)
pid = current->pid;
if (!pgid)
pgid = pid;
if (pgid < 0)
return -EINVAL;
 
/* From this point forward we keep holding onto the tasklist lock
* so that our parent does not change from under us. -DaveM
*/
read_lock(&tasklist_lock);
 
err = -ESRCH;
p = find_task_by_pid(pid);
if (!p)
goto out;
 
if (p->p_pptr == current || p->p_opptr == current) {
err = -EPERM;
if (p->session != current->session)
goto out;
err = -EACCES;
if (p->did_exec)
goto out;
} else if (p != current)
goto out;
err = -EPERM;
if (p->leader)
goto out;
if (pgid != pid) {
struct task_struct * tmp;
for_each_task (tmp) {
if (tmp->pgrp == pgid &&
tmp->session == current->session)
goto ok_pgid;
}
goto out;
}
 
ok_pgid:
p->pgrp = pgid;
err = 0;
out:
/* All paths lead to here, thus we are safe. -DaveM */
read_unlock(&tasklist_lock);
return err;
}
 
asmlinkage long sys_getpgid(pid_t pid)
{
if (!pid) {
return current->pgrp;
} else {
int retval;
struct task_struct *p;
 
read_lock(&tasklist_lock);
p = find_task_by_pid(pid);
 
retval = -ESRCH;
if (p)
retval = p->pgrp;
read_unlock(&tasklist_lock);
return retval;
}
}
 
asmlinkage long sys_getpgrp(void)
{
/* SMP - assuming writes are word atomic this is fine */
return current->pgrp;
}
 
asmlinkage long sys_getsid(pid_t pid)
{
if (!pid) {
return current->session;
} else {
int retval;
struct task_struct *p;
 
read_lock(&tasklist_lock);
p = find_task_by_pid(pid);
 
retval = -ESRCH;
if(p)
retval = p->session;
read_unlock(&tasklist_lock);
return retval;
}
}
 
asmlinkage long sys_setsid(void)
{
struct task_struct * p;
int err = -EPERM;
 
read_lock(&tasklist_lock);
for_each_task(p) {
if (p->pgrp == current->pid)
goto out;
}
 
current->leader = 1;
current->session = current->pgrp = current->pid;
current->tty = NULL;
current->tty_old_pgrp = 0;
err = current->pgrp;
out:
read_unlock(&tasklist_lock);
return err;
}
 
/*
* Supplementary group IDs
*/
asmlinkage long sys_getgroups(int gidsetsize, gid_t *grouplist)
{
int i;
/*
* SMP: Nobody else can change our grouplist. Thus we are
* safe.
*/
 
if (gidsetsize < 0)
return -EINVAL;
i = current->ngroups;
if (gidsetsize) {
if (i > gidsetsize)
return -EINVAL;
if (copy_to_user(grouplist, current->groups, sizeof(gid_t)*i))
return -EFAULT;
}
return i;
}
 
/*
* SMP: Our groups are not shared. We can copy to/from them safely
* without another task interfering.
*/
asmlinkage long sys_setgroups(int gidsetsize, gid_t *grouplist)
{
if (!capable(CAP_SETGID))
return -EPERM;
if ((unsigned) gidsetsize > NGROUPS)
return -EINVAL;
if(copy_from_user(current->groups, grouplist, gidsetsize * sizeof(gid_t)))
return -EFAULT;
current->ngroups = gidsetsize;
return 0;
}
 
static int supplemental_group_member(gid_t grp)
{
int i = current->ngroups;
 
if (i) {
gid_t *groups = current->groups;
do {
if (*groups == grp)
return 1;
groups++;
i--;
} while (i);
}
return 0;
}
 
/*
* Check whether we're fsgid/egid or in the supplemental group..
*/
int in_group_p(gid_t grp)
{
int retval = 1;
if (grp != current->fsgid)
retval = supplemental_group_member(grp);
return retval;
}
 
int in_egroup_p(gid_t grp)
{
int retval = 1;
if (grp != current->egid)
retval = supplemental_group_member(grp);
return retval;
}
 
DECLARE_RWSEM(uts_sem);
 
asmlinkage long sys_newuname(struct new_utsname * name)
{
int errno = 0;
 
down_read(&uts_sem);
if (copy_to_user(name,&system_utsname,sizeof *name))
errno = -EFAULT;
up_read(&uts_sem);
return errno;
}
 
asmlinkage long sys_sethostname(char *name, int len)
{
int errno;
char tmp[__NEW_UTS_LEN];
 
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (len < 0 || len > __NEW_UTS_LEN)
return -EINVAL;
down_write(&uts_sem);
errno = -EFAULT;
if (!copy_from_user(tmp, name, len)) {
memcpy(system_utsname.nodename, tmp, len);
system_utsname.nodename[len] = 0;
errno = 0;
}
up_write(&uts_sem);
return errno;
}
 
asmlinkage long sys_gethostname(char *name, int len)
{
int i, errno;
 
if (len < 0)
return -EINVAL;
down_read(&uts_sem);
i = 1 + strlen(system_utsname.nodename);
if (i > len)
i = len;
errno = 0;
if (copy_to_user(name, system_utsname.nodename, i))
errno = -EFAULT;
up_read(&uts_sem);
return errno;
}
 
/*
* Only setdomainname; getdomainname can be implemented by calling
* uname()
*/
asmlinkage long sys_setdomainname(char *name, int len)
{
int errno;
char tmp[__NEW_UTS_LEN];
 
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (len < 0 || len > __NEW_UTS_LEN)
return -EINVAL;
 
down_write(&uts_sem);
errno = -EFAULT;
if (!copy_from_user(tmp, name, len)) {
memcpy(system_utsname.domainname, tmp, len);
system_utsname.domainname[len] = 0;
errno = 0;
}
up_write(&uts_sem);
return errno;
}
 
asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit *rlim)
{
if (resource >= RLIM_NLIMITS)
return -EINVAL;
else
return copy_to_user(rlim, current->rlim + resource, sizeof(*rlim))
? -EFAULT : 0;
}
 
#if !defined(__ia64__)
 
/*
* Back compatibility for getrlimit. Needed for some apps.
*/
asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit *rlim)
{
struct rlimit x;
if (resource >= RLIM_NLIMITS)
return -EINVAL;
 
memcpy(&x, current->rlim + resource, sizeof(*rlim));
if(x.rlim_cur > 0x7FFFFFFF)
x.rlim_cur = 0x7FFFFFFF;
if(x.rlim_max > 0x7FFFFFFF)
x.rlim_max = 0x7FFFFFFF;
return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0;
}
 
#endif
 
asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit *rlim)
{
struct rlimit new_rlim, *old_rlim;
 
if (resource >= RLIM_NLIMITS)
return -EINVAL;
if(copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
return -EFAULT;
if (new_rlim.rlim_cur > new_rlim.rlim_max)
return -EINVAL;
old_rlim = current->rlim + resource;
if (((new_rlim.rlim_cur > old_rlim->rlim_max) ||
(new_rlim.rlim_max > old_rlim->rlim_max)) &&
!capable(CAP_SYS_RESOURCE))
return -EPERM;
if (resource == RLIMIT_NOFILE) {
if (new_rlim.rlim_cur > NR_OPEN || new_rlim.rlim_max > NR_OPEN)
return -EPERM;
}
*old_rlim = new_rlim;
return 0;
}
 
/*
* It would make sense to put struct rusage in the task_struct,
* except that would make the task_struct be *really big*. After
* task_struct gets moved into malloc'ed memory, it would
* make sense to do this. It will make moving the rest of the information
* a lot simpler! (Which we're not doing right now because we're not
* measuring them yet).
*
* This is SMP safe. Either we are called from sys_getrusage on ourselves
* below (we know we aren't going to exit/disappear and only we change our
* rusage counters), or we are called from wait4() on a process which is
* either stopped or zombied. In the zombied case the task won't get
* reaped till shortly after the call to getrusage(), in both cases the
* task being examined is in a frozen state so the counters won't change.
*
* FIXME! Get the fault counts properly!
*/
int getrusage(struct task_struct *p, int who, struct rusage *ru)
{
struct rusage r;
 
memset((char *) &r, 0, sizeof(r));
switch (who) {
case RUSAGE_SELF:
r.ru_utime.tv_sec = CT_TO_SECS(p->times.tms_utime);
r.ru_utime.tv_usec = CT_TO_USECS(p->times.tms_utime);
r.ru_stime.tv_sec = CT_TO_SECS(p->times.tms_stime);
r.ru_stime.tv_usec = CT_TO_USECS(p->times.tms_stime);
r.ru_minflt = p->min_flt;
r.ru_majflt = p->maj_flt;
r.ru_nswap = p->nswap;
break;
case RUSAGE_CHILDREN:
r.ru_utime.tv_sec = CT_TO_SECS(p->times.tms_cutime);
r.ru_utime.tv_usec = CT_TO_USECS(p->times.tms_cutime);
r.ru_stime.tv_sec = CT_TO_SECS(p->times.tms_cstime);
r.ru_stime.tv_usec = CT_TO_USECS(p->times.tms_cstime);
r.ru_minflt = p->cmin_flt;
r.ru_majflt = p->cmaj_flt;
r.ru_nswap = p->cnswap;
break;
default:
r.ru_utime.tv_sec = CT_TO_SECS(p->times.tms_utime + p->times.tms_cutime);
r.ru_utime.tv_usec = CT_TO_USECS(p->times.tms_utime + p->times.tms_cutime);
r.ru_stime.tv_sec = CT_TO_SECS(p->times.tms_stime + p->times.tms_cstime);
r.ru_stime.tv_usec = CT_TO_USECS(p->times.tms_stime + p->times.tms_cstime);
r.ru_minflt = p->min_flt + p->cmin_flt;
r.ru_majflt = p->maj_flt + p->cmaj_flt;
r.ru_nswap = p->nswap + p->cnswap;
break;
}
return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
}
 
asmlinkage long sys_getrusage(int who, struct rusage *ru)
{
if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN)
return -EINVAL;
return getrusage(current, who, ru);
}
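 
/*
 * User-space sketch of the two supported cases; RUSAGE_CHILDREN would
 * instead report the c*time and c*flt totals accumulated above for
 * already-reaped children. The helper name is illustrative.
 */
#include <stdio.h>
#include <sys/time.h>
#include <sys/resource.h>
 
void print_self_usage(void)
{
	struct rusage ru;
 
	if (getrusage(RUSAGE_SELF, &ru) == 0)
		printf("user %lds  system %lds  major faults %ld\n",
		       (long) ru.ru_utime.tv_sec,
		       (long) ru.ru_stime.tv_sec, ru.ru_majflt);
}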
 
asmlinkage long sys_umask(int mask)
{
mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
return mask;
}
asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5)
{
int error = 0;
int sig;
 
switch (option) {
case PR_SET_PDEATHSIG:
sig = arg2;
if (sig < 0 || sig > _NSIG) {
error = -EINVAL;
break;
}
current->pdeath_signal = sig;
break;
case PR_GET_PDEATHSIG:
error = put_user(current->pdeath_signal, (int *)arg2);
break;
case PR_GET_DUMPABLE:
if (is_dumpable(current))
error = 1;
break;
case PR_SET_DUMPABLE:
if (arg2 != 0 && arg2 != 1) {
error = -EINVAL;
break;
}
current->mm->dumpable = arg2;
break;
 
case PR_SET_UNALIGN:
error = SET_UNALIGN_CTL(current, arg2);
break;
case PR_GET_UNALIGN:
error = GET_UNALIGN_CTL(current, arg2);
break;
case PR_SET_FPEMU:
error = SET_FPEMU_CTL(current, arg2);
break;
case PR_GET_FPEMU:
error = GET_FPEMU_CTL(current, arg2);
break;
case PR_SET_FPEXC:
error = SET_FPEXC_CTL(current, arg2);
break;
case PR_GET_FPEXC:
error = GET_FPEXC_CTL(current, arg2);
break;
 
case PR_GET_KEEPCAPS:
if (current->keep_capabilities)
error = 1;
break;
case PR_SET_KEEPCAPS:
if (arg2 != 0 && arg2 != 1) {
error = -EINVAL;
break;
}
current->keep_capabilities = arg2;
break;
default:
error = -EINVAL;
break;
}
return error;
}
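 
/*
 * One of the options handled above, as a user-space caller would use
 * it: a small sketch in which a worker asks to be sent SIGTERM when
 * its parent dies. The helper name is illustrative.
 */
#include <signal.h>
#include <sys/prctl.h>
 
int watch_parent(void)
{
	return prctl(PR_SET_PDEATHSIG, SIGTERM, 0, 0, 0);
}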
 
EXPORT_SYMBOL(notifier_chain_register);
EXPORT_SYMBOL(notifier_chain_unregister);
EXPORT_SYMBOL(notifier_call_chain);
EXPORT_SYMBOL(register_reboot_notifier);
EXPORT_SYMBOL(unregister_reboot_notifier);
EXPORT_SYMBOL(in_group_p);
EXPORT_SYMBOL(in_egroup_p);
/ptrace.c
0,0 → 1,234
/*
* linux/kernel/ptrace.c
*
* (C) Copyright 1999 Linus Torvalds
*
* Common interfaces for "ptrace()" which we do not want
* to continually duplicate across every architecture.
*/
 
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
 
#include <asm/pgtable.h>
#include <asm/uaccess.h>
 
/*
* Check that we have indeed attached to the thing..
*/
int ptrace_check_attach(struct task_struct *child, int kill)
{
 
if (!(child->ptrace & PT_PTRACED))
return -ESRCH;
 
if (child->p_pptr != current)
return -ESRCH;
 
if (!kill) {
if (child->state != TASK_STOPPED)
return -ESRCH;
#ifdef CONFIG_SMP
/* Make sure the child gets off its CPU.. */
for (;;) {
task_lock(child);
if (!task_has_cpu(child))
break;
task_unlock(child);
do {
if (child->state != TASK_STOPPED)
return -ESRCH;
barrier();
cpu_relax();
} while (task_has_cpu(child));
}
task_unlock(child);
#endif
}
 
/* All systems go.. */
return 0;
}
 
int ptrace_attach(struct task_struct *task)
{
task_lock(task);
if (task->pid <= 1)
goto bad;
if (task == current)
goto bad;
if (!task->mm)
goto bad;
if(((current->uid != task->euid) ||
(current->uid != task->suid) ||
(current->uid != task->uid) ||
(current->gid != task->egid) ||
(current->gid != task->sgid) ||
(!cap_issubset(task->cap_permitted, current->cap_permitted)) ||
(current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
goto bad;
rmb();
if (!is_dumpable(task) && !capable(CAP_SYS_PTRACE))
goto bad;
/* the same process cannot be attached more than once */
if (task->ptrace & PT_PTRACED)
goto bad;
 
/* Go */
task->ptrace |= PT_PTRACED;
if (capable(CAP_SYS_PTRACE))
task->ptrace |= PT_PTRACE_CAP;
task_unlock(task);
 
write_lock_irq(&tasklist_lock);
if (task->p_pptr != current) {
REMOVE_LINKS(task);
task->p_pptr = current;
SET_LINKS(task);
}
write_unlock_irq(&tasklist_lock);
 
send_sig(SIGSTOP, task, 1);
return 0;
 
bad:
task_unlock(task);
return -EPERM;
}
 
int ptrace_detach(struct task_struct *child, unsigned int data)
{
if ((unsigned long) data > _NSIG)
return -EIO;
 
/* Architecture-specific hardware disable .. */
ptrace_disable(child);
 
/* .. re-parent .. */
child->ptrace = 0;
child->exit_code = data;
write_lock_irq(&tasklist_lock);
REMOVE_LINKS(child);
child->p_pptr = child->p_opptr;
SET_LINKS(child);
write_unlock_irq(&tasklist_lock);
 
/* .. and wake it up. */
wake_up_process(child);
return 0;
}
 
/*
* Access another process' address space.
* Source/target buffer must be in kernel space.
* Do not walk the page tables directly; use get_user_pages().
*/
 
int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
{
struct mm_struct *mm;
struct vm_area_struct *vma;
struct page *page;
void *old_buf = buf;
 
/* Worry about races with exit() */
task_lock(tsk);
mm = tsk->mm;
if (mm)
atomic_inc(&mm->mm_users);
task_unlock(tsk);
if (!mm)
return 0;
 
down_read(&mm->mmap_sem);
/* ignore errors, just check how much was successfully transferred */
while (len) {
int bytes, ret, offset;
void *maddr;
 
ret = get_user_pages(current, mm, addr, 1,
write, 1, &page, &vma);
if (ret <= 0)
break;
 
bytes = len;
offset = addr & (PAGE_SIZE-1);
if (bytes > PAGE_SIZE-offset)
bytes = PAGE_SIZE-offset;
 
flush_cache_page(vma, addr);
 
maddr = kmap(page);
if (write) {
memcpy(maddr + offset, buf, bytes);
flush_page_to_ram(page);
flush_icache_user_range(vma, page, addr, len);
set_page_dirty(page);
} else {
memcpy(buf, maddr + offset, bytes);
flush_page_to_ram(page);
}
kunmap(page);
put_page(page);
len -= bytes;
buf += bytes;
addr += bytes;
}
up_read(&mm->mmap_sem);
mmput(mm);
return buf - old_buf;
}
 
int ptrace_readdata(struct task_struct *tsk, unsigned long src, char *dst, int len)
{
int copied = 0;
 
while (len > 0) {
char buf[128];
int this_len, retval;
 
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
retval = access_process_vm(tsk, src, buf, this_len, 0);
if (!retval) {
if (copied)
break;
return -EIO;
}
if (copy_to_user(dst, buf, retval))
return -EFAULT;
copied += retval;
src += retval;
dst += retval;
len -= retval;
}
return copied;
}
 
int ptrace_writedata(struct task_struct *tsk, char * src, unsigned long dst, int len)
{
int copied = 0;
 
while (len > 0) {
char buf[128];
int this_len, retval;
 
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
if (copy_from_user(buf, src, this_len))
return -EFAULT;
retval = access_process_vm(tsk, dst, buf, this_len, 1);
if (!retval) {
if (copied)
break;
return -EIO;
}
copied += retval;
src += retval;
dst += retval;
len -= retval;
}
return copied;
}
/user.c
0,0 → 1,154
/*
* The "user cache".
*
* (C) Copyright 1991-2000 Linus Torvalds
*
* We have a per-user structure to keep track of how many
* processes, files etc the user has claimed, in order to be
* able to have per-user limits for system resources.
*/
 
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/slab.h>
 
/*
* UID task count cache, to get fast user lookup in "alloc_uid"
* when changing user IDs (i.e. setuid() and friends).
*/
#define UIDHASH_BITS 8
#define UIDHASH_SZ (1 << UIDHASH_BITS)
#define UIDHASH_MASK (UIDHASH_SZ - 1)
#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) ^ uid) & UIDHASH_MASK)
#define uidhashentry(uid) (uidhash_table + __uidhashfn(uid))
 
static kmem_cache_t *uid_cachep;
static struct user_struct *uidhash_table[UIDHASH_SZ];
static spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;
 
struct user_struct root_user = {
__count: ATOMIC_INIT(1),
processes: ATOMIC_INIT(1),
files: ATOMIC_INIT(0)
};
 
/*
* These routines must be called with the uidhash spinlock held!
*/
static inline void uid_hash_insert(struct user_struct *up, struct user_struct **hashent)
{
struct user_struct *next = *hashent;
 
up->next = next;
if (next)
next->pprev = &up->next;
up->pprev = hashent;
*hashent = up;
}
 
static inline void uid_hash_remove(struct user_struct *up)
{
struct user_struct *next = up->next;
struct user_struct **pprev = up->pprev;
 
if (next)
next->pprev = pprev;
*pprev = next;
}
 
static inline struct user_struct *uid_hash_find(uid_t uid, struct user_struct **hashent)
{
struct user_struct *next;
 
next = *hashent;
for (;;) {
struct user_struct *up = next;
if (next) {
next = up->next;
if (up->uid != uid)
continue;
atomic_inc(&up->__count);
}
return up;
}
}
 
void free_uid(struct user_struct *up)
{
if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
uid_hash_remove(up);
kmem_cache_free(uid_cachep, up);
spin_unlock(&uidhash_lock);
}
}
 
struct user_struct * alloc_uid(uid_t uid)
{
struct user_struct **hashent = uidhashentry(uid);
struct user_struct *up;
 
spin_lock(&uidhash_lock);
up = uid_hash_find(uid, hashent);
spin_unlock(&uidhash_lock);
 
if (!up) {
struct user_struct *new;
 
new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
if (!new)
return NULL;
new->uid = uid;
atomic_set(&new->__count, 1);
atomic_set(&new->processes, 0);
atomic_set(&new->files, 0);
 
/*
* Before adding this, check whether we raced
* on adding the same user already..
*/
spin_lock(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
kmem_cache_free(uid_cachep, new);
} else {
uid_hash_insert(new, hashent);
up = new;
}
spin_unlock(&uidhash_lock);
 
}
return up;
}
 
void switch_uid(struct user_struct *new_user)
{
struct user_struct *old_user;
 
/* What if a process setreuid()'s and this brings the
* new uid over his NPROC rlimit? We can check this now
* cheaply with the new uid cache, so if it matters
* we should be checking for it. -DaveM
*/
old_user = current->user;
atomic_inc(&new_user->__count);
atomic_inc(&new_user->processes);
atomic_dec(&old_user->processes);
current->user = new_user;
free_uid(old_user);
}
 
 
static int __init uid_cache_init(void)
{
uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
if(!uid_cachep)
panic("Cannot create uid taskcount SLAB cache\n");
 
/* Insert the root user immediately - init already runs with this */
uid_hash_insert(&root_user, uidhashentry(0));
return 0;
}
 
module_init(uid_cache_init);
/timer.c
0,0 → 1,876
/*
* linux/kernel/timer.c
*
* Kernel internal timers, kernel timekeeping, basic process system calls
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
*
* 1997-09-10 Updated NTP code according to technical memorandum Jan '96
* "A Kernel Model for Precision Timekeeping" by Dave Mills
* 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
* serialize accesses to xtime/lost_ticks).
* Copyright (C) 1998 Andrea Arcangeli
* 1999-03-10 Improved NTP compatibility by Ulrich Windl
*/
 
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/timex.h>
#include <linux/delay.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
 
#include <asm/uaccess.h>
 
/*
* Timekeeping variables
*/
 
long tick = (1000000 + HZ/2) / HZ; /* timer interrupt period */
 
/* The current time */
struct timeval xtime __attribute__ ((aligned (16)));
 
/* Don't completely fail for HZ > 500. */
int tickadj = 500/HZ ? : 1; /* microsecs */
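 
/*
 * For example, with HZ == 100 the integer arithmetic above gives
 * tick = (1000000 + 50) / 100 = 10000 microseconds per timer interrupt
 * and tickadj = 5; with HZ == 1024, tick = 977 and the "?:" default
 * leaves tickadj = 1.
 */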
 
DECLARE_TASK_QUEUE(tq_timer);
DECLARE_TASK_QUEUE(tq_immediate);
 
/*
* phase-lock loop variables
*/
/* TIME_ERROR prevents overwriting the CMOS clock */
int time_state = TIME_OK; /* clock synchronization status */
int time_status = STA_UNSYNC; /* clock status bits */
long time_offset; /* time adjustment (us) */
long time_constant = 2; /* pll time constant */
long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
long time_precision = 1; /* clock precision (us) */
long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
long time_phase; /* phase offset (scaled us) */
long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC;
/* frequency offset (scaled ppm)*/
long time_adj; /* tick adjust (scaled 1 / HZ) */
long time_reftime; /* time at last adjustment (s) */
 
long time_adjust;
long time_adjust_step;
 
unsigned long event;
 
extern int do_setitimer(int, struct itimerval *, struct itimerval *);
 
unsigned long volatile jiffies;
 
unsigned int * prof_buffer;
unsigned long prof_len;
unsigned long prof_shift;
 
/*
* Event timer code
*/
#define TVN_BITS 6
#define TVR_BITS 8
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
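 
/*
 * A rough sketch of the resulting timer wheel: with TVR_BITS == 8 and
 * TVN_BITS == 6, tv1 holds timers expiring 0..255 jiffies from
 * timer_jiffies, tv2 covers offsets below 2^14, tv3 below 2^20, tv4
 * below 2^26, and tv5 the rest of the 32-bit range; whenever tv1
 * wraps, buckets from the higher levels are cascaded down one level
 * (see cascade_timers() and internal_add_timer() below).
 */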
 
struct timer_vec {
int index;
struct list_head vec[TVN_SIZE];
};
 
struct timer_vec_root {
int index;
struct list_head vec[TVR_SIZE];
};
 
static struct timer_vec tv5;
static struct timer_vec tv4;
static struct timer_vec tv3;
static struct timer_vec tv2;
static struct timer_vec_root tv1;
 
static struct timer_vec * const tvecs[] = {
(struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
};
 
static struct list_head * run_timer_list_running;
 
#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))
 
void init_timervecs (void)
{
int i;
 
for (i = 0; i < TVN_SIZE; i++) {
INIT_LIST_HEAD(tv5.vec + i);
INIT_LIST_HEAD(tv4.vec + i);
INIT_LIST_HEAD(tv3.vec + i);
INIT_LIST_HEAD(tv2.vec + i);
}
for (i = 0; i < TVR_SIZE; i++)
INIT_LIST_HEAD(tv1.vec + i);
}
 
static unsigned long timer_jiffies;
 
static inline void internal_add_timer(struct timer_list *timer)
{
/*
* must be called with interrupts disabled (timerlist_lock held)
*/
unsigned long expires = timer->expires;
unsigned long idx = expires - timer_jiffies;
struct list_head * vec;
 
if (run_timer_list_running)
vec = run_timer_list_running;
else if (idx < TVR_SIZE) {
int i = expires & TVR_MASK;
vec = tv1.vec + i;
} else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
int i = (expires >> TVR_BITS) & TVN_MASK;
vec = tv2.vec + i;
} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
vec = tv3.vec + i;
} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
vec = tv4.vec + i;
} else if ((signed long) idx < 0) {
/* can happen if you add a timer with expires == jiffies,
* or you set a timer to go off in the past
*/
vec = tv1.vec + tv1.index;
} else if (idx <= 0xffffffffUL) {
int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
vec = tv5.vec + i;
} else {
/* Can only get here on architectures with 64-bit jiffies */
INIT_LIST_HEAD(&timer->list);
return;
}
/*
* Timers are FIFO!
*/
list_add(&timer->list, vec->prev);
}
 
/* Initialize both explicitly - let's try to have them in the same cache line */
spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED;
 
#ifdef CONFIG_SMP
volatile struct timer_list * volatile running_timer;
#define timer_enter(t) do { running_timer = t; mb(); } while (0)
#define timer_exit() do { running_timer = NULL; } while (0)
#define timer_is_running(t) (running_timer == t)
#define timer_synchronize(t) while (timer_is_running(t)) barrier()
#else
#define timer_enter(t) do { } while (0)
#define timer_exit() do { } while (0)
#endif
 
void add_timer(struct timer_list *timer)
{
unsigned long flags;
 
spin_lock_irqsave(&timerlist_lock, flags);
if (timer_pending(timer))
goto bug;
internal_add_timer(timer);
spin_unlock_irqrestore(&timerlist_lock, flags);
return;
bug:
spin_unlock_irqrestore(&timerlist_lock, flags);
printk("bug: kernel timer added twice at %p.\n",
__builtin_return_address(0));
}
 
static inline int detach_timer (struct timer_list *timer)
{
if (!timer_pending(timer))
return 0;
list_del(&timer->list);
return 1;
}
 
int mod_timer(struct timer_list *timer, unsigned long expires)
{
int ret;
unsigned long flags;
 
spin_lock_irqsave(&timerlist_lock, flags);
timer->expires = expires;
ret = detach_timer(timer);
internal_add_timer(timer);
spin_unlock_irqrestore(&timerlist_lock, flags);
return ret;
}
 
int del_timer(struct timer_list * timer)
{
int ret;
unsigned long flags;
 
spin_lock_irqsave(&timerlist_lock, flags);
ret = detach_timer(timer);
timer->list.next = timer->list.prev = NULL;
spin_unlock_irqrestore(&timerlist_lock, flags);
return ret;
}
 
#ifdef CONFIG_SMP
void sync_timers(void)
{
spin_unlock_wait(&global_bh_lock);
}
 
/*
* SMP specific function to delete a periodic timer.
* The caller must, by some other means, prevent the timer from being
* restarted. On exit the timer is not queued and its handler is not
* running on any CPU. The return value is the number of times the
* timer was detached (for reference counting).
*/
 
int del_timer_sync(struct timer_list * timer)
{
int ret = 0;
 
for (;;) {
unsigned long flags;
int running;
 
spin_lock_irqsave(&timerlist_lock, flags);
ret += detach_timer(timer);
timer->list.next = timer->list.prev = 0;
running = timer_is_running(timer);
spin_unlock_irqrestore(&timerlist_lock, flags);
 
if (!running)
break;
 
timer_synchronize(timer);
}
 
return ret;
}
#endif
 
 
static inline void cascade_timers(struct timer_vec *tv)
{
/* cascade all the timers from tv up one level */
struct list_head *head, *curr, *next;
 
head = tv->vec + tv->index;
curr = head->next;
/*
* We are removing _all_ timers from the list, so we don't have to
* detach them individually, just clear the list afterwards.
*/
while (curr != head) {
struct timer_list *tmp;
 
tmp = list_entry(curr, struct timer_list, list);
next = curr->next;
list_del(curr); /* not needed */
internal_add_timer(tmp);
curr = next;
}
INIT_LIST_HEAD(head);
tv->index = (tv->index + 1) & TVN_MASK;
}
 
static inline void run_timer_list(void)
{
spin_lock_irq(&timerlist_lock);
while ((long)(jiffies - timer_jiffies) >= 0) {
LIST_HEAD(queued);
struct list_head *head, *curr;
if (!tv1.index) {
int n = 1;
do {
cascade_timers(tvecs[n]);
} while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);
}
run_timer_list_running = &queued;
repeat:
head = tv1.vec + tv1.index;
curr = head->next;
if (curr != head) {
struct timer_list *timer;
void (*fn)(unsigned long);
unsigned long data;
 
timer = list_entry(curr, struct timer_list, list);
fn = timer->function;
data = timer->data;
 
detach_timer(timer);
timer->list.next = timer->list.prev = NULL;
timer_enter(timer);
spin_unlock_irq(&timerlist_lock);
fn(data);
spin_lock_irq(&timerlist_lock);
timer_exit();
goto repeat;
}
run_timer_list_running = NULL;
++timer_jiffies;
tv1.index = (tv1.index + 1) & TVR_MASK;
 
curr = queued.next;
while (curr != &queued) {
struct timer_list *timer;
 
timer = list_entry(curr, struct timer_list, list);
curr = curr->next;
internal_add_timer(timer);
}
}
spin_unlock_irq(&timerlist_lock);
}
 
spinlock_t tqueue_lock = SPIN_LOCK_UNLOCKED;
 
void tqueue_bh(void)
{
run_task_queue(&tq_timer);
}
 
void immediate_bh(void)
{
run_task_queue(&tq_immediate);
}
 
/*
* this routine handles the overflow of the microsecond field
*
* The tricky bits of code to handle the accurate clock support
* were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
* They were originally developed for SUN and DEC kernels.
* All the kudos should go to Dave for this stuff.
*
*/
static void second_overflow(void)
{
long ltemp;
 
/* Bump the maxerror field */
time_maxerror += time_tolerance >> SHIFT_USEC;
if ( time_maxerror > NTP_PHASE_LIMIT ) {
time_maxerror = NTP_PHASE_LIMIT;
time_status |= STA_UNSYNC;
}
 
/*
* Leap second processing. If in leap-insert state at
* the end of the day, the system clock is set back one
* second; if in leap-delete state, the system clock is
* set ahead one second. The microtime() routine or
* external clock driver will ensure that reported time
* is always monotonic. The ugly divides should be
* replaced.
*/
switch (time_state) {
 
case TIME_OK:
if (time_status & STA_INS)
time_state = TIME_INS;
else if (time_status & STA_DEL)
time_state = TIME_DEL;
break;
 
case TIME_INS:
if (xtime.tv_sec % 86400 == 0) {
xtime.tv_sec--;
time_state = TIME_OOP;
printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
}
break;
 
case TIME_DEL:
if ((xtime.tv_sec + 1) % 86400 == 0) {
xtime.tv_sec++;
time_state = TIME_WAIT;
printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
}
break;
 
case TIME_OOP:
time_state = TIME_WAIT;
break;
 
case TIME_WAIT:
if (!(time_status & (STA_INS | STA_DEL)))
time_state = TIME_OK;
}
 
/*
* Compute the phase adjustment for the next second. In
* PLL mode, the offset is reduced by a fixed factor
* times the time constant. In FLL mode the offset is
* used directly. In either mode, the maximum phase
* adjustment for each second is clamped so as to spread
* the adjustment over not more than the number of
* seconds between updates.
*/
if (time_offset < 0) {
ltemp = -time_offset;
if (!(time_status & STA_FLL))
ltemp >>= SHIFT_KG + time_constant;
if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
time_offset += ltemp;
time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
} else {
ltemp = time_offset;
if (!(time_status & STA_FLL))
ltemp >>= SHIFT_KG + time_constant;
if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
time_offset -= ltemp;
time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
}
 
/*
* Compute the frequency estimate and additional phase
* adjustment due to frequency error for the next
* second. When the PPS signal is engaged, gnaw on the
* watchdog counter and update the frequency computed by
* the pll and the PPS signal.
*/
pps_valid++;
if (pps_valid == PPS_VALID) { /* PPS signal lost */
pps_jitter = MAXTIME;
pps_stabil = MAXFREQ;
time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
STA_PPSWANDER | STA_PPSERROR);
}
ltemp = time_freq + pps_freq;
if (ltemp < 0)
time_adj -= -ltemp >>
(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
else
time_adj += ltemp >>
(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
 
#if HZ == 100
/* Compensate for (HZ==100) != (1 << SHIFT_HZ).
* Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
*/
if (time_adj < 0)
time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
else
time_adj += (time_adj >> 2) + (time_adj >> 5);
#endif
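 
/*
 * A sketch of the arithmetic behind the HZ == 100 correction above:
 * the shift-based scaling assumes HZ == (1 << SHIFT_HZ) == 128, so a
 * real HZ of 100 needs the adjustment multiplied by 128/100 = 1.28.
 * x + (x >> 2) + (x >> 5) = x * (1 + 0.25 + 0.03125) = 1.28125 * x,
 * which is the 128.125 figure (and tiny residual error) quoted above.
 */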
}
 
/* in the NTP reference this is called "hardclock()" */
static void update_wall_time_one_tick(void)
{
if ( (time_adjust_step = time_adjust) != 0 ) {
/* We are doing an adjtime thing.
*
* Prepare time_adjust_step to be within bounds.
* Note that a positive time_adjust means we want the clock
* to run faster.
*
* Limit the amount of the step to be in the range
* -tickadj .. +tickadj
*/
if (time_adjust > tickadj)
time_adjust_step = tickadj;
else if (time_adjust < -tickadj)
time_adjust_step = -tickadj;
/* Reduce by this step the amount of time left */
time_adjust -= time_adjust_step;
}
xtime.tv_usec += tick + time_adjust_step;
/*
* Advance the phase; once it accumulates to a whole microsecond
* (in scaled units), fold those microseconds into xtime as well.
*/
time_phase += time_adj;
if (time_phase <= -FINEUSEC) {
long ltemp = -time_phase >> SHIFT_SCALE;
time_phase += ltemp << SHIFT_SCALE;
xtime.tv_usec -= ltemp;
}
else if (time_phase >= FINEUSEC) {
long ltemp = time_phase >> SHIFT_SCALE;
time_phase -= ltemp << SHIFT_SCALE;
xtime.tv_usec += ltemp;
}
}
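 
/*
 * A worked example of the slewing above (illustrative numbers,
 * assuming HZ == 100 so tick == 10000 and tickadj == 5): after
 * adjtime() leaves time_adjust == +3000 us, each timer tick adds
 * 10005 us instead of 10000, so the 3 ms correction is spread over
 * 600 ticks (6 seconds) rather than stepping the clock.
 */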
 
/*
* Using a loop looks inefficient, but "ticks" is
* usually just one (we shouldn't be losing ticks;
* we do it this way mainly for interrupt latency
* reasons, not because we expect lots of lost
* timer ticks).
*/
static void update_wall_time(unsigned long ticks)
{
do {
ticks--;
update_wall_time_one_tick();
} while (ticks);
 
if (xtime.tv_usec >= 1000000) {
xtime.tv_usec -= 1000000;
xtime.tv_sec++;
second_overflow();
}
}
 
static inline void do_process_times(struct task_struct *p,
unsigned long user, unsigned long system)
{
unsigned long psecs;
 
psecs = (p->times.tms_utime += user);
psecs += (p->times.tms_stime += system);
if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) {
/* Send SIGXCPU every second.. */
if (!(psecs % HZ))
send_sig(SIGXCPU, p, 1);
/* and SIGKILL when we go over max.. */
if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max)
send_sig(SIGKILL, p, 1);
}
}
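 
/*
 * A worked example of the limit check above (illustrative numbers,
 * HZ == 100, rlim_cur == 10 s, rlim_max == 12 s): once the combined
 * utime + stime passes the soft limit, the task receives SIGXCPU once
 * per further second of CPU time (whenever psecs is a multiple of HZ),
 * and SIGKILL as soon as psecs / HZ also exceeds the 12 s hard limit.
 */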
 
static inline void do_it_virt(struct task_struct * p, unsigned long ticks)
{
unsigned long it_virt = p->it_virt_value;
 
if (it_virt) {
it_virt -= ticks;
if (!it_virt) {
it_virt = p->it_virt_incr;
send_sig(SIGVTALRM, p, 1);
}
p->it_virt_value = it_virt;
}
}
 
static inline void do_it_prof(struct task_struct *p)
{
unsigned long it_prof = p->it_prof_value;
 
if (it_prof) {
if (--it_prof == 0) {
it_prof = p->it_prof_incr;
send_sig(SIGPROF, p, 1);
}
p->it_prof_value = it_prof;
}
}
 
void update_one_process(struct task_struct *p, unsigned long user,
unsigned long system, int cpu)
{
p->per_cpu_utime[cpu] += user;
p->per_cpu_stime[cpu] += system;
do_process_times(p, user, system);
do_it_virt(p, user);
do_it_prof(p);
}
 
/*
* Called from the timer interrupt handler to charge one tick to the current
* process. user_tick is 1 if the tick is user time, 0 for system.
*/
void update_process_times(int user_tick)
{
struct task_struct *p = current;
int cpu = smp_processor_id(), system = user_tick ^ 1;
 
update_one_process(p, user_tick, system, cpu);
if (p->pid) {
if (--p->counter <= 0) {
p->counter = 0;
/*
* SCHED_FIFO is priority preemption, so this is
* not the place to decide whether to reschedule a
* SCHED_FIFO task or not - Bhavesh Davda
*/
if (p->policy != SCHED_FIFO) {
p->need_resched = 1;
}
}
if (p->nice > 0)
kstat.per_cpu_nice[cpu] += user_tick;
else
kstat.per_cpu_user[cpu] += user_tick;
kstat.per_cpu_system[cpu] += system;
} else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
kstat.per_cpu_system[cpu] += system;
}
 
/*
* Nr of active tasks - counted in fixed-point numbers
*/
static unsigned long count_active_tasks(void)
{
struct task_struct *p;
unsigned long nr = 0;
 
read_lock(&tasklist_lock);
for_each_task(p) {
if ((p->state == TASK_RUNNING ||
(p->state & TASK_UNINTERRUPTIBLE)))
nr += FIXED_1;
}
read_unlock(&tasklist_lock);
return nr;
}
 
/*
* Hmm.. Changed this, as the GNU make sources (load.c) seem to
* imply that avenrun[] is the standard name for this kind of thing.
* Nothing else seems to be standardized: the fractional size etc
* all seem to differ on different machines.
*/
unsigned long avenrun[3];
 
static inline void calc_load(unsigned long ticks)
{
unsigned long active_tasks; /* fixed-point */
static int count = LOAD_FREQ;
 
count -= ticks;
if (count < 0) {
count += LOAD_FREQ;
active_tasks = count_active_tasks();
CALC_LOAD(avenrun[0], EXP_1, active_tasks);
CALC_LOAD(avenrun[1], EXP_5, active_tasks);
CALC_LOAD(avenrun[2], EXP_15, active_tasks);
}
}
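 
/*
 * A sketch of what CALC_LOAD does (see the definitions in
 * <linux/sched.h>): each sample updates an exponentially decaying
 * average in FSHIFT-bit fixed point, roughly
 *
 *	load = (load * EXP_N + active_tasks * (FIXED_1 - EXP_N)) >> FSHIFT
 *
 * where EXP_1/EXP_5/EXP_15 approximate exp(-5s/1min), exp(-5s/5min)
 * and exp(-5s/15min) scaled by FIXED_1, and LOAD_FREQ keeps the
 * sampling interval at 5 seconds' worth of ticks.
 */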
 
/* jiffies at the most recent update of wall time */
unsigned long wall_jiffies;
 
/*
* This rwlock protects us from SMP races while playing with xtime. -arca
*/
rwlock_t xtime_lock = RW_LOCK_UNLOCKED;
 
static inline void update_times(void)
{
unsigned long ticks;
 
/*
* update_times() is run from the timer_bh handler, so we know that
* irqs are locally enabled and we don't need to save/restore the
* local CPU's flags here. -arca
*/
write_lock_irq(&xtime_lock);
vxtime_lock();
 
ticks = jiffies - wall_jiffies;
if (ticks) {
wall_jiffies += ticks;
update_wall_time(ticks);
}
vxtime_unlock();
write_unlock_irq(&xtime_lock);
calc_load(ticks);
}
 
void timer_bh(void)
{
update_times();
run_timer_list();
}
 
void do_timer(struct pt_regs *regs)
{
(*(unsigned long *)&jiffies)++;
#ifndef CONFIG_SMP
/* SMP process accounting uses the local APIC timer */
 
update_process_times(user_mode(regs));
#endif
mark_bh(TIMER_BH);
if (TQ_ACTIVE(tq_timer))
mark_bh(TQUEUE_BH);
}
 
#if !defined(__alpha__) && !defined(__ia64__)
 
/*
* For backwards compatibility? This can be done in libc so Alpha
* and all newer ports shouldn't need it.
*/
asmlinkage unsigned long sys_alarm(unsigned int seconds)
{
struct itimerval it_new, it_old;
unsigned int oldalarm;
 
it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
it_new.it_value.tv_sec = seconds;
it_new.it_value.tv_usec = 0;
do_setitimer(ITIMER_REAL, &it_new, &it_old);
oldalarm = it_old.it_value.tv_sec;
/* ehhh.. We can't return 0 if we have an alarm pending.. */
/* And we'd better return too much than too little anyway */
if (it_old.it_value.tv_usec)
oldalarm++;
return oldalarm;
}
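 
/*
 * Illustrative example of the rounding above: if a previous alarm had
 * 2.5 seconds left, it_old comes back as 2 seconds plus 500000
 * microseconds, and the nonzero tv_usec bumps the return value to 3,
 * so the caller never sees less time reported than actually remains.
 */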
 
#endif
 
#ifndef __alpha__
 
/*
* The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
* should be moved into arch/i386 instead?
*/
 
/**
* sys_getpid - return the thread group id of the current process
*
* Note, despite the name, this returns the tgid not the pid. The tgid and
* the pid are identical unless CLONE_THREAD was specified on clone() in
* which case the tgid is the same in all threads of the same group.
*
* This is SMP safe as current->tgid does not change.
*/
asmlinkage long sys_getpid(void)
{
return current->tgid;
}
 
/*
* This is not strictly SMP safe: p_opptr could change
* from under us. However, rather than getting any lock
* we can use an optimistic algorithm: get the parent
* pid, and go back and check that the parent is still
* the same. If it has changed (which is extremely unlikely
* indeed), we just try again..
*
* NOTE! This depends on the fact that even if we _do_
* get an old value of "parent", we can happily dereference
* the pointer: we just can't necessarily trust the result
* until we know that the parent pointer is valid.
*
* The "mb()" macro is a memory barrier - a synchronizing
* event. It also makes sure that gcc doesn't optimize
* away the necessary memory references.. The barrier doesn't
* have to have all that strong semantics: on x86 we don't
* really require a synchronizing instruction, for example.
* The barrier is more important for code generation than
* for any real memory ordering semantics (even if there is
* a small window for a race, using the old pointer is
* harmless for a while).
*/
asmlinkage long sys_getppid(void)
{
int pid;
struct task_struct * me = current;
struct task_struct * parent;
 
parent = me->p_opptr;
for (;;) {
pid = parent->pid;
#if CONFIG_SMP
{
struct task_struct *old = parent;
mb();
parent = me->p_opptr;
if (old != parent)
continue;
}
#endif
break;
}
return pid;
}
 
asmlinkage long sys_getuid(void)
{
/* Only we change this so SMP safe */
return current->uid;
}
 
asmlinkage long sys_geteuid(void)
{
/* Only we change this so SMP safe */
return current->euid;
}
 
asmlinkage long sys_getgid(void)
{
/* Only we change this so SMP safe */
return current->gid;
}
 
asmlinkage long sys_getegid(void)
{
/* Only we change this so SMP safe */
return current->egid;
}
 
#endif
 
/* Thread ID - the internal kernel "pid" */
asmlinkage long sys_gettid(void)
{
return current->pid;
}
 
asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
{
struct timespec t;
unsigned long expire;
 
if(copy_from_user(&t, rqtp, sizeof(struct timespec)))
return -EFAULT;
 
if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0)
return -EINVAL;
 
 
if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
current->policy != SCHED_OTHER)
{
/*
* Short delay requests up to 2 ms will be handled with
* high precision by a busy wait for all real-time processes.
*
* It's important on SMP not to do this while holding locks.
*/
udelay((t.tv_nsec + 999) / 1000);
return 0;
}
 
expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
 
current->state = TASK_INTERRUPTIBLE;
expire = schedule_timeout(expire);
 
if (expire) {
if (rmtp) {
jiffies_to_timespec(expire, &t);
if (copy_to_user(rmtp, &t, sizeof(struct timespec)))
return -EFAULT;
}
return -EINTR;
}
return 0;
}
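 
/*
 * A sketch of the timeout rounding above, assuming HZ == 100 (10 ms
 * ticks): a request for 15 ms becomes 2 jiffies from
 * timespec_to_jiffies() (which rounds the nanoseconds up to whole
 * ticks), plus 1 more because the request is nonzero, so
 * schedule_timeout() waits 3 ticks and the sleep lasts at least the
 * requested 15 ms even though the current tick is partially elapsed.
 */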
 
/sysctl.c
0,0 → 1,1524
/*
* sysctl.c: General linux system control interface
*
* Begun 24 March 1995, Stephen Tweedie
* Added /proc support, Dec 1995
* Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas.
* Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver.
* Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver.
* Dynamic registration fixes, Stephen Tweedie.
* Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn.
* Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris
* Horn.
* Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer.
* Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer.
* Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill
* Wendling.
* The list_for_each() macro wasn't appropriate for the sysctl loop.
* Removed it and replaced it with older style, 03/23/00, Bill Wendling
*/
 
#include <linux/config.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/swapctl.h>
#include <linux/proc_fs.h>
#include <linux/ctype.h>
#include <linux/utsname.h>
#include <linux/capability.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/sysrq.h>
#include <linux/highuid.h>
#include <linux/swap.h>
 
#include <asm/uaccess.h>
 
#ifdef CONFIG_ROOT_NFS
#include <linux/nfs_fs.h>
#endif
 
#if defined(CONFIG_SYSCTL)
 
/* External variables not in a header file. */
extern int panic_timeout;
extern int C_A_D;
extern int bdf_prm[], bdflush_min[], bdflush_max[];
extern int sysctl_overcommit_memory;
extern int max_threads;
extern atomic_t nr_queued_signals;
extern int max_queued_signals;
extern int sysrq_enabled;
extern int core_uses_pid;
extern int core_setuid_ok;
extern char core_pattern[];
extern int cad_pid;
extern int laptop_mode;
extern int block_dump;
 
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
static int minolduid;
 
#ifdef CONFIG_KMOD
extern char modprobe_path[];
#endif
#ifdef CONFIG_HOTPLUG
extern char hotplug_path[];
#endif
#ifdef CONFIG_CHR_DEV_SG
extern int sg_big_buff;
#endif
#ifdef CONFIG_SYSVIPC
extern size_t shm_ctlmax;
extern size_t shm_ctlall;
extern int shm_ctlmni;
extern int msg_ctlmax;
extern int msg_ctlmnb;
extern int msg_ctlmni;
extern int sem_ctls[];
#endif
 
extern int exception_trace;
 
#ifdef __sparc__
extern char reboot_command [];
extern int stop_a_enabled;
#endif
 
#ifdef CONFIG_ARCH_S390
#ifdef CONFIG_MATHEMU
extern int sysctl_ieee_emulation_warnings;
#endif
extern int sysctl_userprocess_debug;
#endif
 
#ifdef CONFIG_PPC32
extern unsigned long zero_paged_on, powersave_nap;
int proc_dol2crvec(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp);
int proc_dol3crvec(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp);
#endif
 
#ifdef CONFIG_BSD_PROCESS_ACCT
extern int acct_parm[];
#endif
 
extern int pgt_cache_water[];
 
static int parse_table(int *, int, void *, size_t *, void *, size_t,
ctl_table *, void **);
static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp);
 
static ctl_table root_table[];
static struct ctl_table_header root_table_header =
{ root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) };
 
static ctl_table kern_table[];
static ctl_table vm_table[];
#ifdef CONFIG_NET
extern ctl_table net_table[];
#endif
static ctl_table proc_table[];
static ctl_table fs_table[];
static ctl_table debug_table[];
static ctl_table dev_table[];
extern ctl_table random_table[];
 
/* /proc declarations: */
 
#ifdef CONFIG_PROC_FS
 
static ssize_t proc_readsys(struct file *, char *, size_t, loff_t *);
static ssize_t proc_writesys(struct file *, const char *, size_t, loff_t *);
static int proc_sys_permission(struct inode *, int);
 
struct file_operations proc_sys_file_operations = {
read: proc_readsys,
write: proc_writesys,
};
 
static struct inode_operations proc_sys_inode_operations = {
permission: proc_sys_permission,
};
 
extern struct proc_dir_entry *proc_sys_root;
 
static void register_proc_table(ctl_table *, struct proc_dir_entry *);
static void unregister_proc_table(ctl_table *, struct proc_dir_entry *);
#endif
 
/* The default sysctl tables: */
 
static ctl_table root_table[] = {
{CTL_KERN, "kernel", NULL, 0, 0555, kern_table},
{CTL_VM, "vm", NULL, 0, 0555, vm_table},
#ifdef CONFIG_NET
{CTL_NET, "net", NULL, 0, 0555, net_table},
#endif
{CTL_PROC, "proc", NULL, 0, 0555, proc_table},
{CTL_FS, "fs", NULL, 0, 0555, fs_table},
{CTL_DEBUG, "debug", NULL, 0, 0555, debug_table},
{CTL_DEV, "dev", NULL, 0, 0555, dev_table},
{0}
};
 
static ctl_table kern_table[] = {
{KERN_OSTYPE, "ostype", system_utsname.sysname, 64,
0444, NULL, &proc_doutsstring, &sysctl_string},
{KERN_OSRELEASE, "osrelease", system_utsname.release, 64,
0444, NULL, &proc_doutsstring, &sysctl_string},
{KERN_VERSION, "version", system_utsname.version, 64,
0444, NULL, &proc_doutsstring, &sysctl_string},
{KERN_NODENAME, "hostname", system_utsname.nodename, 64,
0644, NULL, &proc_doutsstring, &sysctl_string},
{KERN_DOMAINNAME, "domainname", system_utsname.domainname, 64,
0644, NULL, &proc_doutsstring, &sysctl_string},
{KERN_PANIC, "panic", &panic_timeout, sizeof(int),
0644, NULL, &proc_dointvec},
{KERN_CORE_USES_PID, "core_uses_pid", &core_uses_pid, sizeof(int),
0644, NULL, &proc_dointvec},
{KERN_CORE_SETUID, "core_setuid_ok", &core_setuid_ok, sizeof(int),
0644, NULL, &proc_dointvec},
{KERN_CORE_PATTERN, "core_pattern", core_pattern, 64,
0644, NULL, &proc_dostring, &sysctl_string},
{KERN_TAINTED, "tainted", &tainted, sizeof(int),
0644, NULL, &proc_dointvec},
{KERN_CAP_BSET, "cap-bound", &cap_bset, sizeof(kernel_cap_t),
0600, NULL, &proc_dointvec_bset},
#ifdef CONFIG_BLK_DEV_INITRD
{KERN_REALROOTDEV, "real-root-dev", &real_root_dev, sizeof(int),
0644, NULL, &proc_dointvec},
#endif
#ifdef __sparc__
{KERN_SPARC_REBOOT, "reboot-cmd", reboot_command,
256, 0644, NULL, &proc_dostring, &sysctl_string },
{KERN_SPARC_STOP_A, "stop-a", &stop_a_enabled, sizeof (int),
0644, NULL, &proc_dointvec},
#endif
#ifdef CONFIG_PPC32
{KERN_PPC_ZEROPAGED, "zero-paged", &zero_paged_on, sizeof(int),
0644, NULL, &proc_dointvec},
{KERN_PPC_POWERSAVE_NAP, "powersave-nap", &powersave_nap, sizeof(int),
0644, NULL, &proc_dointvec},
{KERN_PPC_L2CR, "l2cr", NULL, 0,
0644, NULL, &proc_dol2crvec},
{KERN_PPC_L3CR, "l3cr", NULL, 0,
0644, NULL, &proc_dol3crvec},
#endif
{KERN_CTLALTDEL, "ctrl-alt-del", &C_A_D, sizeof(int),
0644, NULL, &proc_dointvec},
{KERN_PRINTK, "printk", &console_loglevel, 4*sizeof(int),
0644, NULL, &proc_dointvec},
#ifdef CONFIG_KMOD
{KERN_MODPROBE, "modprobe", &modprobe_path, 256,
0644, NULL, &proc_dostring, &sysctl_string },
#endif
#ifdef CONFIG_HOTPLUG
{KERN_HOTPLUG, "hotplug", &hotplug_path, 256,
0644, NULL, &proc_dostring, &sysctl_string },
#endif
#ifdef CONFIG_CHR_DEV_SG
{KERN_SG_BIG_BUFF, "sg-big-buff", &sg_big_buff, sizeof (int),
0444, NULL, &proc_dointvec},
#endif
#ifdef CONFIG_BSD_PROCESS_ACCT
{KERN_ACCT, "acct", &acct_parm, 3*sizeof(int),
0644, NULL, &proc_dointvec},
#endif
{KERN_RTSIGNR, "rtsig-nr", &nr_queued_signals, sizeof(int),
0444, NULL, &proc_dointvec},
{KERN_RTSIGMAX, "rtsig-max", &max_queued_signals, sizeof(int),
0644, NULL, &proc_dointvec},
#ifdef CONFIG_SYSVIPC
{KERN_SHMMAX, "shmmax", &shm_ctlmax, sizeof (size_t),
0644, NULL, &proc_doulongvec_minmax},
{KERN_SHMALL, "shmall", &shm_ctlall, sizeof (size_t),
0644, NULL, &proc_doulongvec_minmax},
{KERN_SHMMNI, "shmmni", &shm_ctlmni, sizeof (int),
0644, NULL, &proc_dointvec},
{KERN_MSGMAX, "msgmax", &msg_ctlmax, sizeof (int),
0644, NULL, &proc_dointvec},
{KERN_MSGMNI, "msgmni", &msg_ctlmni, sizeof (int),
0644, NULL, &proc_dointvec},
{KERN_MSGMNB, "msgmnb", &msg_ctlmnb, sizeof (int),
0644, NULL, &proc_dointvec},
{KERN_SEM, "sem", &sem_ctls, 4*sizeof (int),
0644, NULL, &proc_dointvec},
#endif
#ifdef CONFIG_MAGIC_SYSRQ
{KERN_SYSRQ, "sysrq", &sysrq_enabled, sizeof (int),
0644, NULL, &proc_dointvec},
#endif
{KERN_CADPID, "cad_pid", &cad_pid, sizeof (int),
0600, NULL, &proc_dointvec},
{KERN_MAX_THREADS, "threads-max", &max_threads, sizeof(int),
0644, NULL, &proc_dointvec},
{KERN_RANDOM, "random", NULL, 0, 0555, random_table},
{KERN_OVERFLOWUID, "overflowuid", &overflowuid, sizeof(int), 0644, NULL,
&proc_dointvec_minmax, &sysctl_intvec, NULL,
&minolduid, &maxolduid},
{KERN_OVERFLOWGID, "overflowgid", &overflowgid, sizeof(int), 0644, NULL,
&proc_dointvec_minmax, &sysctl_intvec, NULL,
&minolduid, &maxolduid},
#ifdef CONFIG_ARCH_S390
#ifdef CONFIG_MATHEMU
{KERN_IEEE_EMULATION_WARNINGS,"ieee_emulation_warnings",
&sysctl_ieee_emulation_warnings,sizeof(int),0644,NULL,&proc_dointvec},
#endif
{KERN_S390_USER_DEBUG_LOGGING,"userprocess_debug",
&sysctl_userprocess_debug,sizeof(int),0644,NULL,&proc_dointvec},
#endif
#ifdef __x86_64__
{KERN_EXCEPTION_TRACE,"exception-trace",
&exception_trace,sizeof(int),0644,NULL,&proc_dointvec},
#endif
{0}
};
 
static ctl_table vm_table[] = {
{VM_GFP_DEBUG, "vm_gfp_debug",
&vm_gfp_debug, sizeof(int), 0644, NULL, &proc_dointvec},
{VM_VFS_SCAN_RATIO, "vm_vfs_scan_ratio",
&vm_vfs_scan_ratio, sizeof(int), 0644, NULL, &proc_dointvec},
{VM_CACHE_SCAN_RATIO, "vm_cache_scan_ratio",
&vm_cache_scan_ratio, sizeof(int), 0644, NULL, &proc_dointvec},
{VM_MAPPED_RATIO, "vm_mapped_ratio",
&vm_mapped_ratio, sizeof(int), 0644, NULL, &proc_dointvec},
{VM_LRU_BALANCE_RATIO, "vm_lru_balance_ratio",
&vm_lru_balance_ratio, sizeof(int), 0644, NULL, &proc_dointvec},
{VM_PASSES, "vm_passes",
&vm_passes, sizeof(int), 0644, NULL, &proc_dointvec},
{VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL,
&proc_dointvec_minmax, &sysctl_intvec, NULL,
&bdflush_min, &bdflush_max},
{VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory,
sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec},
{VM_PAGERDAEMON, "kswapd",
&pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec},
{VM_PGT_CACHE, "pagetable_cache",
&pgt_cache_water, 2*sizeof(int), 0644, NULL, &proc_dointvec},
{VM_PAGE_CLUSTER, "page-cluster",
&page_cluster, sizeof(int), 0644, NULL, &proc_dointvec},
{VM_MIN_READAHEAD, "min-readahead",
&vm_min_readahead,sizeof(int), 0644, NULL, &proc_dointvec},
{VM_MAX_READAHEAD, "max-readahead",
&vm_max_readahead,sizeof(int), 0644, NULL, &proc_dointvec},
{VM_MAX_MAP_COUNT, "max_map_count",
&max_map_count, sizeof(int), 0644, NULL, &proc_dointvec},
{VM_LAPTOP_MODE, "laptop_mode",
&laptop_mode, sizeof(int), 0644, NULL, &proc_dointvec},
{VM_BLOCK_DUMP, "block_dump",
&block_dump, sizeof(int), 0644, NULL, &proc_dointvec},
{0}
};
 
static ctl_table proc_table[] = {
{0}
};
 
static ctl_table fs_table[] = {
{FS_NRINODE, "inode-nr", &inodes_stat, 2*sizeof(int),
0444, NULL, &proc_dointvec},
{FS_STATINODE, "inode-state", &inodes_stat, 7*sizeof(int),
0444, NULL, &proc_dointvec},
{FS_NRFILE, "file-nr", &files_stat, 3*sizeof(int),
0444, NULL, &proc_dointvec},
{FS_MAXFILE, "file-max", &files_stat.max_files, sizeof(int),
0644, NULL, &proc_dointvec},
{FS_DENTRY, "dentry-state", &dentry_stat, 6*sizeof(int),
0444, NULL, &proc_dointvec},
{FS_OVERFLOWUID, "overflowuid", &fs_overflowuid, sizeof(int), 0644, NULL,
&proc_dointvec_minmax, &sysctl_intvec, NULL,
&minolduid, &maxolduid},
{FS_OVERFLOWGID, "overflowgid", &fs_overflowgid, sizeof(int), 0644, NULL,
&proc_dointvec_minmax, &sysctl_intvec, NULL,
&minolduid, &maxolduid},
{FS_LEASES, "leases-enable", &leases_enable, sizeof(int),
0644, NULL, &proc_dointvec},
{FS_DIR_NOTIFY, "dir-notify-enable", &dir_notify_enable,
sizeof(int), 0644, NULL, &proc_dointvec},
{FS_LEASE_TIME, "lease-break-time", &lease_break_time, sizeof(int),
0644, NULL, &proc_dointvec},
{0}
};
 
static ctl_table debug_table[] = {
{0}
};
 
static ctl_table dev_table[] = {
{0}
};
 
extern void init_irq_proc (void);
 
void __init sysctl_init(void)
{
#ifdef CONFIG_PROC_FS
register_proc_table(root_table, proc_sys_root);
init_irq_proc();
#endif
}
 
int do_sysctl(int *name, int nlen, void *oldval, size_t *oldlenp,
void *newval, size_t newlen)
{
struct list_head *tmp;
 
if (nlen <= 0 || nlen >= CTL_MAXNAME)
return -ENOTDIR;
if (oldval) {
int old_len;
if (!oldlenp || get_user(old_len, oldlenp))
return -EFAULT;
}
tmp = &root_table_header.ctl_entry;
do {
struct ctl_table_header *head =
list_entry(tmp, struct ctl_table_header, ctl_entry);
void *context = NULL;
int error = parse_table(name, nlen, oldval, oldlenp,
newval, newlen, head->ctl_table,
&context);
if (context)
kfree(context);
if (error != -ENOTDIR)
return error;
tmp = tmp->next;
} while (tmp != &root_table_header.ctl_entry);
return -ENOTDIR;
}
 
extern asmlinkage long sys_sysctl(struct __sysctl_args *args)
{
struct __sysctl_args tmp;
int error;
 
if (copy_from_user(&tmp, args, sizeof(tmp)))
return -EFAULT;
lock_kernel();
error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
tmp.newval, tmp.newlen);
unlock_kernel();
return error;
}
 
/*
* ctl_perm does NOT grant the superuser all rights automatically, because
* some sysctl variables are readonly even to root.
*/
 
static int test_perm(int mode, int op)
{
if (!current->euid)
mode >>= 6;
else if (in_egroup_p(0))
mode >>= 3;
if ((mode & op & 0007) == op)
return 0;
return -EACCES;
}
 
static inline int ctl_perm(ctl_table *table, int op)
{
return test_perm(table->mode, op);
}
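 
/*
 * A worked example of the check above: for a 0444 ("read-only") entry,
 * even root (euid 0) only gets the owner bits, mode >> 6 == 04; a write
 * attempt has op == 002, and (04 & 002 & 0007) != 002, so the write is
 * refused with -EACCES -- exactly the "readonly even to root"
 * behaviour described above.
 */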
 
static int parse_table(int *name, int nlen,
void *oldval, size_t *oldlenp,
void *newval, size_t newlen,
ctl_table *table, void **context)
{
int n;
repeat:
if (!nlen)
return -ENOTDIR;
if (get_user(n, name))
return -EFAULT;
for ( ; table->ctl_name; table++) {
if (n == table->ctl_name || table->ctl_name == CTL_ANY) {
int error;
if (table->child) {
if (ctl_perm(table, 001))
return -EPERM;
if (table->strategy) {
error = table->strategy(
table, name, nlen,
oldval, oldlenp,
newval, newlen, context);
if (error)
return error;
}
name++;
nlen--;
table = table->child;
goto repeat;
}
error = do_sysctl_strategy(table, name, nlen,
oldval, oldlenp,
newval, newlen, context);
return error;
}
}
return -ENOTDIR;
}
 
/* Perform the actual read/write of a sysctl table entry. */
int do_sysctl_strategy (ctl_table *table,
int *name, int nlen,
void *oldval, size_t *oldlenp,
void *newval, size_t newlen, void **context)
{
int op = 0, rc;
size_t len;
 
if (oldval)
op |= 004;
if (newval)
op |= 002;
if (ctl_perm(table, op))
return -EPERM;
 
if (table->strategy) {
rc = table->strategy(table, name, nlen, oldval, oldlenp,
newval, newlen, context);
if (rc < 0)
return rc;
if (rc > 0)
return 0;
}
 
/* If there is no strategy routine, or if the strategy returns
* zero, proceed with automatic r/w */
if (table->data && table->maxlen) {
if (oldval && oldlenp) {
if (get_user(len, oldlenp))
return -EFAULT;
if (len) {
if (len > table->maxlen)
len = table->maxlen;
if(copy_to_user(oldval, table->data, len))
return -EFAULT;
if(put_user(len, oldlenp))
return -EFAULT;
}
}
if (newval && newlen) {
len = newlen;
if (len > table->maxlen)
len = table->maxlen;
if(copy_from_user(table->data, newval, len))
return -EFAULT;
}
}
return 0;
}
 
/**
* register_sysctl_table - register a sysctl hierarchy
* @table: the top-level table structure
* @insert_at_head: whether the entry should be inserted in front or at the end
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
* array. An entry with a ctl_name of 0 terminates the table.
*
* The members of the &ctl_table structure are used as follows:
*
* ctl_name - This is the numeric sysctl value used by sysctl(2). The number
* must be unique within that level of sysctl
*
* procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
* enter a sysctl file
*
* data - a pointer to data for use by proc_handler
*
* maxlen - the maximum size in bytes of the data
*
* mode - the file permissions for the /proc/sys file, and for sysctl(2)
*
* child - a pointer to the child sysctl table if this entry is a directory, or
* %NULL.
*
* proc_handler - the text handler routine (described below)
*
* strategy - the strategy routine (described below)
*
* de - for internal use by the sysctl routines
*
* extra1, extra2 - extra pointers usable by the proc handler routines
*
* Leaf nodes in the sysctl tree will be represented by a single file
* under /proc; non-leaf nodes will be represented by directories.
*
* sysctl(2) can automatically manage read and write requests through
* the sysctl table. The data and maxlen fields of the ctl_table
* struct enable minimal validation of the values being written to be
* performed, and the mode field allows minimal authentication.
*
* More sophisticated management can be enabled by the provision of a
* strategy routine with the table entry. This will be called before
* any automatic read or write of the data is performed.
*
* The strategy routine may return
*
* < 0 - Error occurred (error is passed to user process)
*
* 0 - OK - proceed with automatic read or write.
*
* > 0 - OK - read or write has been done by the strategy routine, so
* return immediately.
*
* There must be a proc_handler routine for any terminal nodes
* mirrored under /proc/sys (non-terminals are handled by a built-in
* directory handler). Several default handlers are available to
* cover common cases -
*
* proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
* proc_dointvec_minmax(), proc_doulongvec_ms_jiffies_minmax(),
* proc_doulongvec_minmax()
*
* It is the handler's job to read the input buffer from user memory
* and process it. The handler should return 0 on success.
*
* This routine returns %NULL on a failure to register, and a pointer
* to the table header on success.
*/
struct ctl_table_header *register_sysctl_table(ctl_table * table,
int insert_at_head)
{
struct ctl_table_header *tmp;
tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL);
if (!tmp)
return NULL;
tmp->ctl_table = table;
INIT_LIST_HEAD(&tmp->ctl_entry);
if (insert_at_head)
list_add(&tmp->ctl_entry, &root_table_header.ctl_entry);
else
list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
#ifdef CONFIG_PROC_FS
register_proc_table(table, proc_sys_root);
#endif
return tmp;
}
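 
/*
 * A usage sketch (all identifiers below are hypothetical): a driver
 * that wants a writable /proc/sys/dev/foo integer could do roughly
 *
 *	static int foo_val;
 *	static ctl_table foo_dev_table[] = {
 *		{1, "foo", &foo_val, sizeof(int), 0644, NULL, &proc_dointvec},
 *		{0}
 *	};
 *	static ctl_table foo_root_table[] = {
 *		{CTL_DEV, "dev", NULL, 0, 0555, foo_dev_table},
 *		{0}
 *	};
 *	static struct ctl_table_header *foo_header;
 *
 *	foo_header = register_sysctl_table(foo_root_table, 0);
 *	...
 *	unregister_sysctl_table(foo_header);
 *
 * The {0} sentinels terminate each level, and the numeric ctl_name
 * (1 here) only needs to be unique within its own directory.
 */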
 
/**
* unregister_sysctl_table - unregister a sysctl table hierarchy
* @header: the header returned from register_sysctl_table
*
* Unregisters the sysctl table and all children. proc entries may not
* actually be removed until they are no longer used by anyone.
*/
void unregister_sysctl_table(struct ctl_table_header * header)
{
list_del(&header->ctl_entry);
#ifdef CONFIG_PROC_FS
unregister_proc_table(header->ctl_table, proc_sys_root);
#endif
kfree(header);
}
 
/*
* /proc/sys support
*/
 
#ifdef CONFIG_PROC_FS
 
/* Scan the sysctl entries in table and add them all into /proc */
static void register_proc_table(ctl_table * table, struct proc_dir_entry *root)
{
struct proc_dir_entry *de;
int len;
mode_t mode;
for (; table->ctl_name; table++) {
/* Can't do anything without a proc name. */
if (!table->procname)
continue;
/* Maybe we can't do anything with it... */
if (!table->proc_handler && !table->child) {
printk(KERN_WARNING "SYSCTL: Can't register %s\n",
table->procname);
continue;
}
 
len = strlen(table->procname);
mode = table->mode;
 
de = NULL;
if (table->proc_handler)
mode |= S_IFREG;
else {
mode |= S_IFDIR;
for (de = root->subdir; de; de = de->next) {
if (proc_match(len, table->procname, de))
break;
}
/* If the subdir exists already, de is non-NULL */
}
 
if (!de) {
de = create_proc_entry(table->procname, mode, root);
if (!de)
continue;
de->data = (void *) table;
if (table->proc_handler) {
de->proc_fops = &proc_sys_file_operations;
de->proc_iops = &proc_sys_inode_operations;
}
}
table->de = de;
if (de->mode & S_IFDIR)
register_proc_table(table->child, de);
}
}
 
/*
* Unregister a /proc sysctl table and any subdirectories.
*/
static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root)
{
struct proc_dir_entry *de;
for (; table->ctl_name; table++) {
if (!(de = table->de))
continue;
if (de->mode & S_IFDIR) {
if (!table->child) {
printk (KERN_ALERT "Help - malformed sysctl tree on free\n");
continue;
}
unregister_proc_table(table->child, de);
 
/* Don't unregister directories which still have entries.. */
if (de->subdir)
continue;
}
 
/* Don't unregister proc entries that are still being used.. */
if (atomic_read(&de->count))
continue;
 
table->de = NULL;
remove_proc_entry(table->procname, root);
}
}
 
static ssize_t do_rw_proc(int write, struct file * file, char * buf,
size_t count, loff_t *ppos)
{
int op;
struct proc_dir_entry *de;
struct ctl_table *table;
size_t res;
ssize_t error;
de = (struct proc_dir_entry*) file->f_dentry->d_inode->u.generic_ip;
if (!de || !de->data)
return -ENOTDIR;
table = (struct ctl_table *) de->data;
if (!table || !table->proc_handler)
return -ENOTDIR;
op = (write ? 002 : 004);
if (ctl_perm(table, op))
return -EPERM;
res = count;
 
/*
* FIXME: we need to pass on ppos to the handler.
*/
 
error = (*table->proc_handler) (table, write, file, buf, &res);
if (error)
return error;
return res;
}
 
static ssize_t proc_readsys(struct file * file, char * buf,
size_t count, loff_t *ppos)
{
return do_rw_proc(0, file, buf, count, ppos);
}
 
static ssize_t proc_writesys(struct file * file, const char * buf,
size_t count, loff_t *ppos)
{
return do_rw_proc(1, file, (char *) buf, count, ppos);
}
 
static int proc_sys_permission(struct inode *inode, int op)
{
return test_perm(inode->i_mode, op);
}
 
/**
* proc_dostring - read a string sysctl
* @table: the sysctl table
* @write: %TRUE if this is a write to the sysctl file
* @filp: the file structure
* @buffer: the user buffer
* @lenp: the size of the user buffer
*
* Reads/writes a string from/to the user buffer. If the kernel
* buffer provided is not large enough to hold the string, the
* string is truncated. The copied string is %NULL-terminated.
* If the string is being read by the user process, it is copied
* and a newline '\n' is added. It is truncated if the buffer is
* not large enough.
*
* Returns 0 on success.
*/
int proc_dostring(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
size_t len;
char *p, c;
if (!table->data || !table->maxlen || !*lenp ||
(filp->f_pos && !write)) {
*lenp = 0;
return 0;
}
if (write) {
len = 0;
p = buffer;
while (len < *lenp) {
if (get_user(c, p++))
return -EFAULT;
if (c == 0 || c == '\n')
break;
len++;
}
if (len >= table->maxlen)
len = table->maxlen-1;
if(copy_from_user(table->data, buffer, len))
return -EFAULT;
((char *) table->data)[len] = 0;
filp->f_pos += *lenp;
} else {
len = strlen(table->data);
if (len > table->maxlen)
len = table->maxlen;
if (len > *lenp)
len = *lenp;
if (len)
if(copy_to_user(buffer, table->data, len))
return -EFAULT;
if (len < *lenp) {
if(put_user('\n', ((char *) buffer) + len))
return -EFAULT;
len++;
}
*lenp = len;
filp->f_pos += len;
}
return 0;
}
 
/*
* Special case of dostring for the UTS structure. This has locks
* to observe. Should this be in kernel/sys.c ????
*/
static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
int r;
 
if (!write) {
down_read(&uts_sem);
r=proc_dostring(table,0,filp,buffer,lenp);
up_read(&uts_sem);
} else {
down_write(&uts_sem);
r=proc_dostring(table,1,filp,buffer,lenp);
up_write(&uts_sem);
}
return r;
}
 
#define OP_SET 0
#define OP_AND 1
#define OP_OR 2
#define OP_MAX 3
#define OP_MIN 4
 
static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp, int conv, int op)
{
int *i, vleft, first=1, neg, val;
size_t left, len;
#define TMPBUFLEN 20
char buf[TMPBUFLEN], *p;
if (!table->data || !table->maxlen || !*lenp ||
(filp->f_pos && !write)) {
*lenp = 0;
return 0;
}
i = (int *) table->data;
vleft = table->maxlen / sizeof(int);
left = *lenp;
for (; left && vleft--; i++, first=0) {
if (write) {
while (left) {
char c;
if (get_user(c, (char *) buffer))
return -EFAULT;
if (!isspace(c))
break;
left--;
((char *) buffer)++;
}
if (!left)
break;
neg = 0;
len = left;
if (len > TMPBUFLEN-1)
len = TMPBUFLEN-1;
if(copy_from_user(buf, buffer, len))
return -EFAULT;
buf[len] = 0;
p = buf;
if (*p == '-' && left > 1) {
neg = 1;
left--, p++;
}
if (*p < '0' || *p > '9')
break;
val = simple_strtoul(p, &p, 0) * conv;
len = p-buf;
if ((len < left) && *p && !isspace(*p))
break;
if (neg)
val = -val;
buffer += len;
left -= len;
switch(op) {
case OP_SET: *i = val; break;
case OP_AND: *i &= val; break;
case OP_OR: *i |= val; break;
case OP_MAX: if(*i < val)
*i = val;
break;
case OP_MIN: if(*i > val)
*i = val;
break;
}
} else {
p = buf;
if (!first)
*p++ = '\t';
sprintf(p, "%d", (*i) / conv);
len = strlen(buf);
if (len > left)
len = left;
if(copy_to_user(buffer, buf, len))
return -EFAULT;
left -= len;
buffer += len;
}
}
 
if (!write && !first && left) {
if(put_user('\n', (char *) buffer))
return -EFAULT;
left--, buffer++;
}
if (write) {
p = (char *) buffer;
while (left) {
char c;
if (get_user(c, p++))
return -EFAULT;
if (!isspace(c))
break;
left--;
}
}
if (write && first)
return -EINVAL;
*lenp -= left;
filp->f_pos += *lenp;
return 0;
}
 
/**
* proc_dointvec - read a vector of integers
* @table: the sysctl table
* @write: %TRUE if this is a write to the sysctl file
* @filp: the file structure
* @buffer: the user buffer
* @lenp: the size of the user buffer
*
* Reads/writes up to table->maxlen/sizeof(unsigned int) integer
* values from/to the user buffer, treated as an ASCII string.
*
* Returns 0 on success.
*/
int proc_dointvec(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
return do_proc_dointvec(table,write,filp,buffer,lenp,1,OP_SET);
}
 
/*
* Writing requires CAP_SYS_MODULE; only init (pid 1) may set the
* capability bounding set outright, everyone else can only clear
* bits from it.
*/
int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
if (!capable(CAP_SYS_MODULE)) {
return -EPERM;
}
return do_proc_dointvec(table,write,filp,buffer,lenp,1,
(current->pid == 1) ? OP_SET : OP_AND);
}
 
/**
* proc_dointvec_minmax - read a vector of integers with min/max values
* @table: the sysctl table
* @write: %TRUE if this is a write to the sysctl file
* @filp: the file structure
* @buffer: the user buffer
* @lenp: the size of the user buffer
*
* Reads/writes up to table->maxlen/sizeof(unsigned int) integer
* values from/to the user buffer, treated as an ASCII string.
*
* This routine will ensure the values are within the range specified by
* table->extra1 (min) and table->extra2 (max).
*
* Returns 0 on success.
*/
int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
int *i, *min, *max, vleft, first=1, neg, val;
size_t len, left;
#define TMPBUFLEN 20
char buf[TMPBUFLEN], *p;
if (!table->data || !table->maxlen || !*lenp ||
(filp->f_pos && !write)) {
*lenp = 0;
return 0;
}
i = (int *) table->data;
min = (int *) table->extra1;
max = (int *) table->extra2;
vleft = table->maxlen / sizeof(int);
left = *lenp;
for (; left && vleft--; i++, min++, max++, first=0) {
if (write) {
while (left) {
char c;
if (get_user(c, (char *) buffer))
return -EFAULT;
if (!isspace(c))
break;
left--;
((char *) buffer)++;
}
if (!left)
break;
neg = 0;
len = left;
if (len > TMPBUFLEN-1)
len = TMPBUFLEN-1;
if(copy_from_user(buf, buffer, len))
return -EFAULT;
buf[len] = 0;
p = buf;
if (*p == '-' && left > 1) {
neg = 1;
left--, p++;
}
if (*p < '0' || *p > '9')
break;
val = simple_strtoul(p, &p, 0);
len = p-buf;
if ((len < left) && *p && !isspace(*p))
break;
if (neg)
val = -val;
buffer += len;
left -= len;
 
if ((min && val < *min) || (max && val > *max))
continue;
*i = val;
} else {
p = buf;
if (!first)
*p++ = '\t';
sprintf(p, "%d", *i);
len = strlen(buf);
if (len > left)
len = left;
if(copy_to_user(buffer, buf, len))
return -EFAULT;
left -= len;
buffer += len;
}
}
 
if (!write && !first && left) {
if(put_user('\n', (char *) buffer))
return -EFAULT;
left--, buffer++;
}
if (write) {
p = (char *) buffer;
while (left) {
char c;
if (get_user(c, p++))
return -EFAULT;
if (!isspace(c))
break;
left--;
}
}
if (write && first)
return -EINVAL;
*lenp -= left;
filp->f_pos += *lenp;
return 0;
}
 
static int do_proc_doulongvec_minmax(ctl_table *table, int write,
struct file *filp,
void *buffer, size_t *lenp,
unsigned long convmul,
unsigned long convdiv)
{
#define TMPBUFLEN 20
unsigned long *i, *min, *max, val;
int vleft, first=1, neg;
size_t len, left;
char buf[TMPBUFLEN], *p;
if (!table->data || !table->maxlen || !*lenp ||
(filp->f_pos && !write)) {
*lenp = 0;
return 0;
}
i = (unsigned long *) table->data;
min = (unsigned long *) table->extra1;
max = (unsigned long *) table->extra2;
vleft = table->maxlen / sizeof(unsigned long);
left = *lenp;
for (; left && vleft--; i++, first=0) {
if (write) {
while (left) {
char c;
if (get_user(c, (char *) buffer))
return -EFAULT;
if (!isspace(c))
break;
left--;
((char *) buffer)++;
}
if (!left)
break;
neg = 0;
len = left;
if (len > TMPBUFLEN-1)
len = TMPBUFLEN-1;
if(copy_from_user(buf, buffer, len))
return -EFAULT;
buf[len] = 0;
p = buf;
if (*p == '-' && left > 1) {
neg = 1;
left--, p++;
}
if (*p < '0' || *p > '9')
break;
val = simple_strtoul(p, &p, 0) * convmul / convdiv;
len = p-buf;
if ((len < left) && *p && !isspace(*p))
break;
if (neg)
val = -val;
buffer += len;
left -= len;
 
if(neg)
continue;
if (min && val < *min++)
continue;
if (max && val > *max++)
continue;
*i = val;
} else {
p = buf;
if (!first)
*p++ = '\t';
sprintf(p, "%lu", convdiv * (*i) / convmul);
len = strlen(buf);
if (len > left)
len = left;
if(copy_to_user(buffer, buf, len))
return -EFAULT;
left -= len;
buffer += len;
}
}
 
if (!write && !first && left) {
if(put_user('\n', (char *) buffer))
return -EFAULT;
left--, buffer++;
}
if (write) {
p = (char *) buffer;
while (left) {
char c;
if (get_user(c, p++))
return -EFAULT;
if (!isspace(c))
break;
left--;
}
}
if (write && first)
return -EINVAL;
*lenp -= left;
filp->f_pos += *lenp;
return 0;
#undef TMPBUFLEN
}
 
/**
* proc_doulongvec_minmax - read a vector of long integers with min/max values
* @table: the sysctl table
* @write: %TRUE if this is a write to the sysctl file
* @filp: the file structure
* @buffer: the user buffer
* @lenp: the size of the user buffer
*
* Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
* values from/to the user buffer, treated as an ASCII string.
*
* This routine will ensure the values are within the range specified by
* table->extra1 (min) and table->extra2 (max).
*
* Returns 0 on success.
*/
int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, 1l, 1l);
}
 
/**
* proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
* @table: the sysctl table
* @write: %TRUE if this is a write to the sysctl file
* @filp: the file structure
* @buffer: the user buffer
* @lenp: the size of the user buffer
*
* Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
* values from/to the user buffer, treated as an ASCII string. The values
* are treated as milliseconds, and converted to jiffies when they are stored.
*
* This routine will ensure the values are within the range specified by
* table->extra1 (min) and table->extra2 (max).
*
* Returns 0 on success.
*/
int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
struct file *filp,
void *buffer, size_t *lenp)
{
return do_proc_doulongvec_minmax(table, write, filp, buffer,
lenp, HZ, 1000l);
}
 
 
/**
* proc_dointvec_jiffies - read a vector of integers as seconds
* @table: the sysctl table
* @write: %TRUE if this is a write to the sysctl file
* @filp: the file structure
* @buffer: the user buffer
* @lenp: the size of the user buffer
*
* Reads/writes up to table->maxlen/sizeof(unsigned int) integer
* values from/to the user buffer, treated as an ASCII string.
* The values read are assumed to be in seconds, and are converted into
* jiffies.
*
* Returns 0 on success.
*/
int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
return do_proc_dointvec(table,write,filp,buffer,lenp,HZ,OP_SET);
}
 
#else /* CONFIG_PROC_FS */
 
int proc_dostring(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
return -ENOSYS;
}
 
static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
return -ENOSYS;
}
 
int proc_dointvec(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
return -ENOSYS;
}
 
int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
return -ENOSYS;
}
 
int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
return -ENOSYS;
}
 
int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
return -ENOSYS;
}
 
int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
return -ENOSYS;
}
 
int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
struct file *filp,
void *buffer, size_t *lenp)
{
return -ENOSYS;
}
 
 
#endif /* CONFIG_PROC_FS */
 
 
/*
* General sysctl support routines
*/
 
/* The generic string strategy routine: */
int sysctl_string(ctl_table *table, int *name, int nlen,
void *oldval, size_t *oldlenp,
void *newval, size_t newlen, void **context)
{
size_t l, len;
if (!table->data || !table->maxlen)
return -ENOTDIR;
if (oldval && oldlenp) {
if (get_user(len, oldlenp))
return -EFAULT;
if (len) {
l = strlen(table->data);
if (len > l) len = l;
if (len >= table->maxlen)
len = table->maxlen;
if(copy_to_user(oldval, table->data, len))
return -EFAULT;
if(put_user(0, ((char *) oldval) + len))
return -EFAULT;
if(put_user(len, oldlenp))
return -EFAULT;
}
}
if (newval && newlen) {
len = newlen;
if (len > table->maxlen)
len = table->maxlen;
if(copy_from_user(table->data, newval, len))
return -EFAULT;
if (len == table->maxlen)
len--;
((char *) table->data)[len] = 0;
}
return 0;
}
 
/*
* This function makes sure that all of the integers in the vector
* are between the minimum and maximum values given in the arrays
* table->extra1 and table->extra2, respectively.
*/
int sysctl_intvec(ctl_table *table, int *name, int nlen,
void *oldval, size_t *oldlenp,
void *newval, size_t newlen, void **context)
{
int i, *vec, *min, *max;
size_t length;
 
if (newval && newlen) {
if (newlen % sizeof(int) != 0)
return -EINVAL;
 
if (!table->extra1 && !table->extra2)
return 0;
 
if (newlen > table->maxlen)
newlen = table->maxlen;
length = newlen / sizeof(int);
 
vec = (int *) newval;
min = (int *) table->extra1;
max = (int *) table->extra2;
 
for (i = 0; i < length; i++) {
int value;
if (get_user(value, vec + i))
return -EFAULT;
if (min && value < min[i])
return -EINVAL;
if (max && value > max[i])
return -EINVAL;
}
}
return 0;
}
 
/* Strategy function to convert jiffies to seconds */
int sysctl_jiffies(ctl_table *table, int *name, int nlen,
void *oldval, size_t *oldlenp,
void *newval, size_t newlen, void **context)
{
if (oldval) {
size_t olen;
if (oldlenp) {
if (get_user(olen, oldlenp))
return -EFAULT;
if (olen!=sizeof(int))
return -EINVAL;
}
if (put_user(*(int *)(table->data) / HZ, (int *)oldval) ||
(oldlenp && put_user(sizeof(int),oldlenp)))
return -EFAULT;
}
if (newval && newlen) {
int new;
if (newlen != sizeof(int))
return -EINVAL;
if (get_user(new, (int *)newval))
return -EFAULT;
*(int *)(table->data) = new*HZ;
}
return 1;
}
 
 
#else /* CONFIG_SYSCTL */
 
 
extern asmlinkage long sys_sysctl(struct __sysctl_args *args)
{
return -ENOSYS;
}
 
int sysctl_string(ctl_table *table, int *name, int nlen,
void *oldval, size_t *oldlenp,
void *newval, size_t newlen, void **context)
{
return -ENOSYS;
}
 
int sysctl_intvec(ctl_table *table, int *name, int nlen,
void *oldval, size_t *oldlenp,
void *newval, size_t newlen, void **context)
{
return -ENOSYS;
}
 
int sysctl_jiffies(ctl_table *table, int *name, int nlen,
void *oldval, size_t *oldlenp,
void *newval, size_t newlen, void **context)
{
return -ENOSYS;
}
 
int proc_dostring(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
return -ENOSYS;
}
 
int proc_dointvec(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
return -ENOSYS;
}
 
int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
return -ENOSYS;
}
 
int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
return -ENOSYS;
}
 
int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
return -ENOSYS;
}
 
int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
return -ENOSYS;
}
 
int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
struct file *filp,
void *buffer, size_t *lenp)
{
return -ENOSYS;
}
 
struct ctl_table_header * register_sysctl_table(ctl_table * table,
int insert_at_head)
{
return 0;
}
 
void unregister_sysctl_table(struct ctl_table_header * table)
{
}
 
#endif /* CONFIG_SYSCTL */
/capability.c
0,0 → 1,216
/*
* linux/kernel/capability.c
*
* Copyright (C) 1997 Andrew Main <zefram@fysh.org>
* Integrated into 2.1.97+, Andrew G. Morgan <morgan@transmeta.com>
*/
 
#include <linux/mm.h>
#include <asm/uaccess.h>
 
kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
 
/* Note: never hold tasklist_lock while spinning for this one */
spinlock_t task_capability_lock = SPIN_LOCK_UNLOCKED;
 
/*
* For sys_capget() and sys_capset(), any of the three
* capability set pointers may be NULL -- indicating that the corresponding
* set is uninteresting and/or not to be changed.
*/
 
asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
{
int error, pid;
__u32 version;
struct task_struct *target;
struct __user_cap_data_struct data;
 
if (get_user(version, &header->version))
return -EFAULT;
error = -EINVAL;
if (version != _LINUX_CAPABILITY_VERSION) {
version = _LINUX_CAPABILITY_VERSION;
if (put_user(version, &header->version))
error = -EFAULT;
return error;
}
 
if (get_user(pid, &header->pid))
return -EFAULT;
 
if (pid < 0)
return -EINVAL;
 
error = 0;
 
spin_lock(&task_capability_lock);
 
if (pid && pid != current->pid) {
read_lock(&tasklist_lock);
target = find_task_by_pid(pid); /* identify target of query */
if (!target)
error = -ESRCH;
} else {
target = current;
}
 
if (!error) {
data.permitted = cap_t(target->cap_permitted);
data.inheritable = cap_t(target->cap_inheritable);
data.effective = cap_t(target->cap_effective);
}
 
if (target != current)
read_unlock(&tasklist_lock);
spin_unlock(&task_capability_lock);
 
if (!error) {
if (copy_to_user(dataptr, &data, sizeof data))
return -EFAULT;
}
 
return error;
}
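
/*
 * User-space sketch (illustrative only, not part of this kernel file):
 * querying the capability sets of the calling process with the raw capget()
 * syscall.  A pid of 0 in the header means "the current process".  Assumes
 * 2.4-era headers providing _LINUX_CAPABILITY_VERSION and SYS_capget; error
 * handling is minimal.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/capability.h>

int show_my_caps(void)
{
	struct __user_cap_header_struct hdr = { _LINUX_CAPABILITY_VERSION, 0 };
	struct __user_cap_data_struct data;

	if (syscall(SYS_capget, &hdr, &data) != 0) {
		perror("capget");
		return -1;
	}
	printf("effective=%#x permitted=%#x inheritable=%#x\n",
	       data.effective, data.permitted, data.inheritable);
	return 0;
}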
 
/* set capabilities for all processes in a given process group */
 
static void cap_set_pg(int pgrp,
kernel_cap_t *effective,
kernel_cap_t *inheritable,
kernel_cap_t *permitted)
{
struct task_struct *target;
 
/* FIXME: do we need to have a write lock here..? */
read_lock(&tasklist_lock);
for_each_task(target) {
if (target->pgrp != pgrp)
continue;
target->cap_effective = *effective;
target->cap_inheritable = *inheritable;
target->cap_permitted = *permitted;
}
read_unlock(&tasklist_lock);
}
 
/* set capabilities for all processes other than 1 and self */
 
static void cap_set_all(kernel_cap_t *effective,
kernel_cap_t *inheritable,
kernel_cap_t *permitted)
{
struct task_struct *target;
 
/* FIXME: do we need to have a write lock here..? */
read_lock(&tasklist_lock);
/* ALL means everyone other than self or 'init' */
for_each_task(target) {
if (target == current || target->pid == 1)
continue;
target->cap_effective = *effective;
target->cap_inheritable = *inheritable;
target->cap_permitted = *permitted;
}
read_unlock(&tasklist_lock);
}
 
/*
* The restrictions on setting capabilities are specified as:
*
* [pid is for the 'target' task. 'current' is the calling task.]
*
* I: any raised capabilities must be a subset of the (old current) permitted
* P: any raised capabilities must be a subset of the (old current) permitted
* E: must be set to a subset of (new target) permitted
*/
 
asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
{
kernel_cap_t inheritable, permitted, effective;
__u32 version;
struct task_struct *target;
int error, pid;
 
if (get_user(version, &header->version))
return -EFAULT;
 
if (version != _LINUX_CAPABILITY_VERSION) {
version = _LINUX_CAPABILITY_VERSION;
if (put_user(version, &header->version))
return -EFAULT;
return -EINVAL;
}
 
if (get_user(pid, &header->pid))
return -EFAULT;
 
if (pid && !capable(CAP_SETPCAP))
return -EPERM;
 
if (copy_from_user(&effective, &data->effective, sizeof(effective)) ||
copy_from_user(&inheritable, &data->inheritable, sizeof(inheritable)) ||
copy_from_user(&permitted, &data->permitted, sizeof(permitted)))
return -EFAULT;
 
error = -EPERM;
spin_lock(&task_capability_lock);
 
if (pid > 0 && pid != current->pid) {
read_lock(&tasklist_lock);
target = find_task_by_pid(pid); /* identify target of query */
if (!target) {
error = -ESRCH;
goto out;
}
} else {
target = current;
}
 
 
/* verify restrictions on target's new Inheritable set */
if (!cap_issubset(inheritable,
cap_combine(target->cap_inheritable,
current->cap_permitted))) {
goto out;
}
 
/* verify restrictions on target's new Permitted set */
if (!cap_issubset(permitted,
cap_combine(target->cap_permitted,
current->cap_permitted))) {
goto out;
}
 
/* verify the _new_Effective_ is a subset of the _new_Permitted_ */
if (!cap_issubset(effective, permitted)) {
goto out;
}
 
/* having verified that the proposed changes are legal,
we now put them into effect. */
error = 0;
 
if (pid < 0) {
if (pid == -1) /* all procs other than current and init */
cap_set_all(&effective, &inheritable, &permitted);
 
else /* all procs in process group */
cap_set_pg(-pid, &effective, &inheritable, &permitted);
goto spin_out;
} else {
/* FIXME: do we need to have a write lock here..? */
target->cap_effective = effective;
target->cap_inheritable = inheritable;
target->cap_permitted = permitted;
}
 
out:
if (target != current) {
read_unlock(&tasklist_lock);
}
spin_out:
spin_unlock(&task_capability_lock);
return error;
}
/context.c
0,0 → 1,165
/*
* linux/kernel/context.c
*
* Mechanism for running arbitrary tasks in process context
*
* dwmw2@redhat.com: Genesis
*
* andrewm@uow.edu.au: 2.4.0-test12
* - Child reaping
* - Support for tasks which re-add themselves
* - flush_scheduled_tasks.
*/
 
#define __KERNEL_SYSCALLS__
 
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/signal.h>
#include <linux/completion.h>
 
static DECLARE_TASK_QUEUE(tq_context);
static DECLARE_WAIT_QUEUE_HEAD(context_task_wq);
static DECLARE_WAIT_QUEUE_HEAD(context_task_done);
static int keventd_running;
static struct task_struct *keventd_task;
 
static int need_keventd(const char *who)
{
if (keventd_running == 0)
printk(KERN_ERR "%s(): keventd has not started\n", who);
return keventd_running;
}
int current_is_keventd(void)
{
int ret = 0;
if (need_keventd(__FUNCTION__))
ret = (current == keventd_task);
return ret;
}
 
/**
* schedule_task - schedule a function for subsequent execution in process context.
* @task: pointer to a &tq_struct which defines the function to be scheduled.
*
* May be called from interrupt context. The scheduled function is run at some
* time in the near future by the keventd kernel thread. If it can sleep, it
* should be designed to do so for the minimum possible time, as it will be
* stalling all other scheduled tasks.
*
* schedule_task() returns non-zero if the task was successfully scheduled.
* If @task is already residing on a task queue then schedule_task() fails
* to schedule your task and returns zero.
*/
int schedule_task(struct tq_struct *task)
{
int ret;
need_keventd(__FUNCTION__);
ret = queue_task(task, &tq_context);
wake_up(&context_task_wq);
return ret;
}
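
/*
 * Usage sketch (illustrative only, not from this file): a hypothetical
 * driver hands work that may sleep from its interrupt handler to keventd.
 * The handler, task and device names below are made up.
 */
static void my_deferred_work(void *data)
{
	/* runs later in keventd's process context, so it may sleep */
	printk(KERN_INFO "deferred work for device %p\n", data);
}

static struct tq_struct my_deferred_task = {
	routine:	my_deferred_work,
	data:		NULL,
};

static void my_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	/* cannot sleep here; queue the work and return quickly */
	schedule_task(&my_deferred_task);
}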
 
static int context_thread(void *startup)
{
struct task_struct *curtask = current;
DECLARE_WAITQUEUE(wait, curtask);
struct k_sigaction sa;
 
daemonize();
strcpy(curtask->comm, "keventd");
keventd_running = 1;
keventd_task = curtask;
 
spin_lock_irq(&curtask->sigmask_lock);
siginitsetinv(&curtask->blocked, sigmask(SIGCHLD));
recalc_sigpending(curtask);
spin_unlock_irq(&curtask->sigmask_lock);
 
complete((struct completion *)startup);
 
/* Install a handler so SIGCHLD is delivered */
sa.sa.sa_handler = SIG_IGN;
sa.sa.sa_flags = 0;
siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
 
/*
* If one of the functions on a task queue re-adds itself
* to the task queue we call schedule() in state TASK_RUNNING
*/
for (;;) {
set_task_state(curtask, TASK_INTERRUPTIBLE);
add_wait_queue(&context_task_wq, &wait);
if (TQ_ACTIVE(tq_context))
set_task_state(curtask, TASK_RUNNING);
schedule();
remove_wait_queue(&context_task_wq, &wait);
run_task_queue(&tq_context);
wake_up(&context_task_done);
if (signal_pending(curtask)) {
while (waitpid(-1, (unsigned int *)0, __WALL|WNOHANG) > 0)
;
spin_lock_irq(&curtask->sigmask_lock);
flush_signals(curtask);
recalc_sigpending(curtask);
spin_unlock_irq(&curtask->sigmask_lock);
}
}
}
 
/**
* flush_scheduled_tasks - ensure that any scheduled tasks have run to completion.
*
* Forces execution of the schedule_task() queue and blocks until its completion.
*
* If a kernel subsystem uses schedule_task() and wishes to flush any pending
* tasks, it should use this function. This is typically used in driver shutdown
* handlers.
*
* The caller should hold no spinlocks and should hold no semaphores which could
* cause the scheduled tasks to block.
*/
static struct tq_struct dummy_task;
 
void flush_scheduled_tasks(void)
{
int count;
DECLARE_WAITQUEUE(wait, current);
 
/*
* Do it twice. It's possible, albeit highly unlikely, that
* the caller queued a task immediately before calling us,
* and that the keventd thread was already past the run_task_queue()
* but not yet into wake_up(), so it woke us up before it had completed
* the caller's queued task or our new dummy task.
*/
add_wait_queue(&context_task_done, &wait);
for (count = 0; count < 2; count++) {
set_current_state(TASK_UNINTERRUPTIBLE);
 
/* Queue a dummy task to make sure we get kicked */
schedule_task(&dummy_task);
 
/* Wait for it to complete */
schedule();
}
remove_wait_queue(&context_task_done, &wait);
}
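
/*
 * Usage sketch (illustrative only, not from this file): the hypothetical
 * driver sketched above makes sure none of its scheduled work is still
 * pending before it is unloaded.
 */
static void __exit my_driver_exit(void)
{
	flush_scheduled_tasks();	/* wait for any queued work to finish */
}
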
int start_context_thread(void)
{
static struct completion startup __initdata = COMPLETION_INITIALIZER(startup);
 
kernel_thread(context_thread, &startup, CLONE_FS | CLONE_FILES);
wait_for_completion(&startup);
return 0;
}
 
EXPORT_SYMBOL(schedule_task);
EXPORT_SYMBOL(flush_scheduled_tasks);
 
/module.c
0,0 → 1,1296
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <asm/module.h>
#include <asm/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/smp_lock.h>
#include <asm/pgalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/seq_file.h>
 
/*
* Originally by Anonymous (as far as I know...)
* Linux version by Bas Laarhoven <bas@vimec.nl>
* 0.99.14 version by Jon Tombs <jon@gtex02.us.es>,
* Heavily modified by Bjorn Ekwall <bj0rn@blox.se> May 1994 (C)
* Rewritten by Richard Henderson <rth@tamu.edu> Dec 1996
* Add MOD_INITIALIZING Keith Owens <kaos@ocs.com.au> Nov 1999
* Add kallsyms support, Keith Owens <kaos@ocs.com.au> Apr 2000
* Add asm/module support, IA64 has special requirements. Keith Owens <kaos@ocs.com.au> Sep 2000
* Fix assorted bugs in module verification. Keith Owens <kaos@ocs.com.au> Sep 2000
* Fix sys_init_module race, Andrew Morton <andrewm@uow.edu.au> Oct 2000
* http://www.uwsg.iu.edu/hypermail/linux/kernel/0008.3/0379.html
* Replace xxx_module_symbol with inter_module_xxx. Keith Owens <kaos@ocs.com.au> Oct 2000
* Add a module list lock for kernel fault race fixing. Alan Cox <alan@redhat.com>
*
* This source is covered by the GNU GPL, the same as all kernel sources.
*/
 
#if defined(CONFIG_MODULES) || defined(CONFIG_KALLSYMS)
 
extern struct module_symbol __start___ksymtab[];
extern struct module_symbol __stop___ksymtab[];
 
extern const struct exception_table_entry __start___ex_table[];
extern const struct exception_table_entry __stop___ex_table[];
 
extern const char __start___kallsyms[] __attribute__ ((weak));
extern const char __stop___kallsyms[] __attribute__ ((weak));
 
struct module kernel_module =
{
size_of_struct: sizeof(struct module),
name: "",
uc: {ATOMIC_INIT(1)},
flags: MOD_RUNNING,
syms: __start___ksymtab,
ex_table_start: __start___ex_table,
ex_table_end: __stop___ex_table,
kallsyms_start: __start___kallsyms,
kallsyms_end: __stop___kallsyms,
};
 
struct module *module_list = &kernel_module;
 
#endif /* defined(CONFIG_MODULES) || defined(CONFIG_KALLSYMS) */
 
/* inter_module functions are always available, even when the kernel is
* compiled without modules. Consumers of the inter_module_xxx routines
* will always work, even when both provider and consumer are built into
* the kernel; this approach removes lots of #ifdefs in mainline code.
*/
 
static struct list_head ime_list = LIST_HEAD_INIT(ime_list);
static spinlock_t ime_lock = SPIN_LOCK_UNLOCKED;
static int kmalloc_failed;
 
/*
* This lock prevents modifications that might race the kernel fault
* fixups. It does not prevent reader walks that the modules code
* does. The kernel lock does that.
*
* Since vmalloc fault fixups occur in any context this lock is taken
* irqsave at all times.
*/
spinlock_t modlist_lock = SPIN_LOCK_UNLOCKED;
 
/**
* inter_module_register - register a new set of inter module data.
* @im_name: an arbitrary string to identify the data, must be unique
* @owner: module that is registering the data, always use THIS_MODULE
* @userdata: pointer to arbitrary userdata to be registered
*
* Description: Check that the im_name has not already been registered,
* complain if it has. For new data, add it to the inter_module_entry
* list.
*/
void inter_module_register(const char *im_name, struct module *owner, const void *userdata)
{
struct list_head *tmp;
struct inter_module_entry *ime, *ime_new;
 
if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) {
/* Overloaded kernel, not fatal */
printk(KERN_ERR
"Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
im_name);
kmalloc_failed = 1;
return;
}
memset(ime_new, 0, sizeof(*ime_new));
ime_new->im_name = im_name;
ime_new->owner = owner;
ime_new->userdata = userdata;
 
spin_lock(&ime_lock);
list_for_each(tmp, &ime_list) {
ime = list_entry(tmp, struct inter_module_entry, list);
if (strcmp(ime->im_name, im_name) == 0) {
spin_unlock(&ime_lock);
kfree(ime_new);
/* Program logic error, fatal */
printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name);
BUG();
}
}
list_add(&(ime_new->list), &ime_list);
spin_unlock(&ime_lock);
}
 
/**
* inter_module_unregister - unregister a set of inter module data.
* @im_name: an arbitrary string to identify the data, must be unique
*
* Description: Check that the im_name has been registered, complain if
* it has not. For existing data, remove it from the
* inter_module_entry list.
*/
void inter_module_unregister(const char *im_name)
{
struct list_head *tmp;
struct inter_module_entry *ime;
 
spin_lock(&ime_lock);
list_for_each(tmp, &ime_list) {
ime = list_entry(tmp, struct inter_module_entry, list);
if (strcmp(ime->im_name, im_name) == 0) {
list_del(&(ime->list));
spin_unlock(&ime_lock);
kfree(ime);
return;
}
}
spin_unlock(&ime_lock);
if (kmalloc_failed) {
printk(KERN_ERR
"inter_module_unregister: no entry for '%s', "
"probably caused by previous kmalloc failure\n",
im_name);
return;
}
else {
/* Program logic error, fatal */
printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name);
BUG();
}
}
 
/**
* inter_module_get - return arbitrary userdata from another module.
* @im_name: an arbitrary string to identify the data, must be unique
*
* Description: If the im_name has not been registered, return NULL.
* Try to increment the use count on the owning module, if that fails
* then return NULL. Otherwise return the userdata.
*/
const void *inter_module_get(const char *im_name)
{
struct list_head *tmp;
struct inter_module_entry *ime;
const void *result = NULL;
 
spin_lock(&ime_lock);
list_for_each(tmp, &ime_list) {
ime = list_entry(tmp, struct inter_module_entry, list);
if (strcmp(ime->im_name, im_name) == 0) {
if (try_inc_mod_count(ime->owner))
result = ime->userdata;
break;
}
}
spin_unlock(&ime_lock);
return(result);
}
 
/**
* inter_module_get_request - im get with automatic request_module.
* @im_name: an arbitrary string to identify the data, must be unique
* @modname: module that is expected to register im_name
*
* Description: If inter_module_get fails, do request_module then retry.
*/
const void *inter_module_get_request(const char *im_name, const char *modname)
{
const void *result = inter_module_get(im_name);
if (!result) {
request_module(modname);
result = inter_module_get(im_name);
}
return(result);
}
 
/**
* inter_module_put - release use of data from another module.
* @im_name: an arbitrary string to identify the data, must be unique
*
* Description: If the im_name has not been registered, complain,
* otherwise decrement the use count on the owning module.
*/
void inter_module_put(const char *im_name)
{
struct list_head *tmp;
struct inter_module_entry *ime;
 
spin_lock(&ime_lock);
list_for_each(tmp, &ime_list) {
ime = list_entry(tmp, struct inter_module_entry, list);
if (strcmp(ime->im_name, im_name) == 0) {
if (ime->owner)
__MOD_DEC_USE_COUNT(ime->owner);
spin_unlock(&ime_lock);
return;
}
}
spin_unlock(&ime_lock);
printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name);
BUG();
}
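
/*
 * Usage sketch (illustrative only, not from this file): a hypothetical
 * module "provider" publishes an ops table under an agreed name and a
 * consumer looks it up on demand.  The my_ops structure and all names
 * below are made up.
 */
struct my_ops {
	int (*do_thing)(int arg);
};

static int provider_do_thing(int arg)
{
	return arg + 1;
}

static struct my_ops provider_ops = {
	do_thing:	provider_do_thing,
};

static int __init provider_init(void)
{
	inter_module_register("my-ops", THIS_MODULE, &provider_ops);
	return 0;
}

static void __exit provider_exit(void)
{
	inter_module_unregister("my-ops");
}

static int consumer_use(void)
{
	const struct my_ops *ops;
	int ret = -ENODEV;

	/* load the provider on demand if it is not already present */
	ops = inter_module_get_request("my-ops", "provider");
	if (ops) {
		ret = ops->do_thing(41);
		inter_module_put("my-ops");	/* drop the use count again */
	}
	return ret;
}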
 
 
#if defined(CONFIG_MODULES) /* The rest of the source */
 
static long get_mod_name(const char *user_name, char **buf);
static void put_mod_name(char *buf);
struct module *find_module(const char *name);
void free_module(struct module *, int tag_freed);
 
 
/*
* Called at boot time
*/
 
void __init init_modules(void)
{
kernel_module.nsyms = __stop___ksymtab - __start___ksymtab;
 
arch_init_modules(&kernel_module);
}
 
/*
* Copy the name of a module from user space.
*/
 
static inline long
get_mod_name(const char *user_name, char **buf)
{
unsigned long page;
long retval;
 
page = __get_free_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
 
retval = strncpy_from_user((char *)page, user_name, PAGE_SIZE);
if (retval > 0) {
if (retval < PAGE_SIZE) {
*buf = (char *)page;
return retval;
}
retval = -ENAMETOOLONG;
} else if (!retval)
retval = -EINVAL;
 
free_page(page);
return retval;
}
 
static inline void
put_mod_name(char *buf)
{
free_page((unsigned long)buf);
}
 
/*
* Allocate space for a module.
*/
 
asmlinkage unsigned long
sys_create_module(const char *name_user, size_t size)
{
char *name;
long namelen, error;
struct module *mod;
unsigned long flags;
 
if (!capable(CAP_SYS_MODULE))
return -EPERM;
lock_kernel();
if ((namelen = get_mod_name(name_user, &name)) < 0) {
error = namelen;
goto err0;
}
if (size < sizeof(struct module)+namelen+1) {
error = -EINVAL;
goto err1;
}
if (find_module(name) != NULL) {
error = -EEXIST;
goto err1;
}
if ((mod = (struct module *)module_map(size)) == NULL) {
error = -ENOMEM;
goto err1;
}
 
memset(mod, 0, sizeof(*mod));
mod->size_of_struct = sizeof(*mod);
mod->name = (char *)(mod + 1);
mod->size = size;
memcpy((char*)(mod+1), name, namelen+1);
 
put_mod_name(name);
 
spin_lock_irqsave(&modlist_lock, flags);
mod->next = module_list;
module_list = mod; /* link it in */
spin_unlock_irqrestore(&modlist_lock, flags);
 
error = (long) mod;
goto err0;
err1:
put_mod_name(name);
err0:
unlock_kernel();
return error;
}
 
/*
* Initialize a module.
*/
 
asmlinkage long
sys_init_module(const char *name_user, struct module *mod_user)
{
struct module mod_tmp, *mod, *mod2 = NULL;
char *name, *n_name, *name_tmp = NULL;
long namelen, n_namelen, i, error;
unsigned long mod_user_size, flags;
struct module_ref *dep;
 
if (!capable(CAP_SYS_MODULE))
return -EPERM;
lock_kernel();
if ((namelen = get_mod_name(name_user, &name)) < 0) {
error = namelen;
goto err0;
}
if ((mod = find_module(name)) == NULL) {
error = -ENOENT;
goto err1;
}
 
/* Check module header size. We allow a bit of slop over the
size we are familiar with to cope with a version of insmod
for a newer kernel. But don't overdo it. */
if ((error = get_user(mod_user_size, &mod_user->size_of_struct)) != 0)
goto err1;
if (mod_user_size < (unsigned long)&((struct module *)0L)->persist_start
|| mod_user_size > sizeof(struct module) + 16*sizeof(void*)) {
printk(KERN_ERR "init_module: Invalid module header size.\n"
KERN_ERR "A new version of the modutils is likely "
"needed.\n");
error = -EINVAL;
goto err1;
}
 
/* Hold the current contents while we play with the user's idea
of righteousness. */
mod_tmp = *mod;
name_tmp = kmalloc(strlen(mod->name) + 1, GFP_KERNEL); /* Where's kstrdup()? */
if (name_tmp == NULL) {
error = -ENOMEM;
goto err1;
}
strcpy(name_tmp, mod->name);
 
/* Copying mod_user directly over mod breaks the module_list chain and
* races against search_exception_table. copy_from_user may sleep, so it
* cannot be called under modlist_lock; do the copy in two stages.
*/
if (!(mod2 = vmalloc(mod_user_size))) {
error = -ENOMEM;
goto err2;
}
error = copy_from_user(mod2, mod_user, mod_user_size);
if (error) {
error = -EFAULT;
goto err2;
}
spin_lock_irqsave(&modlist_lock, flags);
memcpy(mod, mod2, mod_user_size);
mod->next = mod_tmp.next;
spin_unlock_irqrestore(&modlist_lock, flags);
 
/* Sanity check the size of the module. */
error = -EINVAL;
 
if (mod->size > mod_tmp.size) {
printk(KERN_ERR "init_module: Size of initialized module "
"exceeds size of created module.\n");
goto err2;
}
 
/* Make sure all interesting pointers are sane. */
 
if (!mod_bound(mod->name, namelen, mod)) {
printk(KERN_ERR "init_module: mod->name out of bounds.\n");
goto err2;
}
if (mod->nsyms && !mod_bound(mod->syms, mod->nsyms, mod)) {
printk(KERN_ERR "init_module: mod->syms out of bounds.\n");
goto err2;
}
if (mod->ndeps && !mod_bound(mod->deps, mod->ndeps, mod)) {
printk(KERN_ERR "init_module: mod->deps out of bounds.\n");
goto err2;
}
if (mod->init && !mod_bound(mod->init, 0, mod)) {
printk(KERN_ERR "init_module: mod->init out of bounds.\n");
goto err2;
}
if (mod->cleanup && !mod_bound(mod->cleanup, 0, mod)) {
printk(KERN_ERR "init_module: mod->cleanup out of bounds.\n");
goto err2;
}
if (mod->ex_table_start > mod->ex_table_end
|| (mod->ex_table_start &&
!((unsigned long)mod->ex_table_start >= ((unsigned long)mod + mod->size_of_struct)
&& ((unsigned long)mod->ex_table_end
< (unsigned long)mod + mod->size)))
|| (((unsigned long)mod->ex_table_start
- (unsigned long)mod->ex_table_end)
% sizeof(struct exception_table_entry))) {
printk(KERN_ERR "init_module: mod->ex_table_* invalid.\n");
goto err2;
}
if (mod->flags & ~MOD_AUTOCLEAN) {
printk(KERN_ERR "init_module: mod->flags invalid.\n");
goto err2;
}
if (mod_member_present(mod, can_unload)
&& mod->can_unload && !mod_bound(mod->can_unload, 0, mod)) {
printk(KERN_ERR "init_module: mod->can_unload out of bounds.\n");
goto err2;
}
if (mod_member_present(mod, kallsyms_end)) {
if (mod->kallsyms_end &&
(!mod_bound(mod->kallsyms_start, 0, mod) ||
!mod_bound(mod->kallsyms_end, 0, mod))) {
printk(KERN_ERR "init_module: mod->kallsyms out of bounds.\n");
goto err2;
}
if (mod->kallsyms_start > mod->kallsyms_end) {
printk(KERN_ERR "init_module: mod->kallsyms invalid.\n");
goto err2;
}
}
if (mod_member_present(mod, archdata_end)) {
if (mod->archdata_end &&
(!mod_bound(mod->archdata_start, 0, mod) ||
!mod_bound(mod->archdata_end, 0, mod))) {
printk(KERN_ERR "init_module: mod->archdata out of bounds.\n");
goto err2;
}
if (mod->archdata_start > mod->archdata_end) {
printk(KERN_ERR "init_module: mod->archdata invalid.\n");
goto err2;
}
}
if (mod_member_present(mod, kernel_data) && mod->kernel_data) {
printk(KERN_ERR "init_module: mod->kernel_data must be zero.\n");
goto err2;
}
 
/* Check that the user isn't doing something silly with the name. */
 
if ((n_namelen = get_mod_name(mod->name - (unsigned long)mod
+ (unsigned long)mod_user,
&n_name)) < 0) {
printk(KERN_ERR "init_module: get_mod_name failure.\n");
error = n_namelen;
goto err2;
}
if (namelen != n_namelen || strcmp(n_name, name_tmp) != 0) {
printk(KERN_ERR "init_module: changed module name to "
"`%s' from `%s'\n",
n_name, name_tmp);
goto err3;
}
 
/* Ok, that's about all the sanity we can stomach; copy the rest. */
 
if (copy_from_user((char *)mod+mod_user_size,
(char *)mod_user+mod_user_size,
mod->size-mod_user_size)) {
error = -EFAULT;
goto err3;
}
 
if (module_arch_init(mod))
goto err3;
 
/* On some machines it is necessary to do something here
to make the I and D caches consistent. */
flush_icache_range((unsigned long)mod, (unsigned long)mod + mod->size);
 
mod->refs = NULL;
 
/* Sanity check the module's dependents */
for (i = 0, dep = mod->deps; i < mod->ndeps; ++i, ++dep) {
struct module *o, *d = dep->dep;
 
/* Make sure the indicated dependencies are really modules. */
if (d == mod) {
printk(KERN_ERR "init_module: self-referential "
"dependency in mod->deps.\n");
goto err3;
}
 
/* Scan the current modules for this dependency */
for (o = module_list; o != &kernel_module && o != d; o = o->next)
;
 
if (o != d) {
printk(KERN_ERR "init_module: found dependency that is "
"(no longer?) a module.\n");
goto err3;
}
}
 
/* Update module references. */
for (i = 0, dep = mod->deps; i < mod->ndeps; ++i, ++dep) {
struct module *d = dep->dep;
 
dep->ref = mod;
dep->next_ref = d->refs;
d->refs = dep;
/* Being referenced by a dependent module counts as a
use as far as kmod is concerned. */
d->flags |= MOD_USED_ONCE;
}
 
/* Free our temporary memory. */
put_mod_name(n_name);
put_mod_name(name);
 
/* Initialize the module. */
atomic_set(&mod->uc.usecount,1);
mod->flags |= MOD_INITIALIZING;
if (mod->init && (error = mod->init()) != 0) {
atomic_set(&mod->uc.usecount,0);
mod->flags &= ~MOD_INITIALIZING;
if (error > 0) /* Buggy module */
error = -EBUSY;
goto err0;
}
atomic_dec(&mod->uc.usecount);
 
/* And set it running. */
mod->flags = (mod->flags | MOD_RUNNING) & ~MOD_INITIALIZING;
error = 0;
goto err0;
 
err3:
put_mod_name(n_name);
err2:
*mod = mod_tmp;
strcpy((char *)mod->name, name_tmp); /* We know there is room for this */
err1:
put_mod_name(name);
err0:
if (mod2)
vfree(mod2);
unlock_kernel();
kfree(name_tmp);
return error;
}
 
static spinlock_t unload_lock = SPIN_LOCK_UNLOCKED;
int try_inc_mod_count(struct module *mod)
{
int res = 1;
if (mod) {
spin_lock(&unload_lock);
if (mod->flags & MOD_DELETED)
res = 0;
else
__MOD_INC_USE_COUNT(mod);
spin_unlock(&unload_lock);
}
return res;
}
 
asmlinkage long
sys_delete_module(const char *name_user)
{
struct module *mod, *next;
char *name;
long error;
int something_changed;
 
if (!capable(CAP_SYS_MODULE))
return -EPERM;
 
lock_kernel();
if (name_user) {
if ((error = get_mod_name(name_user, &name)) < 0)
goto out;
error = -ENOENT;
if ((mod = find_module(name)) == NULL) {
put_mod_name(name);
goto out;
}
put_mod_name(name);
error = -EBUSY;
if (mod->refs != NULL)
goto out;
 
spin_lock(&unload_lock);
if (!__MOD_IN_USE(mod)) {
mod->flags |= MOD_DELETED;
spin_unlock(&unload_lock);
free_module(mod, 0);
error = 0;
} else {
spin_unlock(&unload_lock);
}
goto out;
}
 
/* Do automatic reaping */
restart:
something_changed = 0;
for (mod = module_list; mod != &kernel_module; mod = next) {
next = mod->next;
spin_lock(&unload_lock);
if (mod->refs == NULL
&& (mod->flags & MOD_AUTOCLEAN)
&& (mod->flags & MOD_RUNNING)
&& !(mod->flags & MOD_DELETED)
&& (mod->flags & MOD_USED_ONCE)
&& !__MOD_IN_USE(mod)) {
if ((mod->flags & MOD_VISITED)
&& !(mod->flags & MOD_JUST_FREED)) {
spin_unlock(&unload_lock);
mod->flags &= ~MOD_VISITED;
} else {
mod->flags |= MOD_DELETED;
spin_unlock(&unload_lock);
free_module(mod, 1);
something_changed = 1;
}
} else {
spin_unlock(&unload_lock);
}
}
if (something_changed)
goto restart;
for (mod = module_list; mod != &kernel_module; mod = mod->next)
mod->flags &= ~MOD_JUST_FREED;
error = 0;
out:
unlock_kernel();
return error;
}
 
/* Query various bits about modules. */
 
static int
qm_modules(char *buf, size_t bufsize, size_t *ret)
{
struct module *mod;
size_t nmod, space, len;
 
nmod = space = 0;
 
for (mod=module_list; mod != &kernel_module; mod=mod->next, ++nmod) {
len = strlen(mod->name)+1;
if (len > bufsize)
goto calc_space_needed;
if (copy_to_user(buf, mod->name, len))
return -EFAULT;
buf += len;
bufsize -= len;
space += len;
}
 
if (put_user(nmod, ret))
return -EFAULT;
else
return 0;
 
calc_space_needed:
space += len;
while ((mod = mod->next) != &kernel_module)
space += strlen(mod->name)+1;
 
if (put_user(space, ret))
return -EFAULT;
else
return -ENOSPC;
}
 
static int
qm_deps(struct module *mod, char *buf, size_t bufsize, size_t *ret)
{
size_t i, space, len;
 
if (mod == &kernel_module)
return -EINVAL;
if (!MOD_CAN_QUERY(mod))
if (put_user(0, ret))
return -EFAULT;
else
return 0;
 
space = 0;
for (i = 0; i < mod->ndeps; ++i) {
const char *dep_name = mod->deps[i].dep->name;
 
len = strlen(dep_name)+1;
if (len > bufsize)
goto calc_space_needed;
if (copy_to_user(buf, dep_name, len))
return -EFAULT;
buf += len;
bufsize -= len;
space += len;
}
 
if (put_user(i, ret))
return -EFAULT;
else
return 0;
 
calc_space_needed:
space += len;
while (++i < mod->ndeps)
space += strlen(mod->deps[i].dep->name)+1;
 
if (put_user(space, ret))
return -EFAULT;
else
return -ENOSPC;
}
 
static int
qm_refs(struct module *mod, char *buf, size_t bufsize, size_t *ret)
{
size_t nrefs, space, len;
struct module_ref *ref;
 
if (mod == &kernel_module)
return -EINVAL;
if (!MOD_CAN_QUERY(mod))
if (put_user(0, ret))
return -EFAULT;
else
return 0;
 
space = 0;
for (nrefs = 0, ref = mod->refs; ref ; ++nrefs, ref = ref->next_ref) {
const char *ref_name = ref->ref->name;
 
len = strlen(ref_name)+1;
if (len > bufsize)
goto calc_space_needed;
if (copy_to_user(buf, ref_name, len))
return -EFAULT;
buf += len;
bufsize -= len;
space += len;
}
 
if (put_user(nrefs, ret))
return -EFAULT;
else
return 0;
 
calc_space_needed:
space += len;
while ((ref = ref->next_ref) != NULL)
space += strlen(ref->ref->name)+1;
 
if (put_user(space, ret))
return -EFAULT;
else
return -ENOSPC;
}
 
static int
qm_symbols(struct module *mod, char *buf, size_t bufsize, size_t *ret)
{
size_t i, space, len;
struct module_symbol *s;
char *strings;
unsigned long *vals;
 
if (!MOD_CAN_QUERY(mod))
if (put_user(0, ret))
return -EFAULT;
else
return 0;
 
space = mod->nsyms * 2*sizeof(void *);
 
i = len = 0;
s = mod->syms;
 
if (space > bufsize)
goto calc_space_needed;
 
if (!access_ok(VERIFY_WRITE, buf, space))
return -EFAULT;
 
bufsize -= space;
vals = (unsigned long *)buf;
strings = buf+space;
 
for (; i < mod->nsyms ; ++i, ++s, vals += 2) {
len = strlen(s->name)+1;
if (len > bufsize)
goto calc_space_needed;
 
if (copy_to_user(strings, s->name, len)
|| __put_user(s->value, vals+0)
|| __put_user(space, vals+1))
return -EFAULT;
 
strings += len;
bufsize -= len;
space += len;
}
if (put_user(i, ret))
return -EFAULT;
else
return 0;
 
calc_space_needed:
for (; i < mod->nsyms; ++i, ++s)
space += strlen(s->name)+1;
 
if (put_user(space, ret))
return -EFAULT;
else
return -ENOSPC;
}
 
static int
qm_info(struct module *mod, char *buf, size_t bufsize, size_t *ret)
{
int error = 0;
 
if (mod == &kernel_module)
return -EINVAL;
 
if (sizeof(struct module_info) <= bufsize) {
struct module_info info;
info.addr = (unsigned long)mod;
info.size = mod->size;
info.flags = mod->flags;
/* usecount is one too high here - report appropriately to
compensate for locking */
info.usecount = (mod_member_present(mod, can_unload)
&& mod->can_unload ? -1 : atomic_read(&mod->uc.usecount)-1);
 
if (copy_to_user(buf, &info, sizeof(struct module_info)))
return -EFAULT;
} else
error = -ENOSPC;
 
if (put_user(sizeof(struct module_info), ret))
return -EFAULT;
 
return error;
}
 
asmlinkage long
sys_query_module(const char *name_user, int which, char *buf, size_t bufsize,
size_t *ret)
{
struct module *mod;
int err;
 
lock_kernel();
if (name_user == NULL)
mod = &kernel_module;
else {
long namelen;
char *name;
 
if ((namelen = get_mod_name(name_user, &name)) < 0) {
err = namelen;
goto out;
}
err = -ENOENT;
if ((mod = find_module(name)) == NULL) {
put_mod_name(name);
goto out;
}
put_mod_name(name);
}
 
/* __MOD_ touches the flags. We must avoid that */
atomic_inc(&mod->uc.usecount);
switch (which)
{
case 0:
err = 0;
break;
case QM_MODULES:
err = qm_modules(buf, bufsize, ret);
break;
case QM_DEPS:
err = qm_deps(mod, buf, bufsize, ret);
break;
case QM_REFS:
err = qm_refs(mod, buf, bufsize, ret);
break;
case QM_SYMBOLS:
err = qm_symbols(mod, buf, bufsize, ret);
break;
case QM_INFO:
err = qm_info(mod, buf, bufsize, ret);
break;
default:
err = -EINVAL;
break;
}
atomic_dec(&mod->uc.usecount);
out:
unlock_kernel();
return err;
}
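
/*
 * User-space sketch (illustrative only, not part of this kernel file):
 * listing the loaded modules with query_module(QM_MODULES).  On ENOSPC the
 * kernel stores the buffer size it needs in *ret, so the caller enlarges
 * the buffer and retries.  Assumes a 2.4-era libc that still exports the
 * query_module() wrapper (declared by hand below).
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <linux/module.h>

extern int query_module(const char *name, int which, void *buf,
			size_t bufsize, size_t *ret);

int list_modules(void)
{
	size_t bufsize = 1024, nmod, off;
	char *buf = malloc(bufsize);

	if (!buf)
		return -1;
	while (query_module(NULL, QM_MODULES, buf, bufsize, &nmod) != 0) {
		char *bigger;

		if (errno != ENOSPC) {
			free(buf);
			return -1;
		}
		bufsize = nmod;		/* the size the kernel said it needs */
		bigger = realloc(buf, bufsize);
		if (!bigger) {
			free(buf);
			return -1;
		}
		buf = bigger;
	}
	/* on success nmod is the module count; names are packed back to back */
	for (off = 0; nmod-- > 0; off += strlen(buf + off) + 1)
		printf("%s\n", buf + off);
	free(buf);
	return 0;
}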
 
/*
* Copy the kernel symbol table to user space. If the argument is
* NULL, just return the size of the table.
*
* This call is obsolete. New programs should use query_module+QM_SYMBOLS
* which does not arbitrarily limit the length of symbols.
*/
 
asmlinkage long
sys_get_kernel_syms(struct kernel_sym *table)
{
struct module *mod;
int i;
struct kernel_sym ksym;
 
lock_kernel();
for (mod = module_list, i = 0; mod; mod = mod->next) {
/* include the count for the module name! */
i += mod->nsyms + 1;
}
 
if (table == NULL)
goto out;
 
/* So that we don't give the user our stack content */
memset (&ksym, 0, sizeof (ksym));
 
for (mod = module_list, i = 0; mod; mod = mod->next) {
struct module_symbol *msym;
unsigned int j;
 
if (!MOD_CAN_QUERY(mod))
continue;
 
/* magic: write module info as a pseudo symbol */
ksym.value = (unsigned long)mod;
ksym.name[0] = '#';
strncpy(ksym.name+1, mod->name, sizeof(ksym.name)-1);
ksym.name[sizeof(ksym.name)-1] = '\0';
 
if (copy_to_user(table, &ksym, sizeof(ksym)) != 0)
goto out;
++i, ++table;
 
if (mod->nsyms == 0)
continue;
 
for (j = 0, msym = mod->syms; j < mod->nsyms; ++j, ++msym) {
ksym.value = msym->value;
strncpy(ksym.name, msym->name, sizeof(ksym.name));
ksym.name[sizeof(ksym.name)-1] = '\0';
 
if (copy_to_user(table, &ksym, sizeof(ksym)) != 0)
goto out;
++i, ++table;
}
}
out:
unlock_kernel();
return i;
}
 
/*
* Look for a module by name, ignoring modules marked for deletion.
*/
 
struct module *
find_module(const char *name)
{
struct module *mod;
 
for (mod = module_list; mod ; mod = mod->next) {
if (mod->flags & MOD_DELETED)
continue;
if (!strcmp(mod->name, name))
break;
}
 
return mod;
}
 
/*
* Free the given module.
*/
 
void
free_module(struct module *mod, int tag_freed)
{
struct module_ref *dep;
unsigned i;
unsigned long flags;
 
/* Let the module clean up. */
 
if (mod->flags & MOD_RUNNING)
{
if(mod->cleanup)
mod->cleanup();
mod->flags &= ~MOD_RUNNING;
}
 
/* Remove the module from the dependency lists. */
 
for (i = 0, dep = mod->deps; i < mod->ndeps; ++i, ++dep) {
struct module_ref **pp;
for (pp = &dep->dep->refs; *pp != dep; pp = &(*pp)->next_ref)
continue;
*pp = dep->next_ref;
if (tag_freed && dep->dep->refs == NULL)
dep->dep->flags |= MOD_JUST_FREED;
}
 
/* And from the main module list. */
 
spin_lock_irqsave(&modlist_lock, flags);
if (mod == module_list) {
module_list = mod->next;
} else {
struct module *p;
for (p = module_list; p->next != mod; p = p->next)
continue;
p->next = mod->next;
}
spin_unlock_irqrestore(&modlist_lock, flags);
 
/* And free the memory. */
 
module_unmap(mod);
}
 
/*
* Called by the /proc file system to return a current list of modules.
*/
 
int get_module_list(char *p)
{
size_t left = PAGE_SIZE;
struct module *mod;
char tmpstr[64];
struct module_ref *ref;
 
for (mod = module_list; mod != &kernel_module; mod = mod->next) {
long len;
const char *q;
 
#define safe_copy_str(str, len) \
do { \
if (left < len) \
goto fini; \
memcpy(p, str, len); p += len, left -= len; \
} while (0)
#define safe_copy_cstr(str) safe_copy_str(str, sizeof(str)-1)
 
len = strlen(mod->name);
safe_copy_str(mod->name, len);
 
if ((len = 20 - len) > 0) {
if (left < len)
goto fini;
memset(p, ' ', len);
p += len;
left -= len;
}
 
len = sprintf(tmpstr, "%8lu", mod->size);
safe_copy_str(tmpstr, len);
 
if (mod->flags & MOD_RUNNING) {
len = sprintf(tmpstr, "%4ld",
(mod_member_present(mod, can_unload)
&& mod->can_unload
? -1L : (long)atomic_read(&mod->uc.usecount)));
safe_copy_str(tmpstr, len);
}
 
if (mod->flags & MOD_DELETED)
safe_copy_cstr(" (deleted)");
else if (mod->flags & MOD_RUNNING) {
if (mod->flags & MOD_AUTOCLEAN)
safe_copy_cstr(" (autoclean)");
if (!(mod->flags & MOD_USED_ONCE))
safe_copy_cstr(" (unused)");
}
else if (mod->flags & MOD_INITIALIZING)
safe_copy_cstr(" (initializing)");
else
safe_copy_cstr(" (uninitialized)");
 
if ((ref = mod->refs) != NULL) {
safe_copy_cstr(" [");
while (1) {
q = ref->ref->name;
len = strlen(q);
safe_copy_str(q, len);
 
if ((ref = ref->next_ref) != NULL)
safe_copy_cstr(" ");
else
break;
}
safe_copy_cstr("]");
}
safe_copy_cstr("\n");
 
#undef safe_copy_str
#undef safe_copy_cstr
}
 
fini:
return PAGE_SIZE - left;
}
 
/*
* Called by the /proc file system to return a current list of ksyms.
*/
 
struct mod_sym {
struct module *mod;
int index;
};
 
/* iterator */
 
static void *s_start(struct seq_file *m, loff_t *pos)
{
struct mod_sym *p = kmalloc(sizeof(*p), GFP_KERNEL);
struct module *v;
loff_t n = *pos;
 
if (!p)
return ERR_PTR(-ENOMEM);
lock_kernel();
for (v = module_list, n = *pos; v; n -= v->nsyms, v = v->next) {
if (n < v->nsyms) {
p->mod = v;
p->index = n;
return p;
}
}
unlock_kernel();
kfree(p);
return NULL;
}
 
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
struct mod_sym *v = p;
(*pos)++;
if (++v->index >= v->mod->nsyms) {
do {
v->mod = v->mod->next;
if (!v->mod) {
unlock_kernel();
kfree(p);
return NULL;
}
} while (!v->mod->nsyms);
v->index = 0;
}
return p;
}
 
static void s_stop(struct seq_file *m, void *p)
{
if (p && !IS_ERR(p)) {
unlock_kernel();
kfree(p);
}
}
 
static int s_show(struct seq_file *m, void *p)
{
struct mod_sym *v = p;
struct module_symbol *sym;
 
if (!MOD_CAN_QUERY(v->mod))
return 0;
sym = &v->mod->syms[v->index];
if (*v->mod->name)
seq_printf(m, "%0*lx %s\t[%s]\n", (int)(2*sizeof(void*)),
sym->value, sym->name, v->mod->name);
else
seq_printf(m, "%0*lx %s\n", (int)(2*sizeof(void*)),
sym->value, sym->name);
return 0;
}
 
struct seq_operations ksyms_op = {
start: s_start,
next: s_next,
stop: s_stop,
show: s_show
};
 
#else /* CONFIG_MODULES */
 
/* Dummy syscalls for people who don't want modules */
 
asmlinkage unsigned long
sys_create_module(const char *name_user, size_t size)
{
return -ENOSYS;
}
 
asmlinkage long
sys_init_module(const char *name_user, struct module *mod_user)
{
return -ENOSYS;
}
 
asmlinkage long
sys_delete_module(const char *name_user)
{
return -ENOSYS;
}
 
asmlinkage long
sys_query_module(const char *name_user, int which, char *buf, size_t bufsize,
size_t *ret)
{
/* Let the program know about the new interface. Not that
it'll do them much good. */
if (which == 0)
return 0;
 
return -ENOSYS;
}
 
asmlinkage long
sys_get_kernel_syms(struct kernel_sym *table)
{
return -ENOSYS;
}
 
int try_inc_mod_count(struct module *mod)
{
return 1;
}
 
#endif /* CONFIG_MODULES */
/sched.c
0,0 → 1,1397
/*
* linux/kernel/sched.c
*
* Kernel scheduler and related syscalls
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
* make semaphores SMP safe
* 1998-11-19 Implemented schedule_timeout() and related stuff
* by Andrea Arcangeli
* 1998-12-28 Implemented better SMP scheduling by Ingo Molnar
*/
 
/*
* 'sched.c' is the main kernel file. It contains scheduling primitives
* (sleep_on, wakeup, schedule etc) as well as a number of simple system
* call functions (type getpid()), which just extract a field from
* current-task
*/
 
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/smp_lock.h>
#include <linux/nmi.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/completion.h>
#include <linux/prefetch.h>
#include <linux/compiler.h>
 
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
 
extern void timer_bh(void);
extern void tqueue_bh(void);
extern void immediate_bh(void);
 
/*
* scheduler variables
*/
 
unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
 
extern void mem_use(void);
 
/*
* Scheduling quanta.
*
* NOTE! The unix "nice" value influences how long a process
* gets. The nice value ranges from -20 to +19, where -20
* is a "high-priority" task and +19 is the lowest-priority
* task.
*
* We want the time-slice to be around 50ms or so, so this
* calculation depends on the value of HZ.
*/
#if HZ < 200
#define TICK_SCALE(x) ((x) >> 2)
#elif HZ < 400
#define TICK_SCALE(x) ((x) >> 1)
#elif HZ < 800
#define TICK_SCALE(x) (x)
#elif HZ < 1600
#define TICK_SCALE(x) ((x) << 1)
#else
#define TICK_SCALE(x) ((x) << 2)
#endif
 
#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1)
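
/*
 * Worked example: with HZ == 100, TICK_SCALE(x) is (x) >> 2, so
 *   NICE_TO_TICKS(0)   = (20 >> 2) + 1 =  6 ticks, about  60 ms,
 *   NICE_TO_TICKS(-20) = (40 >> 2) + 1 = 11 ticks, about 110 ms,
 *   NICE_TO_TICKS(19)  = ( 1 >> 2) + 1 =  1 tick,  about  10 ms.
 */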
 
 
/*
* Init task must be ok at boot for the ix86 as we will check its signals
* via the SMP irq return path.
*/
struct task_struct * init_tasks[NR_CPUS] = {&init_task, };
 
/*
* The tasklist_lock protects the linked list of processes.
*
* The runqueue_lock locks the parts that actually access
* and change the run-queues, and have to be interrupt-safe.
*
* If both locks are to be concurrently held, the runqueue_lock
* nests inside the tasklist_lock.
*
* task->alloc_lock nests inside tasklist_lock.
*/
spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */
rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
 
static LIST_HEAD(runqueue_head);
 
/*
* We align per-CPU scheduling data on cacheline boundaries,
* to prevent cacheline ping-pong.
*/
static union {
struct schedule_data {
struct task_struct * curr;
cycles_t last_schedule;
} schedule_data;
char __pad [SMP_CACHE_BYTES];
} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
 
#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
 
struct kernel_stat kstat;
extern struct task_struct *child_reaper;
 
#ifdef CONFIG_SMP
 
#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
#define can_schedule(p,cpu) \
((p)->cpus_runnable & (p)->cpus_allowed & (1UL << cpu))
 
#else
 
#define idle_task(cpu) (&init_task)
#define can_schedule(p,cpu) (1)
 
#endif
 
void scheduling_functions_start_here(void) { }
 
/*
* This is the function that decides how desirable a process is..
* You can weigh different processes against each other depending
* on what CPU they've run on lately etc to try to handle cache
* and TLB miss penalties.
*
* Return values:
* -1000: never select this
* 0: out of time, recalculate counters (but it might still be
* selected)
* +ve: "goodness" value (the larger, the better)
* +1000: realtime process, select this.
*/
 
static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
{
int weight;
 
/*
* select the current process after every other
* runnable process, but before the idle thread.
* Also, don't trigger a counter recalculation.
*/
weight = -1;
if (p->policy & SCHED_YIELD)
goto out;
 
/*
* Non-RT process - normal case first.
*/
if (p->policy == SCHED_OTHER) {
/*
* Give the process a first-approximation goodness value
* according to the number of clock-ticks it has left.
*
* Don't do any other calculations if the time slice is
* over..
*/
weight = p->counter;
if (!weight)
goto out;
#ifdef CONFIG_SMP
/* Give a largish advantage to the same processor... */
/* (this is equivalent to penalizing other processors) */
if (p->processor == this_cpu)
weight += PROC_CHANGE_PENALTY;
#endif
 
/* .. and a slight advantage to the current MM */
if (p->mm == this_mm || !p->mm)
weight += 1;
weight += 20 - p->nice;
goto out;
}
 
/*
* Realtime process, select the first one on the
* runqueue (taking priorities within processes
* into account).
*/
weight = 1000 + p->rt_priority;
out:
return weight;
}
 
/*
* the 'goodness value' of replacing a process on a given CPU.
* positive value means 'replace', zero or negative means 'don't'.
*/
static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
{
return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
}
 
/*
* This is ugly, but reschedule_idle() is very timing-critical.
* We are called with the runqueue spinlock held and we must
* not claim the tasklist_lock.
*/
static FASTCALL(void reschedule_idle(struct task_struct * p));
 
static void reschedule_idle(struct task_struct * p)
{
#ifdef CONFIG_SMP
int this_cpu = smp_processor_id();
struct task_struct *tsk, *target_tsk;
int cpu, best_cpu, i, max_prio;
cycles_t oldest_idle;
 
/*
* shortcut if the woken up task's last CPU is
* idle now.
*/
best_cpu = p->processor;
if (can_schedule(p, best_cpu)) {
tsk = idle_task(best_cpu);
if (cpu_curr(best_cpu) == tsk) {
int need_resched;
send_now_idle:
/*
* If need_resched == -1 then we can skip sending
* the IPI altogether, tsk->need_resched is
* actively watched by the idle thread.
*/
need_resched = tsk->need_resched;
tsk->need_resched = 1;
if ((best_cpu != this_cpu) && !need_resched)
smp_send_reschedule(best_cpu);
return;
}
}
 
/*
* We know that the preferred CPU has a cache-affine current
* process, so let's try to find a new idle CPU for the woken-up
* process. Select the least recently active idle CPU. (That
* one will have the least active cache context.) Also find
* the executing process which has the least priority.
*/
oldest_idle = (cycles_t) -1;
target_tsk = NULL;
max_prio = 0;
 
for (i = 0; i < smp_num_cpus; i++) {
cpu = cpu_logical_map(i);
if (!can_schedule(p, cpu))
continue;
tsk = cpu_curr(cpu);
/*
* We use the first available idle CPU. This creates
* a priority list between idle CPUs, but this is not
* a problem.
*/
if (tsk == idle_task(cpu)) {
#if defined(__i386__) && defined(CONFIG_SMP)
/*
* Check if two siblings are idle in the same
* physical package. Use them if found.
*/
if (smp_num_siblings == 2) {
if (cpu_curr(cpu_sibling_map[cpu]) ==
idle_task(cpu_sibling_map[cpu])) {
oldest_idle = last_schedule(cpu);
target_tsk = tsk;
break;
}
}
#endif
if (last_schedule(cpu) < oldest_idle) {
oldest_idle = last_schedule(cpu);
target_tsk = tsk;
}
} else {
if (oldest_idle == (cycles_t)-1) {
int prio = preemption_goodness(tsk, p, cpu);
 
if (prio > max_prio) {
max_prio = prio;
target_tsk = tsk;
}
}
}
}
tsk = target_tsk;
if (tsk) {
if (oldest_idle != (cycles_t)-1) {
best_cpu = tsk->processor;
goto send_now_idle;
}
tsk->need_resched = 1;
if (tsk->processor != this_cpu)
smp_send_reschedule(tsk->processor);
}
return;
 
#else /* UP */
int this_cpu = smp_processor_id();
struct task_struct *tsk;
 
tsk = cpu_curr(this_cpu);
if (preemption_goodness(tsk, p, this_cpu) > 0)
tsk->need_resched = 1;
#endif
}
 
/*
* Careful!
*
* This has to add the process to the _end_ of the
* run-queue, not the beginning. The goodness value will
* determine whether this process will run next. This is
* important to get SCHED_FIFO and SCHED_RR right, where
* a process that is either pre-empted or its time slice
* has expired, should be moved to the tail of the run
* queue for its priority - Bhavesh Davda
*/
static inline void add_to_runqueue(struct task_struct * p)
{
list_add_tail(&p->run_list, &runqueue_head);
nr_running++;
}
 
static inline void move_last_runqueue(struct task_struct * p)
{
list_del(&p->run_list);
list_add_tail(&p->run_list, &runqueue_head);
}
 
/*
* Wake up a process. Put it on the run-queue if it's not
* already there. The "current" process is always on the
* run-queue (except when the actual re-schedule is in
* progress), and as such you're allowed to do the simpler
* "current->state = TASK_RUNNING" to mark yourself runnable
* without the overhead of this.
*/
static inline int try_to_wake_up(struct task_struct * p, int synchronous)
{
unsigned long flags;
int success = 0;
 
/*
* We want the common case to fall straight through, thus the goto.
*/
spin_lock_irqsave(&runqueue_lock, flags);
p->state = TASK_RUNNING;
if (task_on_runqueue(p))
goto out;
add_to_runqueue(p);
if (!synchronous || !(p->cpus_allowed & (1UL << smp_processor_id())))
reschedule_idle(p);
success = 1;
out:
spin_unlock_irqrestore(&runqueue_lock, flags);
return success;
}
 
inline int wake_up_process(struct task_struct * p)
{
return try_to_wake_up(p, 0);
}
 
static void process_timeout(unsigned long __data)
{
struct task_struct * p = (struct task_struct *) __data;
 
wake_up_process(p);
}
 
/**
* schedule_timeout - sleep until timeout
* @timeout: timeout value in jiffies
*
* Make the current task sleep until @timeout jiffies have
* elapsed. The routine will return immediately unless
* the current task state has been set (see set_current_state()).
*
* You can set the task state as follows -
*
* %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
* pass before the routine returns. The routine will return 0.
*
* %TASK_INTERRUPTIBLE - the routine may return early if a signal is
* delivered to the current task. In this case the remaining time
* in jiffies will be returned, or 0 if the timer expired in time.
*
* The current task state is guaranteed to be TASK_RUNNING when this
* routine returns.
*
* Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
* the CPU away without a bound on the timeout. In this case the return
* value will be %MAX_SCHEDULE_TIMEOUT.
*
* In all cases the return value is guaranteed to be non-negative.
*/
signed long schedule_timeout(signed long timeout)
{
struct timer_list timer;
unsigned long expire;
 
switch (timeout)
{
case MAX_SCHEDULE_TIMEOUT:
/*
* These two special cases are useful to keep the caller
* simple. Nothing more. We could have taken
* MAX_SCHEDULE_TIMEOUT from one of the negative values,
* but I'd like to return a valid offset (>= 0) so that
* the caller can do whatever it wants with the retval.
*/
schedule();
goto out;
default:
/*
* Another bit of paranoia. Note that the retval will be
* 0, since no piece of the kernel is supposed to check
* for a negative retval from schedule_timeout() (it
* should never happen anyway). You just have the printk()
* that will tell you if something has gone wrong, and where.
*/
if (timeout < 0)
{
printk(KERN_ERR "schedule_timeout: wrong timeout "
"value %lx from %p\n", timeout,
__builtin_return_address(0));
current->state = TASK_RUNNING;
goto out;
}
}
 
expire = timeout + jiffies;
 
init_timer(&timer);
timer.expires = expire;
timer.data = (unsigned long) current;
timer.function = process_timeout;
 
add_timer(&timer);
schedule();
del_timer_sync(&timer);
 
timeout = expire - jiffies;
 
out:
return timeout < 0 ? 0 : timeout;
}
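
/*
 * Usage sketch (illustrative only, not from this file): sleep for roughly
 * two seconds in process context, waking early if a signal is delivered.
 */
static signed long my_nap(void)
{
	set_current_state(TASK_INTERRUPTIBLE);
	/* returns 0 if the full two seconds elapsed, else the jiffies left */
	return schedule_timeout(2 * HZ);
}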
 
/*
* schedule_tail() is getting called from the fork return path. This
* cleans up all remaining scheduler things, without impacting the
* common case.
*/
static inline void __schedule_tail(struct task_struct *prev)
{
#ifdef CONFIG_SMP
int policy;
 
/*
* prev->policy can be written from here only before `prev'
* can be scheduled (before setting prev->cpus_runnable to ~0UL).
* Of course it must also be read before allowing prev
* to be rescheduled, but since the write depends on the read
* to complete, wmb() is enough. (the spin_lock() acquired
* before setting cpus_runnable is not enough because the spin_lock()
* common code semantics allows code outside the critical section
* to enter inside the critical section)
*/
policy = prev->policy;
prev->policy = policy & ~SCHED_YIELD;
wmb();
 
/*
* fast path falls through. We have to clear cpus_runnable before
* checking prev->state to avoid a wakeup race. Protect against
* the task exiting early.
*/
task_lock(prev);
task_release_cpu(prev);
mb();
if (prev->state == TASK_RUNNING)
goto needs_resched;
 
out_unlock:
task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */
return;
 
/*
* Slow path - we 'push' the previous process and
* reschedule_idle() will attempt to find a new
* processor for it. (but it might preempt the
* current process as well.) We must take the runqueue
* lock and re-check prev->state to be correct. It might
* still happen that this process has a preemption
* 'in progress' already - but this is not a problem and
* might happen in other circumstances as well.
*/
needs_resched:
{
unsigned long flags;
 
/*
* Avoid taking the runqueue lock in cases where
* no preemption check is necessary:
*/
if ((prev == idle_task(smp_processor_id())) ||
(policy & SCHED_YIELD))
goto out_unlock;
 
spin_lock_irqsave(&runqueue_lock, flags);
if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev))
reschedule_idle(prev);
spin_unlock_irqrestore(&runqueue_lock, flags);
goto out_unlock;
}
#else
prev->policy &= ~SCHED_YIELD;
#endif /* CONFIG_SMP */
}
 
asmlinkage void schedule_tail(struct task_struct *prev)
{
__schedule_tail(prev);
}
 
/*
* 'schedule()' is the scheduler function. It's a very simple and nice
* scheduler: it's not perfect, but certainly works for most things.
*
* The goto is "interesting".
*
* NOTE!! Task 0 is the 'idle' task, which gets called when no other
* tasks can run. It can not be killed, and it cannot sleep. The 'state'
* information in task[0] is never used.
*/
asmlinkage void schedule(void)
{
struct schedule_data * sched_data;
struct task_struct *prev, *next, *p;
struct list_head *tmp;
int this_cpu, c;
 
 
spin_lock_prefetch(&runqueue_lock);
 
BUG_ON(!current->active_mm);
need_resched_back:
prev = current;
this_cpu = prev->processor;
 
if (unlikely(in_interrupt())) {
printk("Scheduling in interrupt\n");
BUG();
}
 
release_kernel_lock(prev, this_cpu);
 
/*
* 'sched_data' is protected by the fact that we can run
* only one process per CPU.
*/
sched_data = & aligned_data[this_cpu].schedule_data;
 
spin_lock_irq(&runqueue_lock);
 
/* move an exhausted RR process to be last.. */
if (unlikely(prev->policy == SCHED_RR))
if (!prev->counter) {
prev->counter = NICE_TO_TICKS(prev->nice);
move_last_runqueue(prev);
}
 
switch (prev->state) {
case TASK_INTERRUPTIBLE:
if (signal_pending(prev)) {
prev->state = TASK_RUNNING;
break;
}
default:
del_from_runqueue(prev);
case TASK_RUNNING:;
}
prev->need_resched = 0;
 
/*
* this is the scheduler proper:
*/
 
repeat_schedule:
/*
* Default process to select..
*/
next = idle_task(this_cpu);
c = -1000;
list_for_each(tmp, &runqueue_head) {
p = list_entry(tmp, struct task_struct, run_list);
if (can_schedule(p, this_cpu)) {
int weight = goodness(p, this_cpu, prev->active_mm);
if (weight > c)
c = weight, next = p;
}
}
 
/* Do we need to re-calculate counters? */
if (unlikely(!c)) {
struct task_struct *p;
 
spin_unlock_irq(&runqueue_lock);
read_lock(&tasklist_lock);
for_each_task(p)
p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
read_unlock(&tasklist_lock);
spin_lock_irq(&runqueue_lock);
goto repeat_schedule;
}
 
/*
* from this point on nothing can prevent us from
* switching to the next task, save this fact in
* sched_data.
*/
sched_data->curr = next;
task_set_cpu(next, this_cpu);
spin_unlock_irq(&runqueue_lock);
 
if (unlikely(prev == next)) {
/* We won't go through the normal tail, so do this by hand */
prev->policy &= ~SCHED_YIELD;
goto same_process;
}
 
#ifdef CONFIG_SMP
/*
* maintain the per-process 'last schedule' value.
* (this has to be recalculated even if we reschedule to
* the same process) Currently this is only used on SMP,
* and it's approximate, so we do not have to maintain
* it while holding the runqueue spinlock.
*/
sched_data->last_schedule = get_cycles();
 
/*
* We drop the scheduler lock early (it's a global spinlock),
* thus we have to lock the previous process from getting
* rescheduled during switch_to().
*/
 
#endif /* CONFIG_SMP */
 
kstat.context_swtch++;
/*
* there are 3 processes which are affected by a context switch:
*
* prev == .... ==> (last => next)
*
* It's the 'much more previous' 'prev' that is on next's stack,
* but prev is set to (the just run) 'last' process by switch_to().
* This might sound slightly confusing but makes tons of sense.
*/
prepare_to_switch();
{
struct mm_struct *mm = next->mm;
struct mm_struct *oldmm = prev->active_mm;
if (!mm) {
BUG_ON(next->active_mm);
next->active_mm = oldmm;
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next, this_cpu);
} else {
BUG_ON(next->active_mm != mm);
switch_mm(oldmm, mm, next, this_cpu);
}
 
if (!prev->mm) {
prev->active_mm = NULL;
mmdrop(oldmm);
}
}
 
/*
* This just switches the register state and the
* stack.
*/
switch_to(prev, next, prev);
__schedule_tail(prev);
 
same_process:
reacquire_kernel_lock(current);
if (current->need_resched)
goto need_resched_back;
return;
}
 
/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything
* up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the
* non-exclusive tasks and one exclusive task.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero
* in this (rare) case, and we handle it by continuing to scan the queue.
*/
static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, const int sync)
{
struct list_head *tmp;
struct task_struct *p;
 
CHECK_MAGIC_WQHEAD(q);
WQ_CHECK_LIST_HEAD(&q->task_list);
list_for_each(tmp,&q->task_list) {
unsigned int state;
wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
 
CHECK_MAGIC(curr->__magic);
p = curr->task;
state = p->state;
if (state & mode) {
WQ_NOTE_WAKER(curr);
if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
}
}
 
void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr)
{
if (q) {
unsigned long flags;
wq_read_lock_irqsave(&q->lock, flags);
__wake_up_common(q, mode, nr, 0);
wq_read_unlock_irqrestore(&q->lock, flags);
}
}
 
void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr)
{
if (q) {
unsigned long flags;
wq_read_lock_irqsave(&q->lock, flags);
__wake_up_common(q, mode, nr, 1);
wq_read_unlock_irqrestore(&q->lock, flags);
}
}
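 
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the classic open-coded wait loop that pairs with __wake_up() above.  The
 * wake_up() macro ends up here with mode == TASK_UNINTERRUPTIBLE |
 * TASK_INTERRUPTIBLE and nr == 1.  The names demo_wq, demo_event and
 * demo_wait_for_event are hypothetical.
 */
static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static volatile int demo_event;

static int demo_wait_for_event(void)
{
        DECLARE_WAITQUEUE(wait, current);
        int ret = 0;

        add_wait_queue(&demo_wq, &wait);
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (demo_event)
                        break;                  /* condition became true */
                if (signal_pending(current)) {
                        ret = -ERESTARTSYS;     /* interrupted by a signal */
                        break;
                }
                schedule();                     /* sleep until woken */
        }
        set_current_state(TASK_RUNNING);
        remove_wait_queue(&demo_wq, &wait);
        return ret;
}

/* The waker side simply does: demo_event = 1; wake_up(&demo_wq); */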
 
void complete(struct completion *x)
{
unsigned long flags;
 
spin_lock_irqsave(&x->wait.lock, flags);
x->done++;
__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, 0);
spin_unlock_irqrestore(&x->wait.lock, flags);
}
 
void wait_for_completion(struct completion *x)
{
spin_lock_irq(&x->wait.lock);
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);
 
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue_tail(&x->wait, &wait);
do {
__set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&x->wait.lock);
schedule();
spin_lock_irq(&x->wait.lock);
} while (!x->done);
__remove_wait_queue(&x->wait, &wait);
}
x->done--;
spin_unlock_irq(&x->wait.lock);
}
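 
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * typical pairing of the completion primitives above -- one context blocks
 * in wait_for_completion() until another calls complete().  The names
 * demo_done, demo_worker and demo_start_and_wait are hypothetical.
 */
static DECLARE_COMPLETION(demo_done);

static int demo_worker(void *unused)
{
        /* ... do the asynchronous work ... */
        complete(&demo_done);           /* wakes exactly one exclusive waiter */
        return 0;
}

static void demo_start_and_wait(void)
{
        kernel_thread(demo_worker, NULL, CLONE_FS | CLONE_FILES);
        wait_for_completion(&demo_done);        /* sleeps uninterruptibly until done */
}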
 
#define SLEEP_ON_VAR \
unsigned long flags; \
wait_queue_t wait; \
init_waitqueue_entry(&wait, current);
 
#define SLEEP_ON_HEAD \
wq_write_lock_irqsave(&q->lock,flags); \
__add_wait_queue(q, &wait); \
wq_write_unlock(&q->lock);
 
#define SLEEP_ON_TAIL \
wq_write_lock_irq(&q->lock); \
__remove_wait_queue(q, &wait); \
wq_write_unlock_irqrestore(&q->lock,flags);
 
void interruptible_sleep_on(wait_queue_head_t *q)
{
SLEEP_ON_VAR
 
current->state = TASK_INTERRUPTIBLE;
 
SLEEP_ON_HEAD
schedule();
SLEEP_ON_TAIL
}
 
long interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
SLEEP_ON_VAR
 
current->state = TASK_INTERRUPTIBLE;
 
SLEEP_ON_HEAD
timeout = schedule_timeout(timeout);
SLEEP_ON_TAIL
 
return timeout;
}
 
void sleep_on(wait_queue_head_t *q)
{
SLEEP_ON_VAR
current->state = TASK_UNINTERRUPTIBLE;
 
SLEEP_ON_HEAD
schedule();
SLEEP_ON_TAIL
}
 
long sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
SLEEP_ON_VAR
current->state = TASK_UNINTERRUPTIBLE;
 
SLEEP_ON_HEAD
timeout = schedule_timeout(timeout);
SLEEP_ON_TAIL
 
return timeout;
}
 
void scheduling_functions_end_here(void) { }
 
#if CONFIG_SMP
/**
* set_cpus_allowed() - change a given task's processor affinity
* @p: task to bind
* @new_mask: bitmask of allowed processors
*
* Upon return, the task is running on a legal processor. Note the caller
* must have a valid reference to the task: it must not exit() prematurely.
* This call can sleep; do not hold any locks when calling it.
*/
void set_cpus_allowed(struct task_struct *p, unsigned long new_mask)
{
new_mask &= cpu_online_map;
BUG_ON(!new_mask);
 
p->cpus_allowed = new_mask;
 
/*
* If the task is on a no-longer-allowed processor, we need to move
* it. If the task is not current, then set need_resched and send
* its processor an IPI to reschedule.
*/
if (!(p->cpus_runnable & p->cpus_allowed)) {
if (p != current) {
p->need_resched = 1;
smp_send_reschedule(p->processor);
}
/*
* Wait until we are on a legal processor. If the task is
* current, then we should be on a legal processor the next
* time we reschedule. Otherwise, we need to wait for the IPI.
*/
while (!(p->cpus_runnable & p->cpus_allowed))
schedule();
}
}
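 
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * how a kernel thread might pin itself to the first logical CPU using the
 * helper above; cpu_logical_map() is the same mapping ksoftirqd uses
 * elsewhere in this tree.  demo_bind_self is a hypothetical name.
 */
static void demo_bind_self(void)
{
        unsigned long mask = 1UL << cpu_logical_map(0);

        set_cpus_allowed(current, mask);        /* may sleep until we run there */
}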
#endif /* CONFIG_SMP */
 
#ifndef __alpha__
 
/*
* This has been replaced by sys_setpriority. Maybe it should be
* moved into the arch dependent tree for those ports that require
* it for backward compatibility?
*/
 
asmlinkage long sys_nice(int increment)
{
long newprio;
 
/*
* Setpriority might change our priority at the same moment.
* We don't have to worry. Conceptually one call occurs first
* and we have a single winner.
*/
if (increment < 0) {
if (!capable(CAP_SYS_NICE))
return -EPERM;
if (increment < -40)
increment = -40;
}
if (increment > 40)
increment = 40;
 
newprio = current->nice + increment;
if (newprio < -20)
newprio = -20;
if (newprio > 19)
newprio = 19;
current->nice = newprio;
return 0;
}
 
#endif
 
static inline struct task_struct *find_process_by_pid(pid_t pid)
{
struct task_struct *tsk = current;
 
if (pid)
tsk = find_task_by_pid(pid);
return tsk;
}
 
static int setscheduler(pid_t pid, int policy,
struct sched_param *param)
{
struct sched_param lp;
struct task_struct *p;
int retval;
 
retval = -EINVAL;
if (!param || pid < 0)
goto out_nounlock;
 
retval = -EFAULT;
if (copy_from_user(&lp, param, sizeof(struct sched_param)))
goto out_nounlock;
 
/*
* We play safe to avoid deadlocks.
*/
read_lock_irq(&tasklist_lock);
spin_lock(&runqueue_lock);
 
p = find_process_by_pid(pid);
 
retval = -ESRCH;
if (!p)
goto out_unlock;
if (policy < 0)
policy = p->policy;
else {
retval = -EINVAL;
if (policy != SCHED_FIFO && policy != SCHED_RR &&
policy != SCHED_OTHER)
goto out_unlock;
}
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
* priority for SCHED_OTHER is 0.
*/
retval = -EINVAL;
if (lp.sched_priority < 0 || lp.sched_priority > 99)
goto out_unlock;
if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
goto out_unlock;
 
retval = -EPERM;
if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
!capable(CAP_SYS_NICE))
goto out_unlock;
if ((current->euid != p->euid) && (current->euid != p->uid) &&
!capable(CAP_SYS_NICE))
goto out_unlock;
 
retval = 0;
p->policy = policy;
p->rt_priority = lp.sched_priority;
 
current->need_resched = 1;
 
out_unlock:
spin_unlock(&runqueue_lock);
read_unlock_irq(&tasklist_lock);
 
out_nounlock:
return retval;
}
 
asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
struct sched_param *param)
{
return setscheduler(pid, policy, param);
}
 
asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param *param)
{
return setscheduler(pid, -1, param);
}
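 
/*
 * Userspace illustration (editor's addition, not part of this file): the two
 * syscalls above are what the C library wrappers reach.  A process holding
 * CAP_SYS_NICE can switch itself to a real-time policy like this (priority
 * 50 is just an arbitrary value from the 1..99 range checked above):
 *
 *      #include <sched.h>
 *
 *      struct sched_param sp;
 *      sp.sched_priority = 50;
 *      if (sched_setscheduler(0, SCHED_FIFO, &sp) < 0)
 *              perror("sched_setscheduler");
 */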
 
asmlinkage long sys_sched_getscheduler(pid_t pid)
{
struct task_struct *p;
int retval;
 
retval = -EINVAL;
if (pid < 0)
goto out_nounlock;
 
retval = -ESRCH;
read_lock(&tasklist_lock);
p = find_process_by_pid(pid);
if (p)
retval = p->policy & ~SCHED_YIELD;
read_unlock(&tasklist_lock);
 
out_nounlock:
return retval;
}
 
asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
{
struct task_struct *p;
struct sched_param lp;
int retval;
 
retval = -EINVAL;
if (!param || pid < 0)
goto out_nounlock;
 
read_lock(&tasklist_lock);
p = find_process_by_pid(pid);
retval = -ESRCH;
if (!p)
goto out_unlock;
lp.sched_priority = p->rt_priority;
read_unlock(&tasklist_lock);
 
/*
* This one might sleep, we cannot do it with a spinlock held ...
*/
retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
 
out_nounlock:
return retval;
 
out_unlock:
read_unlock(&tasklist_lock);
return retval;
}
 
asmlinkage long sys_sched_yield(void)
{
/*
* Trick. sched_yield() first counts the number of truly
* 'pending' runnable processes, then returns if it's
* only the current process. (This test does not have
* to be atomic.) In threaded applications this optimization
* gets triggered quite often.
*/
 
int nr_pending = nr_running;
 
#if CONFIG_SMP
int i;
 
// Subtract non-idle processes running on other CPUs.
for (i = 0; i < smp_num_cpus; i++) {
int cpu = cpu_logical_map(i);
if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
nr_pending--;
}
#else
// on UP this process is on the runqueue as well
nr_pending--;
#endif
if (nr_pending) {
/*
* This process can only be rescheduled by us,
* so this is safe without any locking.
*/
if (current->policy == SCHED_OTHER)
current->policy |= SCHED_YIELD;
current->need_resched = 1;
 
spin_lock_irq(&runqueue_lock);
move_last_runqueue(current);
spin_unlock_irq(&runqueue_lock);
}
return 0;
}
 
/**
* yield - yield the current processor to other threads.
*
* this is a shortcut for kernel-space yielding - it marks the
* thread runnable and calls sys_sched_yield().
*/
void yield(void)
{
set_current_state(TASK_RUNNING);
sys_sched_yield();
schedule();
}
 
void __cond_resched(void)
{
set_current_state(TASK_RUNNING);
schedule();
}
 
asmlinkage long sys_sched_get_priority_max(int policy)
{
int ret = -EINVAL;
 
switch (policy) {
case SCHED_FIFO:
case SCHED_RR:
ret = 99;
break;
case SCHED_OTHER:
ret = 0;
break;
}
return ret;
}
 
asmlinkage long sys_sched_get_priority_min(int policy)
{
int ret = -EINVAL;
 
switch (policy) {
case SCHED_FIFO:
case SCHED_RR:
ret = 1;
break;
case SCHED_OTHER:
ret = 0;
}
return ret;
}
 
asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
{
struct timespec t;
struct task_struct *p;
int retval = -EINVAL;
 
if (pid < 0)
goto out_nounlock;
 
retval = -ESRCH;
read_lock(&tasklist_lock);
p = find_process_by_pid(pid);
if (p)
jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice),
&t);
read_unlock(&tasklist_lock);
if (p)
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
out_nounlock:
return retval;
}
 
static void show_task(struct task_struct * p)
{
unsigned long free = 0;
int state;
static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
 
printk("%-13.13s ", p->comm);
state = p->state ? ffz(~p->state) + 1 : 0;
if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
printk(stat_nam[state]);
else
printk(" ");
#if (BITS_PER_LONG == 32)
if (p == current)
printk(" current ");
else
printk(" %08lX ", thread_saved_pc(&p->thread));
#else
if (p == current)
printk(" current task ");
else
printk(" %016lx ", thread_saved_pc(&p->thread));
#endif
{
unsigned long * n = (unsigned long *) (p+1);
while (!*n)
n++;
free = (unsigned long) n - (unsigned long)(p+1);
}
printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid);
if (p->p_cptr)
printk("%5d ", p->p_cptr->pid);
else
printk(" ");
if (p->p_ysptr)
printk("%7d", p->p_ysptr->pid);
else
printk(" ");
if (p->p_osptr)
printk(" %5d", p->p_osptr->pid);
else
printk(" ");
if (!p->mm)
printk(" (L-TLB)\n");
else
printk(" (NOTLB)\n");
 
{
extern void show_trace_task(struct task_struct *tsk);
show_trace_task(p);
}
}
 
char * render_sigset_t(sigset_t *set, char *buffer)
{
int i = _NSIG, x;
do {
i -= 4, x = 0;
if (sigismember(set, i+1)) x |= 1;
if (sigismember(set, i+2)) x |= 2;
if (sigismember(set, i+3)) x |= 4;
if (sigismember(set, i+4)) x |= 8;
*buffer++ = (x < 10 ? '0' : 'a' - 10) + x;
} while (i >= 4);
*buffer = 0;
return buffer;
}
 
void show_state(void)
{
struct task_struct *p;
 
#if (BITS_PER_LONG == 32)
printk("\n"
" free sibling\n");
printk(" task PC stack pid father child younger older\n");
#else
printk("\n"
" free sibling\n");
printk(" task PC stack pid father child younger older\n");
#endif
read_lock(&tasklist_lock);
for_each_task(p) {
/*
* reset the NMI-timeout; listing all tasks on a slow
* console might take a lot of time:
*/
touch_nmi_watchdog();
show_task(p);
}
read_unlock(&tasklist_lock);
}
 
/**
* reparent_to_init() - Reparent the calling kernel thread to the init task.
*
* If a kernel thread is launched as a result of a system call, or if
* it ever exits, it should generally reparent itself to init so that
* it is correctly cleaned up on exit.
*
* Various pieces of task state, such as scheduling policy and priority, may have
* been inherited from a user process, so we reset them to sane values here.
*
* NOTE that reparent_to_init() gives the caller full capabilities.
*/
void reparent_to_init(void)
{
struct task_struct *this_task = current;
 
write_lock_irq(&tasklist_lock);
 
/* Reparent to init */
REMOVE_LINKS(this_task);
this_task->p_pptr = child_reaper;
this_task->p_opptr = child_reaper;
SET_LINKS(this_task);
 
/* Set the exit signal to SIGCHLD so we signal init on exit */
this_task->exit_signal = SIGCHLD;
 
/* We also take the runqueue_lock while altering task fields
* which affect scheduling decisions */
spin_lock(&runqueue_lock);
 
this_task->ptrace = 0;
this_task->nice = DEF_NICE;
this_task->policy = SCHED_OTHER;
/* cpus_allowed? */
/* rt_priority? */
/* signals? */
this_task->cap_effective = CAP_INIT_EFF_SET;
this_task->cap_inheritable = CAP_INIT_INH_SET;
this_task->cap_permitted = CAP_FULL_SET;
this_task->keep_capabilities = 0;
memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim)));
switch_uid(INIT_USER);
 
spin_unlock(&runqueue_lock);
write_unlock_irq(&tasklist_lock);
}
 
/*
* Put all the gunge required to become a kernel thread without
* attached user resources in one place where it belongs.
*/
 
void daemonize(void)
{
struct fs_struct *fs;
 
 
/*
* If we were started as result of loading a module, close all of the
* user space pages. We don't need them, and if we didn't close them
* they would be locked into memory.
*/
exit_mm(current);
 
current->session = 1;
current->pgrp = 1;
current->tty = NULL;
 
/* Become as one with the init task */
 
exit_fs(current); /* current->fs->count--; */
fs = init_task.fs;
current->fs = fs;
atomic_inc(&fs->count);
exit_files(current);
current->files = init_task.files;
atomic_inc(&current->files->count);
}
 
extern unsigned long wait_init_idle;
 
void __init init_idle(void)
{
struct schedule_data * sched_data;
sched_data = &aligned_data[smp_processor_id()].schedule_data;
 
if (current != &init_task && task_on_runqueue(current)) {
printk("UGH! (%d:%d) was on the runqueue, removing.\n",
smp_processor_id(), current->pid);
del_from_runqueue(current);
}
sched_data->curr = current;
sched_data->last_schedule = get_cycles();
clear_bit(current->processor, &wait_init_idle);
}
 
extern void init_timervecs (void);
 
void __init sched_init(void)
{
/*
* We have to do a little magic to get the first
* process right in SMP mode.
*/
int cpu = smp_processor_id();
int nr;
 
init_task.processor = cpu;
 
for(nr = 0; nr < PIDHASH_SZ; nr++)
pidhash[nr] = NULL;
 
init_timervecs();
 
init_bh(TIMER_BH, timer_bh);
init_bh(TQUEUE_BH, tqueue_bh);
init_bh(IMMEDIATE_BH, immediate_bh);
 
/*
* The boot idle thread does lazy MMU switching as well:
*/
atomic_inc(&init_mm.mm_count);
enter_lazy_tlb(&init_mm, current, cpu);
}
/softirq.c
0,0 → 1,415
/*
* linux/kernel/softirq.c
*
* Copyright (C) 1992 Linus Torvalds
*
* Fixed a disable_bh()/enable_bh() race (was causing a console lockup)
* due to non-atomic bh_mask_count handling. Copyright (C) 1998 Andrea Arcangeli
*
* Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
*/
 
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/interrupt.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/tqueue.h>
 
/*
- No shared variables, all the data are CPU local.
- If a softirq needs serialization, let it serialize itself
by its own spinlocks.
- Even if a softirq is serialized, only the local CPU is marked for
execution. Hence, we get a sort of weak CPU binding, though it is
still not clear whether this results in better locality or not.
- These softirqs are not masked by the global cli() and start_bh_atomic()
(for obvious reasons). Hence, old parts of the code still using global locks
MUST NOT use softirqs directly, but must insert interfacing routines that
acquire the global locks. E.g. look at the BH implementation.
 
Examples:
- NET RX softirq. It is multithreaded and does not require
any global serialization.
- NET TX softirq. It kicks software netdevice queues, hence
it is logically serialized per device, but this serialization
is invisible to common code.
- Tasklets: each tasklet is serialized with respect to itself.
- Bottom halves: globally serialized, grr...
*/
 
irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
 
static struct softirq_action softirq_vec[32] __cacheline_aligned;
 
/*
* We cannot loop indefinitely here, to avoid userspace starvation,
* but we also don't want to introduce a worst-case 1/HZ latency
* for the pending events, so we let the scheduler balance
* the softirq load for us.
*/
static inline void wakeup_softirqd(unsigned cpu)
{
struct task_struct * tsk = ksoftirqd_task(cpu);
 
if (tsk && tsk->state != TASK_RUNNING)
wake_up_process(tsk);
}
 
asmlinkage void do_softirq()
{
int cpu = smp_processor_id();
__u32 pending;
unsigned long flags;
__u32 mask;
 
if (in_interrupt())
return;
 
local_irq_save(flags);
 
pending = softirq_pending(cpu);
 
if (pending) {
struct softirq_action *h;
 
mask = ~pending;
local_bh_disable();
restart:
/* Reset the pending bitmask before enabling irqs */
softirq_pending(cpu) = 0;
 
local_irq_enable();
 
h = softirq_vec;
 
do {
if (pending & 1)
h->action(h);
h++;
pending >>= 1;
} while (pending);
 
local_irq_disable();
 
pending = softirq_pending(cpu);
if (pending & mask) {
mask &= ~pending;
goto restart;
}
__local_bh_enable();
 
if (pending)
wakeup_softirqd(cpu);
}
 
local_irq_restore(flags);
}
 
/*
* This function must run with irq disabled!
*/
inline void cpu_raise_softirq(unsigned int cpu, unsigned int nr)
{
__cpu_raise_softirq(cpu, nr);
 
/*
* If we're in an interrupt or bh, we're done
* (this also catches bh-disabled code). We will
* actually run the softirq once we return from
* the irq or bh.
*
* Otherwise we wake up ksoftirqd to make sure we
* schedule the softirq soon.
*/
if (!(local_irq_count(cpu) | local_bh_count(cpu)))
wakeup_softirqd(cpu);
}
 
void raise_softirq(unsigned int nr)
{
unsigned long flags;
 
local_irq_save(flags);
cpu_raise_softirq(smp_processor_id(), nr);
local_irq_restore(flags);
}
 
void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
{
softirq_vec[nr].data = data;
softirq_vec[nr].action = action;
}
 
 
/* Tasklets */
 
struct tasklet_head tasklet_vec[NR_CPUS] __cacheline_aligned;
struct tasklet_head tasklet_hi_vec[NR_CPUS] __cacheline_aligned;
 
void __tasklet_schedule(struct tasklet_struct *t)
{
int cpu = smp_processor_id();
unsigned long flags;
 
local_irq_save(flags);
t->next = tasklet_vec[cpu].list;
tasklet_vec[cpu].list = t;
cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
local_irq_restore(flags);
}
 
void __tasklet_hi_schedule(struct tasklet_struct *t)
{
int cpu = smp_processor_id();
unsigned long flags;
 
local_irq_save(flags);
t->next = tasklet_hi_vec[cpu].list;
tasklet_hi_vec[cpu].list = t;
cpu_raise_softirq(cpu, HI_SOFTIRQ);
local_irq_restore(flags);
}
 
static void tasklet_action(struct softirq_action *a)
{
int cpu = smp_processor_id();
struct tasklet_struct *list;
 
local_irq_disable();
list = tasklet_vec[cpu].list;
tasklet_vec[cpu].list = NULL;
local_irq_enable();
 
while (list) {
struct tasklet_struct *t = list;
 
list = list->next;
 
if (tasklet_trylock(t)) {
if (!atomic_read(&t->count)) {
if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
BUG();
t->func(t->data);
tasklet_unlock(t);
continue;
}
tasklet_unlock(t);
}
 
local_irq_disable();
t->next = tasklet_vec[cpu].list;
tasklet_vec[cpu].list = t;
__cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
local_irq_enable();
}
}
 
static void tasklet_hi_action(struct softirq_action *a)
{
int cpu = smp_processor_id();
struct tasklet_struct *list;
 
local_irq_disable();
list = tasklet_hi_vec[cpu].list;
tasklet_hi_vec[cpu].list = NULL;
local_irq_enable();
 
while (list) {
struct tasklet_struct *t = list;
 
list = list->next;
 
if (tasklet_trylock(t)) {
if (!atomic_read(&t->count)) {
if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
BUG();
t->func(t->data);
tasklet_unlock(t);
continue;
}
tasklet_unlock(t);
}
 
local_irq_disable();
t->next = tasklet_hi_vec[cpu].list;
tasklet_hi_vec[cpu].list = t;
__cpu_raise_softirq(cpu, HI_SOFTIRQ);
local_irq_enable();
}
}
 
 
void tasklet_init(struct tasklet_struct *t,
void (*func)(unsigned long), unsigned long data)
{
t->next = NULL;
t->state = 0;
atomic_set(&t->count, 0);
t->func = func;
t->data = data;
}
 
void tasklet_kill(struct tasklet_struct *t)
{
if (in_interrupt())
printk("Attempt to kill tasklet from interrupt\n");
 
while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
current->state = TASK_RUNNING;
do {
yield();
} while (test_bit(TASKLET_STATE_SCHED, &t->state));
}
tasklet_unlock_wait(t);
clear_bit(TASKLET_STATE_SCHED, &t->state);
}
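 
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * typical tasklet usage built on the helpers above.  An interrupt handler
 * defers work by scheduling the tasklet, whose function then runs from
 * TASKLET_SOFTIRQ context with interrupts enabled.  The demo_* names are
 * hypothetical.
 */
static void demo_tasklet_fn(unsigned long data)
{
        /* the deferred work runs here */
}

static DECLARE_TASKLET(demo_tasklet, demo_tasklet_fn, 0);

static void demo_irq_handler_bottom(void)
{
        tasklet_schedule(&demo_tasklet);        /* safe from irq context */
}

static void demo_teardown(void)
{
        tasklet_kill(&demo_tasklet);            /* may sleep; waits for a pending run */
}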
 
 
 
/* Old style BHs */
 
static void (*bh_base[32])(void);
struct tasklet_struct bh_task_vec[32];
 
/* BHs are serialized by spinlock global_bh_lock.
 
It would still be possible to implement synchronize_bh() as
spin_unlock_wait(&global_bh_lock). That operation is not used
by the kernel at present; this lock is kept non-private only
because of wait_on_irq().
 
It can be removed only after auditing all the BHs.
*/
spinlock_t global_bh_lock = SPIN_LOCK_UNLOCKED;
 
static void bh_action(unsigned long nr)
{
int cpu = smp_processor_id();
 
if (!spin_trylock(&global_bh_lock))
goto resched;
 
if (!hardirq_trylock(cpu))
goto resched_unlock;
 
if (bh_base[nr])
bh_base[nr]();
 
hardirq_endlock(cpu);
spin_unlock(&global_bh_lock);
return;
 
resched_unlock:
spin_unlock(&global_bh_lock);
resched:
mark_bh(nr);
}
 
void init_bh(int nr, void (*routine)(void))
{
bh_base[nr] = routine;
mb();
}
 
void remove_bh(int nr)
{
tasklet_kill(bh_task_vec+nr);
bh_base[nr] = NULL;
}
 
void __init softirq_init()
{
int i;
 
for (i=0; i<32; i++)
tasklet_init(bh_task_vec+i, bh_action, i);
 
open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);
open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
}
 
void __run_task_queue(task_queue *list)
{
struct list_head head, *next;
unsigned long flags;
 
spin_lock_irqsave(&tqueue_lock, flags);
list_add(&head, list);
list_del_init(list);
spin_unlock_irqrestore(&tqueue_lock, flags);
 
next = head.next;
while (next != &head) {
void (*f) (void *);
struct tq_struct *p;
void *data;
 
p = list_entry(next, struct tq_struct, list);
next = next->next;
f = p->routine;
data = p->data;
wmb();
p->sync = 0;
if (f)
f(data);
}
}
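 
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * queueing work onto a task queue that run_task_queue()/__run_task_queue()
 * later drains.  Here the predefined tq_timer queue is used, which the
 * timer bottom half runs on the next tick.  The demo_* names are
 * hypothetical.
 */
static void demo_tq_fn(void *data)
{
        /* runs once, in bottom-half context */
}

static struct tq_struct demo_tq = {
        routine:        demo_tq_fn,
        data:           NULL,
};

static void demo_defer_to_timer(void)
{
        queue_task(&demo_tq, &tq_timer);        /* consumed on the next timer tick */
}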
 
static int ksoftirqd(void * __bind_cpu)
{
int bind_cpu = (int) (long) __bind_cpu;
int cpu = cpu_logical_map(bind_cpu);
 
daemonize();
current->nice = 19;
sigfillset(&current->blocked);
 
/* Migrate to the right CPU */
current->cpus_allowed = 1UL << cpu;
while (smp_processor_id() != cpu)
schedule();
 
sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu);
 
__set_current_state(TASK_INTERRUPTIBLE);
mb();
 
ksoftirqd_task(cpu) = current;
 
for (;;) {
if (!softirq_pending(cpu))
schedule();
 
__set_current_state(TASK_RUNNING);
 
while (softirq_pending(cpu)) {
do_softirq();
if (current->need_resched)
schedule();
}
 
__set_current_state(TASK_INTERRUPTIBLE);
}
}
 
static __init int spawn_ksoftirqd(void)
{
int cpu;
 
for (cpu = 0; cpu < smp_num_cpus; cpu++) {
if (kernel_thread(ksoftirqd, (void *) (long) cpu,
CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
printk("spawn_ksoftirqd() failed for cpu %d\n", cpu);
else {
while (!ksoftirqd_task(cpu_logical_map(cpu)))
yield();
}
}
 
return 0;
}
 
__initcall(spawn_ksoftirqd);
/resource.c
0,0 → 1,372
/*
* linux/kernel/resource.c
*
* Copyright (C) 1999 Linus Torvalds
* Copyright (C) 1999 Martin Mares <mj@ucw.cz>
*
* Arbitrary resource management.
*/
 
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/ioport.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/seq_file.h>
#include <asm/io.h>
 
struct resource ioport_resource = { "PCI IO", 0x0000, IO_SPACE_LIMIT, IORESOURCE_IO };
struct resource iomem_resource = { "PCI mem", 0x00000000, 0xffffffff, IORESOURCE_MEM };
 
static rwlock_t resource_lock = RW_LOCK_UNLOCKED;
 
enum { MAX_IORES_LEVEL = 5 };
 
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
struct resource *p = v;
(*pos)++;
if (p->child)
return p->child;
while (!p->sibling && p->parent)
p = p->parent;
return p->sibling;
}
 
static void *r_start(struct seq_file *m, loff_t *pos)
{
struct resource *p = m->private;
loff_t l = 0;
read_lock(&resource_lock);
for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
;
return p;
}
 
static void r_stop(struct seq_file *m, void *v)
{
read_unlock(&resource_lock);
}
 
static int r_show(struct seq_file *m, void *v)
{
struct resource *root = m->private;
struct resource *r = v, *p;
int width = root->end < 0x10000 ? 4 : 8;
int depth;
 
for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
if (p->parent == root)
break;
seq_printf(m, "%*s%0*lx-%0*lx : %s\n",
depth * 2, "",
width, r->start,
width, r->end,
r->name ? r->name : "<BAD>");
return 0;
}
 
static struct seq_operations resource_op = {
.start = r_start,
.next = r_next,
.stop = r_stop,
.show = r_show,
};
 
static int ioports_open(struct inode *inode, struct file *file)
{
int res = seq_open(file, &resource_op);
if (!res) {
struct seq_file *m = file->private_data;
m->private = &ioport_resource;
}
return res;
}
 
static int iomem_open(struct inode *inode, struct file *file)
{
int res = seq_open(file, &resource_op);
if (!res) {
struct seq_file *m = file->private_data;
m->private = &iomem_resource;
}
return res;
}
 
struct file_operations proc_ioports_operations = {
.open = ioports_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
 
struct file_operations proc_iomem_operations = {
.open = iomem_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
 
/* Return the conflict entry if you can't request it */
static struct resource * __request_resource(struct resource *root, struct resource *new)
{
unsigned long start = new->start;
unsigned long end = new->end;
struct resource *tmp, **p;
 
if (end < start)
return root;
if (start < root->start)
return root;
if (end > root->end)
return root;
p = &root->child;
for (;;) {
tmp = *p;
if (!tmp || tmp->start > end) {
new->sibling = tmp;
*p = new;
new->parent = root;
return NULL;
}
p = &tmp->sibling;
if (tmp->end < start)
continue;
return tmp;
}
}
 
static int __release_resource(struct resource *old)
{
struct resource *tmp, **p;
 
p = &old->parent->child;
for (;;) {
tmp = *p;
if (!tmp)
break;
if (tmp == old) {
*p = tmp->sibling;
old->parent = NULL;
return 0;
}
p = &tmp->sibling;
}
return -EINVAL;
}
 
int request_resource(struct resource *root, struct resource *new)
{
struct resource *conflict;
 
write_lock(&resource_lock);
conflict = __request_resource(root, new);
write_unlock(&resource_lock);
return conflict ? -EBUSY : 0;
}
 
int release_resource(struct resource *old)
{
int retval;
 
write_lock(&resource_lock);
retval = __release_resource(old);
write_unlock(&resource_lock);
return retval;
}
 
int check_resource(struct resource *root, unsigned long start, unsigned long len)
{
struct resource *conflict, tmp;
 
tmp.start = start;
tmp.end = start + len - 1;
write_lock(&resource_lock);
conflict = __request_resource(root, &tmp);
if (!conflict)
__release_resource(&tmp);
write_unlock(&resource_lock);
return conflict ? -EBUSY : 0;
}
 
/*
* Find an empty slot in the resource tree for the given range and alignment.
*/
static int find_resource(struct resource *root, struct resource *new,
unsigned long size,
unsigned long min, unsigned long max,
unsigned long align,
void (*alignf)(void *, struct resource *,
unsigned long, unsigned long),
void *alignf_data)
{
struct resource *this = root->child;
 
new->start = root->start;
for(;;) {
if (this)
new->end = this->start;
else
new->end = root->end;
if (new->start < min)
new->start = min;
if (new->end > max)
new->end = max;
new->start = (new->start + align - 1) & ~(align - 1);
if (alignf)
alignf(alignf_data, new, size, align);
if (new->start < new->end && new->end - new->start + 1 >= size) {
new->end = new->start + size - 1;
return 0;
}
if (!this)
break;
new->start = this->end + 1;
this = this->sibling;
}
return -EBUSY;
}
 
/*
* Allocate an empty slot in the resource tree for the given range and alignment.
*/
int allocate_resource(struct resource *root, struct resource *new,
unsigned long size,
unsigned long min, unsigned long max,
unsigned long align,
void (*alignf)(void *, struct resource *,
unsigned long, unsigned long),
void *alignf_data)
{
int err;
 
write_lock(&resource_lock);
err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
if (err >= 0 && __request_resource(root, new))
err = -EBUSY;
write_unlock(&resource_lock);
return err;
}
 
/*
* This is compatibility stuff for IO resources.
*
* Note how this, unlike the above, knows about
* the IO flag meanings (busy etc).
*
* Request-region creates a new busy region.
*
* Check-region returns non-zero if the area is already busy.
*
* Release-region releases a matching busy region.
*/
struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name)
{
struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL);
 
if (res) {
memset(res, 0, sizeof(*res));
res->name = name;
res->start = start;
res->end = start + n - 1;
res->flags = IORESOURCE_BUSY;
 
write_lock(&resource_lock);
 
for (;;) {
struct resource *conflict;
 
conflict = __request_resource(parent, res);
if (!conflict)
break;
if (conflict != parent) {
parent = conflict;
if (!(conflict->flags & IORESOURCE_BUSY))
continue;
}
 
/* Uhhuh, that didn't work out.. */
kfree(res);
res = NULL;
break;
}
write_unlock(&resource_lock);
}
return res;
}
 
int __check_region(struct resource *parent, unsigned long start, unsigned long n)
{
struct resource * res;
 
res = __request_region(parent, start, n, "check-region");
if (!res)
return -EBUSY;
 
release_resource(res);
kfree(res);
return 0;
}
 
void __release_region(struct resource *parent, unsigned long start, unsigned long n)
{
struct resource **p;
unsigned long end;
 
p = &parent->child;
end = start + n - 1;
 
for (;;) {
struct resource *res = *p;
 
if (!res)
break;
if (res->start <= start && res->end >= end) {
if (!(res->flags & IORESOURCE_BUSY)) {
p = &res->child;
continue;
}
if (res->start != start || res->end != end)
break;
*p = res->sibling;
kfree(res);
return;
}
p = &res->sibling;
}
printk("Trying to free nonexistent resource <%08lx-%08lx>\n", start, end);
}
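 
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * how a driver typically uses the compatibility interface above through the
 * request_region()/release_region() macros from <linux/ioport.h>.  The base
 * address 0x300 and length 8 are arbitrary example values.
 */
static int demo_claim_ports(void)
{
        if (!request_region(0x300, 8, "demo-driver"))
                return -EBUSY;          /* range already busy */

        /* ... talk to the hardware at ports 0x300..0x307 ... */

        release_region(0x300, 8);
        return 0;
}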
 
/*
* Called from init/main.c to reserve IO ports.
*/
#define MAXRESERVE 4
static int __init reserve_setup(char *str)
{
static int reserved = 0;
static struct resource reserve[MAXRESERVE];
 
for (;;) {
int io_start, io_num;
int x = reserved;
 
if (get_option (&str, &io_start) != 2)
break;
if (get_option (&str, &io_num) == 0)
break;
if (x < MAXRESERVE) {
struct resource *res = reserve + x;
res->name = "reserved";
res->start = io_start;
res->end = io_start + io_num - 1;
res->flags = IORESOURCE_BUSY;
res->child = NULL;
if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0)
reserved = x+1;
}
}
return 1;
}
 
__setup("reserve=", reserve_setup);
/uid16.c
0,0 → 1,163
/*
* Wrapper functions for 16bit uid back compatibility. All nicely tied
* together in the faint hope that we can take them out in five years' time.
*/
 
#include <linux/mm.h>
#include <linux/utsname.h>
#include <linux/mman.h>
#include <linux/smp_lock.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/prctl.h>
#include <linux/init.h>
#include <linux/highuid.h>
 
#include <asm/uaccess.h>
 
extern asmlinkage long sys_chown(const char *, uid_t,gid_t);
extern asmlinkage long sys_lchown(const char *, uid_t,gid_t);
extern asmlinkage long sys_fchown(unsigned int, uid_t,gid_t);
extern asmlinkage long sys_setregid(gid_t, gid_t);
extern asmlinkage long sys_setgid(gid_t);
extern asmlinkage long sys_setreuid(uid_t, uid_t);
extern asmlinkage long sys_setuid(uid_t);
extern asmlinkage long sys_setresuid(uid_t, uid_t, uid_t);
extern asmlinkage long sys_setresgid(gid_t, gid_t, gid_t);
extern asmlinkage long sys_setfsuid(uid_t);
extern asmlinkage long sys_setfsgid(gid_t);
asmlinkage long sys_chown16(const char * filename, old_uid_t user, old_gid_t group)
{
return sys_chown(filename, low2highuid(user), low2highgid(group));
}
 
asmlinkage long sys_lchown16(const char * filename, old_uid_t user, old_gid_t group)
{
return sys_lchown(filename, low2highuid(user), low2highgid(group));
}
 
asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group)
{
return sys_fchown(fd, low2highuid(user), low2highgid(group));
}
 
asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid)
{
return sys_setregid(low2highgid(rgid), low2highgid(egid));
}
 
asmlinkage long sys_setgid16(old_gid_t gid)
{
return sys_setgid((gid_t)gid);
}
 
asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid)
{
return sys_setreuid(low2highuid(ruid), low2highuid(euid));
}
 
asmlinkage long sys_setuid16(old_uid_t uid)
{
return sys_setuid((uid_t)uid);
}
 
asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
{
return sys_setresuid(low2highuid(ruid), low2highuid(euid),
low2highuid(suid));
}
 
asmlinkage long sys_getresuid16(old_uid_t *ruid, old_uid_t *euid, old_uid_t *suid)
{
int retval;
 
if (!(retval = put_user(high2lowuid(current->uid), ruid)) &&
!(retval = put_user(high2lowuid(current->euid), euid)))
retval = put_user(high2lowuid(current->suid), suid);
 
return retval;
}
 
asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
{
return sys_setresgid(low2highgid(rgid), low2highgid(egid),
low2highgid(sgid));
}
 
asmlinkage long sys_getresgid16(old_gid_t *rgid, old_gid_t *egid, old_gid_t *sgid)
{
int retval;
 
if (!(retval = put_user(high2lowgid(current->gid), rgid)) &&
!(retval = put_user(high2lowgid(current->egid), egid)))
retval = put_user(high2lowgid(current->sgid), sgid);
 
return retval;
}
 
asmlinkage long sys_setfsuid16(old_uid_t uid)
{
return sys_setfsuid((uid_t)uid);
}
 
asmlinkage long sys_setfsgid16(old_gid_t gid)
{
return sys_setfsgid((gid_t)gid);
}
 
asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t *grouplist)
{
old_gid_t groups[NGROUPS];
int i,j;
 
if (gidsetsize < 0)
return -EINVAL;
i = current->ngroups;
if (gidsetsize) {
if (i > gidsetsize)
return -EINVAL;
for(j=0;j<i;j++)
groups[j] = current->groups[j];
if (copy_to_user(grouplist, groups, sizeof(old_gid_t)*i))
return -EFAULT;
}
return i;
}
 
asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t *grouplist)
{
old_gid_t groups[NGROUPS];
int i;
 
if (!capable(CAP_SETGID))
return -EPERM;
if ((unsigned) gidsetsize > NGROUPS)
return -EINVAL;
if (copy_from_user(groups, grouplist, gidsetsize * sizeof(old_gid_t)))
return -EFAULT;
for (i = 0 ; i < gidsetsize ; i++)
current->groups[i] = (gid_t)groups[i];
current->ngroups = gidsetsize;
return 0;
}
 
asmlinkage long sys_getuid16(void)
{
return high2lowuid(current->uid);
}
 
asmlinkage long sys_geteuid16(void)
{
return high2lowuid(current->euid);
}
 
asmlinkage long sys_getgid16(void)
{
return high2lowgid(current->gid);
}
 
asmlinkage long sys_getegid16(void)
{
return high2lowgid(current->egid);
}
/itimer.c
0,0 → 1,170
/*
* linux/kernel/itimer.c
*
* Copyright (C) 1992 Darren Senn
*/
 
/* These are all the functions necessary to implement itimers */
 
#include <linux/mm.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
 
#include <asm/uaccess.h>
 
/*
* change timeval to jiffies, trying to avoid the
* most obvious overflows..
*
* The tv_*sec values are signed, but nothing seems to
* indicate whether we really should use them as signed values
* when doing itimers. POSIX doesn't mention this (but if
* alarm() uses itimers without checking, we have to use unsigned
* arithmetic).
*/
static unsigned long tvtojiffies(struct timeval *value)
{
unsigned long sec = (unsigned) value->tv_sec;
unsigned long usec = (unsigned) value->tv_usec;
 
if (sec > (ULONG_MAX / HZ))
return ULONG_MAX;
usec += 1000000 / HZ - 1;
usec /= 1000000 / HZ;
return HZ*sec+usec;
}
 
static void jiffiestotv(unsigned long jiffies, struct timeval *value)
{
value->tv_usec = (jiffies % HZ) * (1000000 / HZ);
value->tv_sec = jiffies / HZ;
}
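 
/*
 * Worked example (editor's note): with HZ == 100 the conversion above gives
 *
 *      tvtojiffies({ tv_sec = 1, tv_usec = 5000 }):
 *              usec = 5000 + (10000 - 1) = 14999;  usec /= 10000  ->  1
 *              result = 100 * 1 + 1 = 101 jiffies
 *
 * i.e. a partial tick is rounded up, so a requested interval is never
 * shortened.  Going back, jiffiestotv(101) yields { tv_sec = 1, tv_usec = 10000 }.
 */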
 
int do_getitimer(int which, struct itimerval *value)
{
register unsigned long val, interval;
 
switch (which) {
case ITIMER_REAL:
interval = current->it_real_incr;
val = 0;
/*
* FIXME! This needs to be atomic, in case the kernel timer fires!
*/
if (timer_pending(&current->real_timer)) {
val = current->real_timer.expires - jiffies;
 
/* look out for negative/zero itimer.. */
if ((long) val <= 0)
val = 1;
}
break;
case ITIMER_VIRTUAL:
val = current->it_virt_value;
interval = current->it_virt_incr;
break;
case ITIMER_PROF:
val = current->it_prof_value;
interval = current->it_prof_incr;
break;
default:
return(-EINVAL);
}
jiffiestotv(val, &value->it_value);
jiffiestotv(interval, &value->it_interval);
return 0;
}
 
/* SMP: Only we modify our itimer values. */
asmlinkage long sys_getitimer(int which, struct itimerval *value)
{
int error = -EFAULT;
struct itimerval get_buffer;
 
if (value) {
error = do_getitimer(which, &get_buffer);
if (!error &&
copy_to_user(value, &get_buffer, sizeof(get_buffer)))
error = -EFAULT;
}
return error;
}
 
void it_real_fn(unsigned long __data)
{
struct task_struct * p = (struct task_struct *) __data;
unsigned long interval;
 
send_sig(SIGALRM, p, 1);
interval = p->it_real_incr;
if (interval) {
if (interval > (unsigned long) LONG_MAX)
interval = LONG_MAX;
p->real_timer.expires = jiffies + interval;
add_timer(&p->real_timer);
}
}
 
int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
{
register unsigned long i, j;
int k;
 
i = tvtojiffies(&value->it_interval);
j = tvtojiffies(&value->it_value);
if (ovalue && (k = do_getitimer(which, ovalue)) < 0)
return k;
switch (which) {
case ITIMER_REAL:
del_timer_sync(&current->real_timer);
current->it_real_value = j;
current->it_real_incr = i;
if (!j)
break;
if (j > (unsigned long) LONG_MAX)
j = LONG_MAX;
i = j + jiffies;
current->real_timer.expires = i;
add_timer(&current->real_timer);
break;
case ITIMER_VIRTUAL:
if (j)
j++;
current->it_virt_value = j;
current->it_virt_incr = i;
break;
case ITIMER_PROF:
if (j)
j++;
current->it_prof_value = j;
current->it_prof_incr = i;
break;
default:
return -EINVAL;
}
return 0;
}
 
/* SMP: Again, only we play with our itimers, and signals are SMP safe
* now so that is not an issue at all anymore.
*/
asmlinkage long sys_setitimer(int which, struct itimerval *value,
struct itimerval *ovalue)
{
struct itimerval set_buffer, get_buffer;
int error;
 
if (value) {
if(copy_from_user(&set_buffer, value, sizeof(set_buffer)))
return -EFAULT;
} else
memset((char *) &set_buffer, 0, sizeof(set_buffer));
 
error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : 0);
if (error || !ovalue)
return error;
 
if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer)))
return -EFAULT;
return 0;
}
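 
/*
 * Userspace illustration (editor's addition, not part of this file): arming
 * a repeating 250 ms real-time interval timer through the syscall above, so
 * that SIGALRM is delivered every quarter second (handler below is a
 * user-defined SIGALRM handler):
 *
 *      #include <sys/time.h>
 *      #include <signal.h>
 *
 *      struct itimerval it;
 *      signal(SIGALRM, handler);
 *      it.it_interval.tv_sec = 0;  it.it_interval.tv_usec = 250000;
 *      it.it_value = it.it_interval;
 *      setitimer(ITIMER_REAL, &it, NULL);
 */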
/kmod.c
0,0 → 1,376
/*
kmod, the new module loader (replaces kerneld)
Kirk Petersen
 
Reorganized not to be a daemon by Adam Richter, with guidance
from Greg Zornetzer.
 
Modified to avoid chroot and file sharing problems.
Mikael Pettersson
 
Limit the concurrent number of kmod modprobes to catch loops from
"modprobe needs a service that is in a module".
Keith Owens <kaos@ocs.com.au> December 1999
 
Unblock all signals when we exec a usermode process.
Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000
*/
 
#define __KERNEL_SYSCALLS__
 
#include <linux/config.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/unistd.h>
#include <linux/kmod.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/namespace.h>
#include <linux/completion.h>
 
#include <asm/uaccess.h>
 
extern int max_threads;
 
static inline void
use_init_fs_context(void)
{
struct fs_struct *our_fs, *init_fs;
struct dentry *root, *pwd;
struct vfsmount *rootmnt, *pwdmnt;
struct namespace *our_ns, *init_ns;
 
/*
* Make modprobe's fs context be a copy of init's.
*
* We cannot use the user's fs context, because it
* may have a different root than init.
* Since init was created with CLONE_FS, we can grab
* its fs context from "init_task".
*
* The fs context has to be a copy. If it is shared
* with init, then any chdir() call in modprobe will
* also affect init and the other threads sharing
* init_task's fs context.
*
* We created the exec_modprobe thread without CLONE_FS,
* so we can update the fields in our fs context freely.
*/
 
init_fs = init_task.fs;
init_ns = init_task.namespace;
get_namespace(init_ns);
our_ns = current->namespace;
current->namespace = init_ns;
put_namespace(our_ns);
read_lock(&init_fs->lock);
rootmnt = mntget(init_fs->rootmnt);
root = dget(init_fs->root);
pwdmnt = mntget(init_fs->pwdmnt);
pwd = dget(init_fs->pwd);
read_unlock(&init_fs->lock);
 
/* FIXME - unsafe ->fs access */
our_fs = current->fs;
our_fs->umask = init_fs->umask;
set_fs_root(our_fs, rootmnt, root);
set_fs_pwd(our_fs, pwdmnt, pwd);
write_lock(&our_fs->lock);
if (our_fs->altroot) {
struct vfsmount *mnt = our_fs->altrootmnt;
struct dentry *dentry = our_fs->altroot;
our_fs->altrootmnt = NULL;
our_fs->altroot = NULL;
write_unlock(&our_fs->lock);
dput(dentry);
mntput(mnt);
} else
write_unlock(&our_fs->lock);
dput(root);
mntput(rootmnt);
dput(pwd);
mntput(pwdmnt);
}
 
int exec_usermodehelper(char *program_path, char *argv[], char *envp[])
{
int i;
struct task_struct *curtask = current;
 
curtask->session = 1;
curtask->pgrp = 1;
 
use_init_fs_context();
 
/* Prevent parent user process from sending signals to child.
Otherwise, if the modprobe program does not exist, it might
be possible to get a user defined signal handler to execute
as the super user right after the execve fails if you time
the signal just right.
*/
spin_lock_irq(&curtask->sigmask_lock);
sigemptyset(&curtask->blocked);
flush_signals(curtask);
flush_signal_handlers(curtask);
recalc_sigpending(curtask);
spin_unlock_irq(&curtask->sigmask_lock);
 
for (i = 0; i < curtask->files->max_fds; i++ ) {
if (curtask->files->fd[i]) close(i);
}
 
switch_uid(INIT_USER);
 
/* Give kmod all effective privileges.. */
curtask->euid = curtask->uid = curtask->suid = curtask->fsuid = 0;
curtask->egid = curtask->gid = curtask->sgid = curtask->fsgid = 0;
 
curtask->ngroups = 0;
 
cap_set_full(curtask->cap_effective);
 
/* Allow execve args to be in kernel space. */
set_fs(KERNEL_DS);
 
/* Go, go, go... */
if (execve(program_path, argv, envp) < 0)
return -errno;
return 0;
}
 
#ifdef CONFIG_KMOD
 
/*
modprobe_path is set via /proc/sys.
*/
char modprobe_path[256] = "/sbin/modprobe";
 
static int exec_modprobe(void * module_name)
{
static char * envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
char *argv[] = { modprobe_path, "-s", "-k", "--", (char*)module_name, NULL };
int ret;
 
ret = exec_usermodehelper(modprobe_path, argv, envp);
if (ret) {
printk(KERN_ERR
"kmod: failed to exec %s -s -k %s, errno = %d\n",
modprobe_path, (char*) module_name, errno);
}
return ret;
}
 
/**
* request_module - try to load a kernel module
* @module_name: Name of module
*
* Load a module using the user mode module loader. The function returns
* zero on success or a negative errno code on failure. Note that a
* successful module load does not mean the module did not then unload
* and exit on an error of its own. Callers must check that the service
* they requested is now available, rather than blindly invoking it.
*
* If module auto-loading support is disabled then this function
* becomes a no-operation.
*/
int request_module(const char * module_name)
{
pid_t pid;
int waitpid_result;
sigset_t tmpsig;
int i;
static atomic_t kmod_concurrent = ATOMIC_INIT(0);
#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
static int kmod_loop_msg;
 
/* Don't allow request_module() before the root fs is mounted! */
if ( ! current->fs->root ) {
printk(KERN_ERR "request_module[%s]: Root fs not mounted\n",
module_name);
return -EPERM;
}
 
/* If modprobe needs a service that is in a module, we get a recursive
* loop. Limit the number of running kmod threads to max_threads/2 or
* MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
* would be to walk up the parents of this process, counting how many times
* kmod was invoked. That would mean accessing the internals of the
* process tables to get the command line; proc_pid_cmdline is static,
* and it is not worth changing the proc code just to handle this case.
* KAO.
*/
i = max_threads/2;
if (i > MAX_KMOD_CONCURRENT)
i = MAX_KMOD_CONCURRENT;
atomic_inc(&kmod_concurrent);
if (atomic_read(&kmod_concurrent) > i) {
if (kmod_loop_msg++ < 5)
printk(KERN_ERR
"kmod: runaway modprobe loop assumed and stopped\n");
atomic_dec(&kmod_concurrent);
return -ENOMEM;
}
 
pid = kernel_thread(exec_modprobe, (void*) module_name, 0);
if (pid < 0) {
printk(KERN_ERR "request_module[%s]: fork failed, errno %d\n", module_name, -pid);
atomic_dec(&kmod_concurrent);
return pid;
}
 
/* Block everything but SIGKILL/SIGSTOP */
spin_lock_irq(&current->sigmask_lock);
tmpsig = current->blocked;
siginitsetinv(&current->blocked, sigmask(SIGKILL) | sigmask(SIGSTOP));
recalc_sigpending(current);
spin_unlock_irq(&current->sigmask_lock);
 
waitpid_result = waitpid(pid, NULL, __WCLONE);
atomic_dec(&kmod_concurrent);
 
/* Allow signals again.. */
spin_lock_irq(&current->sigmask_lock);
current->blocked = tmpsig;
recalc_sigpending(current);
spin_unlock_irq(&current->sigmask_lock);
 
if (waitpid_result != pid) {
printk(KERN_ERR "request_module[%s]: waitpid(%d,...) failed, errno %d\n",
module_name, pid, -waitpid_result);
}
return 0;
}
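 
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * typical request_module() use, e.g. from a character-device open() path
 * that finds no driver registered for the requested major number.
 * demo_load_char_driver is a hypothetical helper.
 */
static int demo_load_char_driver(int major)
{
        char name[32];

        sprintf(name, "char-major-%d", major);
        if (request_module(name) != 0)
                return -ENODEV;         /* modprobe could not be run */
        return 0;                       /* caller must still re-check the service */
}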
#endif /* CONFIG_KMOD */
 
 
#ifdef CONFIG_HOTPLUG
/*
hotplug_path is set via /proc/sys.
It is invoked by hotplug-aware bus drivers,
via exec_usermodehelper and some thread-spawner.
 
argv [0] = hotplug_path;
argv [1] = "usb", "scsi", "pci", "network", etc;
... plus optional type-specific parameters
argv [n] = 0;
 
envp [*] = HOME, PATH; optional type-specific parameters
 
A hotplug bus should invoke this for device add/remove
events. The command is expected to load drivers when
necessary, and may perform additional system setup.
*/
char hotplug_path[256] = "/sbin/hotplug";
 
EXPORT_SYMBOL(hotplug_path);
 
#endif /* CONFIG_HOTPLUG */
 
struct subprocess_info {
struct completion *complete;
char *path;
char **argv;
char **envp;
pid_t retval;
};
 
/*
* This is the task which runs the usermode application
*/
static int ____call_usermodehelper(void *data)
{
struct subprocess_info *sub_info = data;
int retval;
 
retval = -EPERM;
if (current->fs->root)
retval = exec_usermodehelper(sub_info->path, sub_info->argv, sub_info->envp);
 
/* Exec failed? */
sub_info->retval = (pid_t)retval;
do_exit(0);
}
 
/*
* This is run by keventd.
*/
static void __call_usermodehelper(void *data)
{
struct subprocess_info *sub_info = data;
pid_t pid;
 
/*
* CLONE_VFORK: wait until the usermode helper has execve'd successfully.
* We need the data structures to stay around until that is done.
*/
pid = kernel_thread(____call_usermodehelper, sub_info, CLONE_VFORK | SIGCHLD);
if (pid < 0)
sub_info->retval = pid;
complete(sub_info->complete);
}
 
/**
* call_usermodehelper - start a usermode application
* @path: pathname for the application
* @argv: null-terminated argument list
* @envp: null-terminated environment list
*
* Runs a user-space application. The application is started asynchronously; it
* runs as a child of keventd with full root capabilities, and keventd silently
* reaps the child when it exits.
*
* Must be called from process context. Returns zero on success, else a negative
* error code.
*/
int call_usermodehelper(char *path, char **argv, char **envp)
{
DECLARE_COMPLETION(work);
struct subprocess_info sub_info = {
complete: &work,
path: path,
argv: argv,
envp: envp,
retval: 0,
};
struct tq_struct tqs = {
routine: __call_usermodehelper,
data: &sub_info,
};
 
if (path[0] == '\0')
goto out;
 
if (current_is_keventd()) {
/* We can't wait on keventd! */
__call_usermodehelper(&sub_info);
} else {
schedule_task(&tqs);
wait_for_completion(&work);
}
out:
return sub_info.retval;
}
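 
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * starting a helper binary via call_usermodehelper().  The path
 * /sbin/demo-helper and its argument are hypothetical; hotplug_path above
 * is invoked in exactly this fashion by hotplug-aware bus drivers.
 */
static int demo_run_helper(char *device_name)
{
        char *argv[3];
        char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

        argv[0] = "/sbin/demo-helper";
        argv[1] = device_name;
        argv[2] = NULL;

        return call_usermodehelper(argv[0], argv, envp);        /* process context; may sleep */
}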
 
/*
* This is for the serialisation of device probe() functions
* against device open() functions
*/
static DECLARE_MUTEX(dev_probe_sem);
 
void dev_probe_lock(void)
{
down(&dev_probe_sem);
}
 
void dev_probe_unlock(void)
{
up(&dev_probe_sem);
}
 
EXPORT_SYMBOL(exec_usermodehelper);
EXPORT_SYMBOL(call_usermodehelper);
 
#ifdef CONFIG_KMOD
EXPORT_SYMBOL(request_module);
#endif
 
/panic.c
0,0 → 1,154
/*
* linux/kernel/panic.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
 
/*
* This function is used throughout the kernel (including mm and fs)
* to indicate a major problem.
*/
#include <linux/config.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/sysrq.h>
#include <linux/interrupt.h>
#include <linux/console.h>
 
asmlinkage void sys_sync(void); /* it's really int */
 
int panic_timeout;
 
struct notifier_block *panic_notifier_list;
 
static int __init panic_setup(char *str)
{
panic_timeout = simple_strtoul(str, NULL, 0);
return 1;
}
 
__setup("panic=", panic_setup);
 
int machine_paniced;
 
/**
* panic - halt the system
* @fmt: The text string to print
*
* Display a message, then perform cleanups. Functions in the panic
* notifier list are called after the filesystem cache is flushed (when possible).
*
* This function never returns.
*/
NORET_TYPE void panic(const char * fmt, ...)
{
static char buf[1024];
va_list args;
#if defined(CONFIG_ARCH_S390)
unsigned long caller = (unsigned long) __builtin_return_address(0);
#endif
 
#ifdef CONFIG_VT
disable_console_blank();
#endif
machine_paniced = 1;
bust_spinlocks(1);
va_start(args, fmt);
vsprintf(buf, fmt, args);
va_end(args);
printk(KERN_EMERG "Kernel panic: %s\n",buf);
if (in_interrupt())
printk(KERN_EMERG "In interrupt handler - not syncing\n");
else if (!current->pid)
printk(KERN_EMERG "In idle task - not syncing\n");
else
sys_sync();
bust_spinlocks(0);
 
#ifdef CONFIG_SMP
smp_send_stop();
#endif
 
notifier_call_chain(&panic_notifier_list, 0, NULL);
 
if (panic_timeout > 0)
{
/*
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked..
*/
printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout);
mdelay(panic_timeout*1000);
/*
* Should we run the reboot notifier? For the moment I'm
* choosing not to. It might crash, be corrupt or do
* more harm than good for other reasons.
*/
machine_restart(NULL);
}
#ifdef __sparc__
{
extern int stop_a_enabled;
/* Make sure the user can actually press L1-A */
stop_a_enabled = 1;
printk("Press L1-A to return to the boot prom\n");
}
#endif
#if defined(CONFIG_ARCH_S390)
disabled_wait(caller);
#endif
sti();
for(;;) {
#if defined(CONFIG_X86) && defined(CONFIG_VT)
extern void panic_blink(void);
panic_blink();
#endif
CHECK_EMERGENCY_SYNC
}
}
 
/**
* print_tainted - return a string to represent the kernel taint state.
*
* The string is overwritten by the next call to print_tainted().
*/
const char *print_tainted()
{
static char buf[20];
if (tainted) {
snprintf(buf, sizeof(buf), "Tainted: %c%c",
tainted & 1 ? 'P' : 'G',
tainted & 2 ? 'F' : ' ');
}
else
snprintf(buf, sizeof(buf), "Not tainted");
return(buf);
}
 
int tainted = 0;
 
/*
* A BUG() call in an inline function in a header should be avoided,
* because it can seriously bloat the kernel. So here we have
* helper functions.
* We lose the BUG()-time file-and-line info this way, but it's
* usually not very useful from an inline anyway. The backtrace
* tells us what we want to know.
*/
 
void __out_of_line_bug(int line)
{
printk("kernel BUG in header file at line %d\n", line);
 
BUG();
 
/* Satisfy __attribute__((noreturn)) */
for ( ; ; )
;
}
/info.c
0,0 → 1,79
/*
* linux/kernel/info.c
*
* Copyright (C) 1992 Darren Senn
*/
 
/* This implements the sysinfo() system call */
 
#include <linux/mm.h>
#include <linux/unistd.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
 
#include <asm/uaccess.h>
 
asmlinkage long sys_sysinfo(struct sysinfo *info)
{
struct sysinfo val;
 
memset((char *)&val, 0, sizeof(struct sysinfo));
 
cli();
val.uptime = jiffies / HZ;
 
val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
 
val.procs = nr_threads-1;
sti();
 
si_meminfo(&val);
si_swapinfo(&val);
 
{
unsigned long mem_total, sav_total;
unsigned int mem_unit, bitcount;
 
/* If the sum of all the available memory (i.e. ram + swap)
* is less than can be stored in a 32 bit unsigned long then
* we can be binary compatible with 2.2.x kernels. If not,
* well, in that case 2.2.x was broken anyways...
*
* -Erik Andersen <andersee@debian.org> */
 
mem_total = val.totalram + val.totalswap;
if (mem_total < val.totalram || mem_total < val.totalswap)
goto out;
bitcount = 0;
mem_unit = val.mem_unit;
while (mem_unit > 1) {
bitcount++;
mem_unit >>= 1;
sav_total = mem_total;
mem_total <<= 1;
if (mem_total < sav_total)
goto out;
}
 
/* If mem_total did not overflow, multiply all memory values by
* val.mem_unit and set it to 1. This leaves things compatible
* with 2.2.x, and also retains compatibility with earlier 2.4.x
* kernels... */
 
val.mem_unit = 1;
val.totalram <<= bitcount;
val.freeram <<= bitcount;
val.sharedram <<= bitcount;
val.bufferram <<= bitcount;
val.totalswap <<= bitcount;
val.freeswap <<= bitcount;
val.totalhigh <<= bitcount;
val.freehigh <<= bitcount;
}
out:
if (copy_to_user(info, &val, sizeof(struct sysinfo)))
return -EFAULT;
return 0;
}
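 
/*
 * Userspace illustration (editor's addition, not part of this file): the
 * structure filled in above is consumed like this:
 *
 *      #include <stdio.h>
 *      #include <sys/sysinfo.h>
 *
 *      struct sysinfo si;
 *      if (sysinfo(&si) == 0)
 *              printf("up %ld s, free ram %lu units of %u bytes\n",
 *                     si.uptime, si.freeram, si.mem_unit);
 */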
/Makefile
0,0 → 1,34
#
# Makefile for the linux kernel.
#
# Note! Dependencies are done automagically by 'make dep', which also
# removes any old dependencies. DON'T put your own dependencies here
# unless it's something special (ie not a .c file).
#
# Note 2! The CFLAGS definitions are now in the main makefile...
 
O_TARGET := kernel.o
 
export-objs = signal.o sys.o kmod.o context.o ksyms.o pm.o exec_domain.o printk.o
 
obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \
module.o exit.o itimer.o info.o time.o softirq.o resource.o \
sysctl.o acct.o capability.o ptrace.o timer.o user.o \
signal.o sys.o kmod.o context.o
 
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += ksyms.o
obj-$(CONFIG_PM) += pm.o
 
ifneq ($(CONFIG_IA64),y)
ifneq ($(CONFIG_OR32),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
# needed for x86 only. Why this used to be enabled for all architectures is beyond
# me. I suspect most platforms don't need this, but until we know that for sure
# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
# to get a correct value for the wait-channel (WCHAN in ps). --davidm
CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
endif
endif
 
include $(TOPDIR)/Rules.make
