OpenCores
URL: https://opencores.org/ocsvn/or1k_soc_on_altera_embedded_dev_kit/or1k_soc_on_altera_embedded_dev_kit/trunk

Subversion repository: or1k_soc_on_altera_embedded_dev_kit
File: trunk/linux-2.6/linux-2.6.24/kernel/cpuset.c (rev 17)
/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
 *  Copyright (C) 2006 Google, Inc
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/prio_heap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>

#include <asm/uaccess.h>
#include <asm/atomic.h>
#include <linux/mutex.h>
#include <linux/kfifo.h>
 
/*
 * Tracks how many cpusets are currently defined in system.
 * When there is only one cpuset (the root cpuset) we can
 * short circuit some hooks.
 */
int number_of_cpusets __read_mostly;

/* Retrieve the cpuset from a cgroup */
struct cgroup_subsys cpuset_subsys;
struct cpuset;

/* See "Frequency meter" comments, below. */

struct fmeter {
        int cnt;                /* unprocessed events count */
        int val;                /* most recent output value */
        time_t time;            /* clock (secs) when val computed */
        spinlock_t lock;        /* guards read or write of above */
};

struct cpuset {
        struct cgroup_subsys_state css;

        unsigned long flags;            /* "unsigned long" so bitops work */
        cpumask_t cpus_allowed;         /* CPUs allowed to tasks in cpuset */
        nodemask_t mems_allowed;        /* Memory Nodes allowed to tasks */

        struct cpuset *parent;          /* my parent */

        /*
         * Copy of global cpuset_mems_generation as of the most
         * recent time this cpuset changed its mems_allowed.
         */
        int mems_generation;

        struct fmeter fmeter;           /* memory_pressure filter */

        /* partition number for rebuild_sched_domains() */
        int pn;
};

/* Retrieve the cpuset for a cgroup */
static inline struct cpuset *cgroup_cs(struct cgroup *cont)
{
        return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
                            struct cpuset, css);
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
        return container_of(task_subsys_state(task, cpuset_subsys_id),
                            struct cpuset, css);
}


/* bits in struct cpuset flags field */
typedef enum {
        CS_CPU_EXCLUSIVE,
        CS_MEM_EXCLUSIVE,
        CS_MEMORY_MIGRATE,
        CS_SCHED_LOAD_BALANCE,
        CS_SPREAD_PAGE,
        CS_SPREAD_SLAB,
} cpuset_flagbits_t;
 
/* convenient tests for these bits */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
        return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
        return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
        return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
        return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
        return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
        return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

/*
 * Increment this integer every time any cpuset changes its
 * mems_allowed value.  Users of cpusets can track this generation
 * number, and avoid having to lock and reload mems_allowed unless
 * the cpuset they're using changes generation.
 *
 * A single, global generation is needed because attach_task() could
 * reattach a task to a different cpuset, which must not have its
 * generation numbers aliased with those of that tasks previous cpuset.
 *
 * Generations are needed for mems_allowed because one task cannot
 * modify another's memory placement.  So we must enable every task,
 * on every visit to __alloc_pages(), to efficiently check whether
 * its current->cpuset->mems_allowed has changed, requiring an update
 * of its current->mems_allowed.
 *
 * Since cpuset_mems_generation is guarded by manage_mutex,
 * there is no need to mark it atomic.
 */
static int cpuset_mems_generation;

static struct cpuset top_cpuset = {
        .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
        .cpus_allowed = CPU_MASK_ALL,
        .mems_allowed = NODE_MASK_ALL,
};
 
/*
 * We have two global cpuset mutexes below.  They can nest.
 * It is ok to first take manage_mutex, then nest callback_mutex.  We also
 * require taking task_lock() when dereferencing a tasks cpuset pointer.
 * See "The task_lock() exception", at the end of this comment.
 *
 * A task must hold both mutexes to modify cpusets.  If a task
 * holds manage_mutex, then it blocks others wanting that mutex,
 * ensuring that it is the only task able to also acquire callback_mutex
 * and be able to modify cpusets.  It can perform various checks on
 * the cpuset structure first, knowing nothing will change.  It can
 * also allocate memory while just holding manage_mutex.  While it is
 * performing these checks, various callback routines can briefly
 * acquire callback_mutex to query cpusets.  Once it is ready to make
 * the changes, it takes callback_mutex, blocking everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_mutex, as that would risk double tripping on callback_mutex
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_mutex, then it has read-only
 * access to cpusets.
 *
 * The task_struct fields mems_allowed and mems_generation may only
 * be accessed in the context of that task, so require no locks.
 *
 * Any task can increment and decrement the count field without lock.
 * So in general, code holding manage_mutex or callback_mutex can't rely
 * on the count field not changing.  However, if the count goes to
 * zero, then only attach_task(), which holds both mutexes, can
 * increment it again.  Because a count of zero means that no tasks
 * are currently attached, therefore there is no way a task attached
 * to that cpuset can fork (the other way to increment the count).
 * So code holding manage_mutex or callback_mutex can safely assume that
 * if the count is zero, it will stay zero.  Similarly, if a task
 * holds manage_mutex or callback_mutex on a cpuset with zero count, it
 * knows that the cpuset won't be removed, as cpuset_rmdir() needs
 * both of those mutexes.
 *
 * The cpuset_common_file_write handler for operations that modify
 * the cpuset hierarchy holds manage_mutex across the entire operation,
 * single threading all such cpuset modifications across the system.
 *
 * The cpuset_common_file_read() handlers only hold callback_mutex across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
 * (usually) take either mutex.  These are the two most performance
 * critical pieces of code here.  The exception occurs on cpuset_exit(),
 * when a task in a notify_on_release cpuset exits.  Then manage_mutex
 * is taken, and if the cpuset count is zero, a usermode call made
 * to /sbin/cpuset_release_agent with the name of the cpuset (path
 * relative to the root of cpuset file system) as the argument.
 *
 * A cpuset can only be deleted if both its 'count' of using tasks
 * is zero, and its list of 'children' cpusets is empty.  Since all
 * tasks in the system use _some_ cpuset, and since there is always at
 * least one task in the system (init), therefore, top_cpuset
 * always has either children cpusets and/or using tasks.  So we don't
 * need a special hack to ensure that top_cpuset cannot be deleted.
 *
 * The above "Tale of Two Semaphores" would be complete, but for:
 *
 *      The task_lock() exception
 *
 * The need for this exception arises from the action of attach_task(),
 * which overwrites one tasks cpuset pointer with another.  It does
 * so using both mutexes, however there are several performance
 * critical places that need to reference task->cpuset without the
 * expense of grabbing a system global mutex.  Therefore except as
 * noted below, when dereferencing or, as in attach_task(), modifying
 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
 * (task->alloc_lock) already in the task_struct routinely used for
 * such matters.
 *
 * P.S.  One more locking exception.  RCU is used to guard the
 * update of a tasks cpuset pointer by attach_task() and the
 * access of task->cpuset->mems_generation via that pointer in
 * the routine cpuset_update_task_memory_state().
 */
 
static DEFINE_MUTEX(callback_mutex);

/* This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead */
static int cpuset_get_sb(struct file_system_type *fs_type,
                         int flags, const char *unused_dev_name,
                         void *data, struct vfsmount *mnt)
{
        struct file_system_type *cgroup_fs = get_fs_type("cgroup");
        int ret = -ENODEV;
        if (cgroup_fs) {
                char mountopts[] =
                        "cpuset,noprefix,"
                        "release_agent=/sbin/cpuset_release_agent";
                ret = cgroup_fs->get_sb(cgroup_fs, flags,
                                           unused_dev_name, mountopts, mnt);
                put_filesystem(cgroup_fs);
        }
        return ret;
}

static struct file_system_type cpuset_fs_type = {
        .name = "cpuset",
        .get_sb = cpuset_get_sb,
};
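
/*
 * Editorial note (not part of the original source): with the wrapper
 * above, a legacy command such as
 *
 *      mount -t cpuset none /dev/cpuset
 *
 * behaves as if the user had asked for
 *
 *      mount -t cgroup -o cpuset,noprefix,release_agent=/sbin/cpuset_release_agent none /dev/cpuset
 *
 * i.e. the "cpuset" mount is transparently forwarded to the cgroup
 * filesystem with the options hard-coded in cpuset_get_sb().  The
 * mount point /dev/cpuset here is only a conventional example.
 */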

/*
 * Return in *pmask the portion of a cpuset's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.  If we get
 * all the way to the top and still haven't found any online cpus,
 * return cpu_online_map.  Or if passed a NULL cs from an exit'ing
 * task, return cpu_online_map.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_map.
 *
 * Call with callback_mutex held.
 */

static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
{
        while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map))
                cs = cs->parent;
        if (cs)
                cpus_and(*pmask, cs->cpus_allowed, cpu_online_map);
        else
                *pmask = cpu_online_map;
        BUG_ON(!cpus_intersects(*pmask, cpu_online_map));
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  If we get all the way to the top and still haven't
 * found any online mems, return node_states[N_HIGH_MEMORY].
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_HIGH_MEMORY].
 *
 * Call with callback_mutex held.
 */

static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
        while (cs && !nodes_intersects(cs->mems_allowed,
                                        node_states[N_HIGH_MEMORY]))
                cs = cs->parent;
        if (cs)
                nodes_and(*pmask, cs->mems_allowed,
                                        node_states[N_HIGH_MEMORY]);
        else
                *pmask = node_states[N_HIGH_MEMORY];
        BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
}
 
/**
 * cpuset_update_task_memory_state - update task memory placement
 *
 * If the current tasks cpusets mems_allowed changed behind our
 * backs, update current->mems_allowed, mems_generation and task NUMA
 * mempolicy to the new value.
 *
 * Task mempolicy is updated by rebinding it relative to the
 * current->cpuset if a task has its memory placement changed.
 * Do not call this routine if in_interrupt().
 *
 * Call without callback_mutex or task_lock() held.  May be
 * called with or without manage_mutex held.  Thanks in part to
 * 'the_top_cpuset_hack', the tasks cpuset pointer will never
 * be NULL.  This routine also might acquire callback_mutex and
 * current->mm->mmap_sem during call.
 *
 * Reading current->cpuset->mems_generation doesn't need task_lock
 * to guard the current->cpuset dereference, because it is guarded
 * from concurrent freeing of current->cpuset by attach_task(),
 * using RCU.
 *
 * The rcu_dereference() is technically probably not needed,
 * as I don't actually mind if I see a new cpuset pointer but
 * an old value of mems_generation.  However this really only
 * matters on alpha systems using cpusets heavily.  If I dropped
 * that rcu_dereference(), it would save them a memory barrier.
 * For all other arch's, rcu_dereference is a no-op anyway, and for
 * alpha systems not using cpusets, another planned optimization,
 * avoiding the rcu critical section for tasks in the root cpuset
 * which is statically allocated, so can't vanish, will make this
 * irrelevant.  Better to use RCU as intended, than to engage in
 * some cute trick to save a memory barrier that is impossible to
 * test, for alpha systems using cpusets heavily, which might not
 * even exist.
 *
 * This routine is needed to update the per-task mems_allowed data,
 * within the tasks context, when it is trying to allocate memory
 * (in various mm/mempolicy.c routines) and notices that some other
 * task has been modifying its cpuset.
 */

void cpuset_update_task_memory_state(void)
{
        int my_cpusets_mem_gen;
        struct task_struct *tsk = current;
        struct cpuset *cs;

        if (task_cs(tsk) == &top_cpuset) {
                /* Don't need rcu for top_cpuset.  It's never freed. */
                my_cpusets_mem_gen = top_cpuset.mems_generation;
        } else {
                rcu_read_lock();
                my_cpusets_mem_gen = task_cs(current)->mems_generation;
                rcu_read_unlock();
        }

        if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
                mutex_lock(&callback_mutex);
                task_lock(tsk);
                cs = task_cs(tsk); /* Maybe changed when task not locked */
                guarantee_online_mems(cs, &tsk->mems_allowed);
                tsk->cpuset_mems_generation = cs->mems_generation;
                if (is_spread_page(cs))
                        tsk->flags |= PF_SPREAD_PAGE;
                else
                        tsk->flags &= ~PF_SPREAD_PAGE;
                if (is_spread_slab(cs))
                        tsk->flags |= PF_SPREAD_SLAB;
                else
                        tsk->flags &= ~PF_SPREAD_SLAB;
                task_unlock(tsk);
                mutex_unlock(&callback_mutex);
                mpol_rebind_task(tsk, &tsk->mems_allowed);
        }
}
 
/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding manage_mutex.
 */

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
        return  cpus_subset(p->cpus_allowed, q->cpus_allowed) &&
                nodes_subset(p->mems_allowed, q->mems_allowed) &&
                is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
                is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *                     follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * manage_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */

static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
{
        struct cgroup *cont;
        struct cpuset *c, *par;

        /* Each of our child cpusets must be a subset of us */
        list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
                if (!is_cpuset_subset(cgroup_cs(cont), trial))
                        return -EBUSY;
        }

        /* Remaining checks don't apply to root cpuset */
        if (cur == &top_cpuset)
                return 0;

        par = cur->parent;

        /* We must be a subset of our parent cpuset */
        if (!is_cpuset_subset(trial, par))
                return -EACCES;

        /* If either I or some sibling (!= me) is exclusive, we can't overlap */
        list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
                c = cgroup_cs(cont);
                if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
                    c != cur &&
                    cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
                        return -EINVAL;
                if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
                    c != cur &&
                    nodes_intersects(trial->mems_allowed, c->mems_allowed))
                        return -EINVAL;
        }

        /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
        if (cgroup_task_count(cur->css.cgroup)) {
                if (cpus_empty(trial->cpus_allowed) ||
                    nodes_empty(trial->mems_allowed)) {
                        return -ENOSPC;
                }
        }

        return 0;
}
 
/*
 * Helper routine for rebuild_sched_domains().
 * Do cpusets a, b have overlapping cpus_allowed masks?
 */

static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
        return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
}

/*
 * rebuild_sched_domains()
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * This routine builds a partial partition of the systems CPUs
 * (the set of non-overlapping cpumask_t's in the array 'part'
 * below), and passes that partial partition to the kernel/sched.c
 * partition_sched_domains() routine, which will rebuild the
 * schedulers load balancing domains (sched domains) as specified
 * by that partial partition.  A 'partial partition' is a set of
 * non-overlapping subsets whose union is a subset of that set.
 *
 * See "What is sched_load_balance" in Documentation/cpusets.txt
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Call with cgroup_mutex held.  May take callback_mutex during
 * call due to the kfifo_alloc() and kmalloc() calls.  May nest
 * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
 * Must not be called holding callback_mutex, because we must not
 * call lock_cpu_hotplug() while holding callback_mutex.  Elsewhere
 * the kernel nests callback_mutex inside lock_cpu_hotplug() calls.
 * So the reverse nesting would risk an ABBA deadlock.
 *
 * The three key local variables below are:
 *    q  - a kfifo queue of cpuset pointers, used to implement a
 *         top-down scan of all cpusets.  This scan loads a pointer
 *         to each cpuset marked is_sched_load_balance into the
 *         array 'csa'.  For our purposes, rebuilding the schedulers
 *         sched domains, we can ignore !is_sched_load_balance cpusets.
 *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *         that need to be load balanced, for convenient iterative
 *         access by the subsequent code that finds the best partition,
 *         i.e the set of domains (subsets) of CPUs such that the
 *         cpus_allowed of every cpuset marked is_sched_load_balance
 *         is a subset of one of these domains, while there are as
 *         many such domains as possible, each as small as possible.
 * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 *         the kernel/sched.c routine partition_sched_domains() in a
 *         convenient format, that can be easily compared to the prior
 *         value to determine what partition elements (sched domains)
 *         were changed (added or removed.)
 *
 * Finding the best partition (set of domains):
 *      The triple nested loops below over i, j, k scan over the
 *      load balanced cpusets (using the array of cpuset pointers in
 *      csa[]) looking for pairs of cpusets that have overlapping
 *      cpus_allowed, but which don't have the same 'pn' partition
 *      number, and puts them into the same partition.  It keeps
 *      looping on the 'restart' label until it can no longer find
 *      any such pairs.
 *
 *      The union of the cpus_allowed masks from the set of
 *      all cpusets having the same 'pn' value then form the one
 *      element of the partition (one sched domain) to be passed to
 *      partition_sched_domains().
 */
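
/*
 * Editorial illustration (not part of the original source): suppose
 * top_cpuset has sched_load_balance cleared, and two child cpusets,
 * A with cpus 0-1 and B with cpus 2-3, both have it set.  The scan
 * below collects {A, B} in csa[]; their cpus_allowed do not overlap,
 * so no 'pn' values are merged and the result is ndoms == 2, with
 * one sched domain covering cpus 0-1 and another covering cpus 2-3.
 */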

static void rebuild_sched_domains(void)
{
        struct kfifo *q;        /* queue of cpusets to be scanned */
        struct cpuset *cp;      /* scans q */
        struct cpuset **csa;    /* array of all cpuset ptrs */
        int csn;                /* how many cpuset ptrs in csa so far */
        int i, j, k;            /* indices for partition finding loops */
        cpumask_t *doms;        /* resulting partition; i.e. sched domains */
        int ndoms;              /* number of sched domains in result */
        int nslot;              /* next empty doms[] cpumask_t slot */

        q = NULL;
        csa = NULL;
        doms = NULL;

        /* Special case for the 99% of systems with one, full, sched domain */
        if (is_sched_load_balance(&top_cpuset)) {
                ndoms = 1;
                doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
                if (!doms)
                        goto rebuild;
                *doms = top_cpuset.cpus_allowed;
                goto rebuild;
        }

        q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL);
        if (IS_ERR(q))
                goto done;
        csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
        if (!csa)
                goto done;
        csn = 0;

        cp = &top_cpuset;
        __kfifo_put(q, (void *)&cp, sizeof(cp));
        while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
                struct cgroup *cont;
                struct cpuset *child;   /* scans child cpusets of cp */
                if (is_sched_load_balance(cp))
                        csa[csn++] = cp;
                list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
                        child = cgroup_cs(cont);
                        __kfifo_put(q, (void *)&child, sizeof(cp));
                }
        }

        for (i = 0; i < csn; i++)
                csa[i]->pn = i;
        ndoms = csn;

restart:
        /* Find the best partition (set of sched domains) */
        for (i = 0; i < csn; i++) {
                struct cpuset *a = csa[i];
                int apn = a->pn;

                for (j = 0; j < csn; j++) {
                        struct cpuset *b = csa[j];
                        int bpn = b->pn;

                        if (apn != bpn && cpusets_overlap(a, b)) {
                                for (k = 0; k < csn; k++) {
                                        struct cpuset *c = csa[k];

                                        if (c->pn == bpn)
                                                c->pn = apn;
                                }
                                ndoms--;        /* one less element */
                                goto restart;
                        }
                }
        }

        /* Convert <csn, csa> to <ndoms, doms> */
        doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
        if (!doms)
                goto rebuild;

        for (nslot = 0, i = 0; i < csn; i++) {
                struct cpuset *a = csa[i];
                int apn = a->pn;

                if (apn >= 0) {
                        cpumask_t *dp = doms + nslot;

                        if (nslot == ndoms) {
                                static int warnings = 10;
                                if (warnings) {
                                        printk(KERN_WARNING
                                         "rebuild_sched_domains confused:"
                                          " nslot %d, ndoms %d, csn %d, i %d,"
                                          " apn %d\n",
                                          nslot, ndoms, csn, i, apn);
                                        warnings--;
                                }
                                continue;
                        }

                        cpus_clear(*dp);
                        for (j = i; j < csn; j++) {
                                struct cpuset *b = csa[j];

                                if (apn == b->pn) {
                                        cpus_or(*dp, *dp, b->cpus_allowed);
                                        b->pn = -1;
                                }
                        }
                        nslot++;
                }
        }
        BUG_ON(nslot != ndoms);

rebuild:
        /* Have scheduler rebuild sched domains */
        lock_cpu_hotplug();
        partition_sched_domains(ndoms, doms);
        unlock_cpu_hotplug();

done:
        if (q && !IS_ERR(q))
                kfifo_free(q);
        kfree(csa);
        /* Don't kfree(doms) -- partition_sched_domains() does that. */
}
 
static inline int started_after_time(struct task_struct *t1,
                                     struct timespec *time,
                                     struct task_struct *t2)
{
        int start_diff = timespec_compare(&t1->start_time, time);
        if (start_diff > 0) {
                return 1;
        } else if (start_diff < 0) {
                return 0;
        } else {
                /*
                 * Arbitrarily, if two processes started at the same
                 * time, we'll say that the lower pointer value
                 * started first. Note that t2 may have exited by now
                 * so this may not be a valid pointer any longer, but
                 * that's fine - it still serves to distinguish
                 * between two tasks started (effectively)
                 * simultaneously.
                 */
                return t1 > t2;
        }
}

static inline int started_after(void *p1, void *p2)
{
        struct task_struct *t1 = p1;
        struct task_struct *t2 = p2;
        return started_after_time(t1, &t2->start_time, t2);
}

/*
 * Call with manage_mutex held.  May take callback_mutex during call.
 */

static int update_cpumask(struct cpuset *cs, char *buf)
{
        struct cpuset trialcs;
        int retval, i;
        int is_load_balanced;
        struct cgroup_iter it;
        struct cgroup *cgrp = cs->css.cgroup;
        struct task_struct *p, *dropped;
        /* Never dereference latest_task, since it's not refcounted */
        struct task_struct *latest_task = NULL;
        struct ptr_heap heap;
        struct timespec latest_time = { 0, 0 };

        /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
        if (cs == &top_cpuset)
                return -EACCES;

        trialcs = *cs;

        /*
         * An empty cpus_allowed is ok iff there are no tasks in the cpuset.
         * Since cpulist_parse() fails on an empty mask, we special case
         * that parsing.  The validate_change() call ensures that cpusets
         * with tasks have cpus.
         */
        buf = strstrip(buf);
        if (!*buf) {
                cpus_clear(trialcs.cpus_allowed);
        } else {
                retval = cpulist_parse(buf, trialcs.cpus_allowed);
                if (retval < 0)
                        return retval;
        }
        cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
        retval = validate_change(cs, &trialcs);
        if (retval < 0)
                return retval;

        /* Nothing to do if the cpus didn't change */
        if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
                return 0;
        retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
        if (retval)
                return retval;

        is_load_balanced = is_sched_load_balance(&trialcs);

        mutex_lock(&callback_mutex);
        cs->cpus_allowed = trialcs.cpus_allowed;
        mutex_unlock(&callback_mutex);

 again:
        /*
         * Scan tasks in the cpuset, and update the cpumasks of any
         * that need an update. Since we can't call set_cpus_allowed()
         * while holding tasklist_lock, gather tasks to be processed
         * in a heap structure. If the statically-sized heap fills up,
         * overflow tasks that started later, and in future iterations
         * only consider tasks that started after the latest task in
         * the previous pass. This guarantees forward progress and
         * that we don't miss any tasks
         */
        heap.size = 0;
        cgroup_iter_start(cgrp, &it);
        while ((p = cgroup_iter_next(cgrp, &it))) {
                /* Only affect tasks that don't have the right cpus_allowed */
                if (cpus_equal(p->cpus_allowed, cs->cpus_allowed))
                        continue;
                /*
                 * Only process tasks that started after the last task
                 * we processed
                 */
                if (!started_after_time(p, &latest_time, latest_task))
                        continue;
                dropped = heap_insert(&heap, p);
                if (dropped == NULL) {
                        get_task_struct(p);
                } else if (dropped != p) {
                        get_task_struct(p);
                        put_task_struct(dropped);
                }
        }
        cgroup_iter_end(cgrp, &it);
        if (heap.size) {
                for (i = 0; i < heap.size; i++) {
                        struct task_struct *p = heap.ptrs[i];
                        if (i == 0) {
                                latest_time = p->start_time;
                                latest_task = p;
                        }
                        set_cpus_allowed(p, cs->cpus_allowed);
                        put_task_struct(p);
                }
                /*
                 * If we had to process any tasks at all, scan again
                 * in case some of them were in the middle of forking
                 * children that didn't notice the new cpumask
                 * restriction.  Not the most efficient way to do it,
                 * but it avoids having to take callback_mutex in the
                 * fork path
                 */
                goto again;
        }
        heap_free(&heap);
        if (is_load_balanced)
                rebuild_sched_domains();

        return 0;
}
 
/*
 * cpuset_migrate_mm
 *
 *    Migrate memory region from one set of nodes to another.
 *
 *    Temporarily set tasks mems_allowed to target nodes of migration,
 *    so that the migration code can allocate pages on these nodes.
 *
 *    Call holding manage_mutex, so our current->cpuset won't change
 *    during this call, as manage_mutex holds off any attach_task()
 *    calls.  Therefore we don't need to take task_lock around the
 *    call to guarantee_online_mems(), as we know no one is changing
 *    our tasks cpuset.
 *
 *    Hold callback_mutex around the two modifications of our tasks
 *    mems_allowed to synchronize with cpuset_mems_allowed().
 *
 *    While the mm_struct we are migrating is typically from some
 *    other task, the task_struct mems_allowed that we are hacking
 *    is for our current task, which must allocate new pages for that
 *    migrating memory region.
 *
 *    We call cpuset_update_task_memory_state() before hacking
 *    our tasks mems_allowed, so that we are assured of being in
 *    sync with our tasks cpuset, and in particular, callbacks to
 *    cpuset_update_task_memory_state() from nested page allocations
 *    won't see any mismatch of our cpuset and task mems_generation
 *    values, so won't overwrite our hacked tasks mems_allowed
 *    nodemask.
 */

static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
                                                        const nodemask_t *to)
{
        struct task_struct *tsk = current;

        cpuset_update_task_memory_state();

        mutex_lock(&callback_mutex);
        tsk->mems_allowed = *to;
        mutex_unlock(&callback_mutex);

        do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);

        mutex_lock(&callback_mutex);
        guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
        mutex_unlock(&callback_mutex);
}

/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpusets mems_allowed and mems_generation, and for each
 * task in the cpuset, rebind any vma mempolicies and if
 * the cpuset is marked 'memory_migrate', migrate the tasks
 * pages to the new memory.
 *
 * Call with manage_mutex held.  May take callback_mutex during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpusets new mems_allowed.
 */

static void *cpuset_being_rebound;
 
static int update_nodemask(struct cpuset *cs, char *buf)
{
        struct cpuset trialcs;
        nodemask_t oldmem;
        struct task_struct *p;
        struct mm_struct **mmarray;
        int i, n, ntasks;
        int migrate;
        int fudge;
        int retval;
        struct cgroup_iter it;

        /*
         * top_cpuset.mems_allowed tracks node_states[N_HIGH_MEMORY];
         * it's read-only
         */
        if (cs == &top_cpuset)
                return -EACCES;

        trialcs = *cs;

        /*
         * An empty mems_allowed is ok iff there are no tasks in the cpuset.
         * Since nodelist_parse() fails on an empty mask, we special case
         * that parsing.  The validate_change() call ensures that cpusets
         * with tasks have memory.
         */
        buf = strstrip(buf);
        if (!*buf) {
                nodes_clear(trialcs.mems_allowed);
        } else {
                retval = nodelist_parse(buf, trialcs.mems_allowed);
                if (retval < 0)
                        goto done;
        }
        nodes_and(trialcs.mems_allowed, trialcs.mems_allowed,
                                                node_states[N_HIGH_MEMORY]);
        oldmem = cs->mems_allowed;
        if (nodes_equal(oldmem, trialcs.mems_allowed)) {
                retval = 0;              /* Too easy - nothing to do */
                goto done;
        }
        retval = validate_change(cs, &trialcs);
        if (retval < 0)
                goto done;

        mutex_lock(&callback_mutex);
        cs->mems_allowed = trialcs.mems_allowed;
        cs->mems_generation = cpuset_mems_generation++;
        mutex_unlock(&callback_mutex);

        cpuset_being_rebound = cs;              /* causes mpol_copy() rebind */

        fudge = 10;                             /* spare mmarray[] slots */
        fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
        retval = -ENOMEM;

        /*
         * Allocate mmarray[] to hold mm reference for each task
         * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
         * tasklist_lock.  We could use GFP_ATOMIC, but with a
         * few more lines of code, we can retry until we get a big
         * enough mmarray[] w/o using GFP_ATOMIC.
         */
        while (1) {
                ntasks = cgroup_task_count(cs->css.cgroup);  /* guess */
                ntasks += fudge;
                mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
                if (!mmarray)
                        goto done;
                read_lock(&tasklist_lock);              /* block fork */
                if (cgroup_task_count(cs->css.cgroup) <= ntasks)
                        break;                          /* got enough */
                read_unlock(&tasklist_lock);            /* try again */
                kfree(mmarray);
        }

        n = 0;

        /* Load up mmarray[] with mm reference for each task in cpuset. */
        cgroup_iter_start(cs->css.cgroup, &it);
        while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
                struct mm_struct *mm;

                if (n >= ntasks) {
                        printk(KERN_WARNING
                                "Cpuset mempolicy rebind incomplete.\n");
                        break;
                }
                mm = get_task_mm(p);
                if (!mm)
                        continue;
                mmarray[n++] = mm;
        }
        cgroup_iter_end(cs->css.cgroup, &it);
        read_unlock(&tasklist_lock);

        /*
         * Now that we've dropped the tasklist spinlock, we can
         * rebind the vma mempolicies of each mm in mmarray[] to their
         * new cpuset, and release that mm.  The mpol_rebind_mm()
         * call takes mmap_sem, which we couldn't take while holding
         * tasklist_lock.  Forks can happen again now - the mpol_copy()
         * cpuset_being_rebound check will catch such forks, and rebind
         * their vma mempolicies too.  Because we still hold the global
         * cpuset manage_mutex, we know that no other rebind effort will
         * be contending for the global variable cpuset_being_rebound.
         * It's ok if we rebind the same mm twice; mpol_rebind_mm()
         * is idempotent.  Also migrate pages in each mm to new nodes.
         */
        migrate = is_memory_migrate(cs);
        for (i = 0; i < n; i++) {
                struct mm_struct *mm = mmarray[i];

                mpol_rebind_mm(mm, &cs->mems_allowed);
                if (migrate)
                        cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
                mmput(mm);
        }

        /* We're done rebinding vma's to this cpusets new mems_allowed. */
        kfree(mmarray);
        cpuset_being_rebound = NULL;
        retval = 0;
done:
        return retval;
}
 
int current_cpuset_is_being_rebound(void)
{
        return task_cs(current) == cpuset_being_rebound;
}

/*
 * Call with manage_mutex held.
 */

static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
{
        if (simple_strtoul(buf, NULL, 10) != 0)
                cpuset_memory_pressure_enabled = 1;
        else
                cpuset_memory_pressure_enabled = 0;
        return 0;
}

/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
 *                              CS_SCHED_LOAD_BALANCE,
 *                              CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
 *                              CS_SPREAD_PAGE, CS_SPREAD_SLAB)
 * cs:  the cpuset to update
 * buf: the buffer where we read the 0 or 1
 *
 * Call with manage_mutex held.
 */

static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
{
        int turning_on;
        struct cpuset trialcs;
        int err;
        int cpus_nonempty, balance_flag_changed;

        turning_on = (simple_strtoul(buf, NULL, 10) != 0);

        trialcs = *cs;
        if (turning_on)
                set_bit(bit, &trialcs.flags);
        else
                clear_bit(bit, &trialcs.flags);

        err = validate_change(cs, &trialcs);
        if (err < 0)
                return err;

        cpus_nonempty = !cpus_empty(trialcs.cpus_allowed);
        balance_flag_changed = (is_sched_load_balance(cs) !=
                                        is_sched_load_balance(&trialcs));

        mutex_lock(&callback_mutex);
        cs->flags = trialcs.flags;
        mutex_unlock(&callback_mutex);

        if (cpus_nonempty && balance_flag_changed)
                rebuild_sched_domains();

        return 0;
}
 
/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds.  At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000.  At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event.  At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */

#define FM_COEF 933             /* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000       /* limit cnt to avoid overflow */
#define FM_SCALE 1000           /* faux fixed point scale */
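
/*
 * Editorial note (not part of the original source): a quick check of
 * the "half-life of 10 seconds" claim above: each elapsed second the
 * filtered value is multiplied by FM_COEF/FM_SCALE = 933/1000, and
 * 0.933^10 ~= 0.50, so with no new events the reported rate halves
 * roughly every 10 ticks (seconds).
 */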

/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{
        fmp->cnt = 0;
        fmp->val = 0;
        fmp->time = 0;
        spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
        time_t now = get_seconds();
        time_t ticks = now - fmp->time;

        if (ticks == 0)
                return;

        ticks = min(FM_MAXTICKS, ticks);
        while (ticks-- > 0)
                fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
        fmp->time = now;

        fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
        fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
        spin_lock(&fmp->lock);
        fmeter_update(fmp);
        fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
        spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
        int val;

        spin_lock(&fmp->lock);
        fmeter_update(fmp);
        val = fmp->val;
        spin_unlock(&fmp->lock);
        return val;
}

static int cpuset_can_attach(struct cgroup_subsys *ss,
                             struct cgroup *cont, struct task_struct *tsk)
{
        struct cpuset *cs = cgroup_cs(cont);

        if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
                return -ENOSPC;

        return security_task_setscheduler(tsk, 0, NULL);
}

static void cpuset_attach(struct cgroup_subsys *ss,
                          struct cgroup *cont, struct cgroup *oldcont,
                          struct task_struct *tsk)
{
        cpumask_t cpus;
        nodemask_t from, to;
        struct mm_struct *mm;
        struct cpuset *cs = cgroup_cs(cont);
        struct cpuset *oldcs = cgroup_cs(oldcont);

        mutex_lock(&callback_mutex);
        guarantee_online_cpus(cs, &cpus);
        set_cpus_allowed(tsk, cpus);
        mutex_unlock(&callback_mutex);

        from = oldcs->mems_allowed;
        to = cs->mems_allowed;
        mm = get_task_mm(tsk);
        if (mm) {
                mpol_rebind_mm(mm, &to);
                if (is_memory_migrate(cs))
                        cpuset_migrate_mm(mm, &from, &to);
                mmput(mm);
        }

}

/* The various types of files and directories in a cpuset file system */
 
typedef enum {
        FILE_MEMORY_MIGRATE,
        FILE_CPULIST,
        FILE_MEMLIST,
        FILE_CPU_EXCLUSIVE,
        FILE_MEM_EXCLUSIVE,
        FILE_SCHED_LOAD_BALANCE,
        FILE_MEMORY_PRESSURE_ENABLED,
        FILE_MEMORY_PRESSURE,
        FILE_SPREAD_PAGE,
        FILE_SPREAD_SLAB,
} cpuset_filetype_t;

static ssize_t cpuset_common_file_write(struct cgroup *cont,
                                        struct cftype *cft,
                                        struct file *file,
                                        const char __user *userbuf,
                                        size_t nbytes, loff_t *unused_ppos)
{
        struct cpuset *cs = cgroup_cs(cont);
        cpuset_filetype_t type = cft->private;
        char *buffer;
        int retval = 0;

        /* Crude upper limit on largest legitimate cpulist user might write. */
        if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
                return -E2BIG;

        /* +1 for nul-terminator */
        if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0)
                return -ENOMEM;

        if (copy_from_user(buffer, userbuf, nbytes)) {
                retval = -EFAULT;
                goto out1;
        }
        buffer[nbytes] = 0;      /* nul-terminate */

        cgroup_lock();

        if (cgroup_is_removed(cont)) {
                retval = -ENODEV;
                goto out2;
        }

        switch (type) {
        case FILE_CPULIST:
                retval = update_cpumask(cs, buffer);
                break;
        case FILE_MEMLIST:
                retval = update_nodemask(cs, buffer);
                break;
        case FILE_CPU_EXCLUSIVE:
                retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer);
                break;
        case FILE_MEM_EXCLUSIVE:
                retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
                break;
        case FILE_SCHED_LOAD_BALANCE:
                retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
                break;
        case FILE_MEMORY_MIGRATE:
                retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
                break;
        case FILE_MEMORY_PRESSURE_ENABLED:
                retval = update_memory_pressure_enabled(cs, buffer);
                break;
        case FILE_MEMORY_PRESSURE:
                retval = -EACCES;
                break;
        case FILE_SPREAD_PAGE:
                retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
                cs->mems_generation = cpuset_mems_generation++;
                break;
        case FILE_SPREAD_SLAB:
                retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
                cs->mems_generation = cpuset_mems_generation++;
                break;
        default:
                retval = -EINVAL;
                goto out2;
        }

        if (retval == 0)
                retval = nbytes;
out2:
        cgroup_unlock();
out1:
        kfree(buffer);
        return retval;
}
 
1335
/*
1336
 * These ascii lists should be read in a single call, by using a user
1337
 * buffer large enough to hold the entire map.  If read in smaller
1338
 * chunks, there is no guarantee of atomicity.  Since the display format
1339
 * used, list of ranges of sequential numbers, is variable length,
1340
 * and since these maps can change value dynamically, one could read
1341
 * gibberish by doing partial reads while a list was changing.
1342
 * A single large read to a buffer that crosses a page boundary is
1343
 * ok, because the result being copied to user land is not recomputed
1344
 * across a page fault.
1345
 */
1346
 
1347
static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1348
{
1349
        cpumask_t mask;
1350
 
1351
        mutex_lock(&callback_mutex);
1352
        mask = cs->cpus_allowed;
1353
        mutex_unlock(&callback_mutex);
1354
 
1355
        return cpulist_scnprintf(page, PAGE_SIZE, mask);
1356
}
1357
 
1358
static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1359
{
1360
        nodemask_t mask;
1361
 
1362
        mutex_lock(&callback_mutex);
1363
        mask = cs->mems_allowed;
1364
        mutex_unlock(&callback_mutex);
1365
 
1366
        return nodelist_scnprintf(page, PAGE_SIZE, mask);
1367
}
1368
 
1369
static ssize_t cpuset_common_file_read(struct cgroup *cont,
1370
                                       struct cftype *cft,
1371
                                       struct file *file,
1372
                                       char __user *buf,
1373
                                       size_t nbytes, loff_t *ppos)
1374
{
1375
        struct cpuset *cs = cgroup_cs(cont);
1376
        cpuset_filetype_t type = cft->private;
1377
        char *page;
1378
        ssize_t retval = 0;
1379
        char *s;
1380
 
1381
        if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
1382
                return -ENOMEM;
1383
 
1384
        s = page;
1385
 
1386
        switch (type) {
1387
        case FILE_CPULIST:
1388
                s += cpuset_sprintf_cpulist(s, cs);
1389
                break;
1390
        case FILE_MEMLIST:
1391
                s += cpuset_sprintf_memlist(s, cs);
1392
                break;
1393
        case FILE_CPU_EXCLUSIVE:
1394
                *s++ = is_cpu_exclusive(cs) ? '1' : '0';
1395
                break;
1396
        case FILE_MEM_EXCLUSIVE:
1397
                *s++ = is_mem_exclusive(cs) ? '1' : '0';
1398
                break;
1399
        case FILE_SCHED_LOAD_BALANCE:
1400
                *s++ = is_sched_load_balance(cs) ? '1' : '0';
1401
                break;
1402
        case FILE_MEMORY_MIGRATE:
1403
                *s++ = is_memory_migrate(cs) ? '1' : '0';
1404
                break;
1405
        case FILE_MEMORY_PRESSURE_ENABLED:
1406
                *s++ = cpuset_memory_pressure_enabled ? '1' : '0';
1407
                break;
1408
        case FILE_MEMORY_PRESSURE:
1409
                s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
1410
                break;
1411
        case FILE_SPREAD_PAGE:
1412
                *s++ = is_spread_page(cs) ? '1' : '0';
1413
                break;
1414
        case FILE_SPREAD_SLAB:
1415
                *s++ = is_spread_slab(cs) ? '1' : '0';
1416
                break;
1417
        default:
1418
                retval = -EINVAL;
1419
                goto out;
1420
        }
1421
        *s++ = '\n';
1422
 
1423
        retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1424
out:
1425
        free_page((unsigned long)page);
1426
        return retval;
1427
}
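
Editorial note: per the comment above cpuset_sprintf_cpulist(), these lists should be fetched with one sufficiently large read() so the snapshot is consistent. A minimal user-space sketch, using the same assumed /dev/cpuset mount point as above:

/* Hypothetical example: read a cpuset's "cpus" list in a single read(). */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];                         /* large enough to hold the whole range list */
        ssize_t n;
        int fd = open("/dev/cpuset/mygroup/cpus", O_RDONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        n = read(fd, buf, sizeof(buf) - 1);     /* one call: no torn partial reads */
        if (n >= 0) {
                buf[n] = '\0';
                printf("cpus: %s", buf);        /* kernel output already ends in '\n' */
        }
        close(fd);
        return 0;
}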
1428
 
1429
 
1430
 
1431
 
1432
 
1433
/*
1434
 * for the common functions, 'private' gives the type of file
1435
 */
1436
 
1437
static struct cftype cft_cpus = {
1438
        .name = "cpus",
1439
        .read = cpuset_common_file_read,
1440
        .write = cpuset_common_file_write,
1441
        .private = FILE_CPULIST,
1442
};
1443
 
1444
static struct cftype cft_mems = {
1445
        .name = "mems",
1446
        .read = cpuset_common_file_read,
1447
        .write = cpuset_common_file_write,
1448
        .private = FILE_MEMLIST,
1449
};
1450
 
1451
static struct cftype cft_cpu_exclusive = {
1452
        .name = "cpu_exclusive",
1453
        .read = cpuset_common_file_read,
1454
        .write = cpuset_common_file_write,
1455
        .private = FILE_CPU_EXCLUSIVE,
1456
};
1457
 
1458
static struct cftype cft_mem_exclusive = {
1459
        .name = "mem_exclusive",
1460
        .read = cpuset_common_file_read,
1461
        .write = cpuset_common_file_write,
1462
        .private = FILE_MEM_EXCLUSIVE,
1463
};
1464
 
1465
static struct cftype cft_sched_load_balance = {
1466
        .name = "sched_load_balance",
1467
        .read = cpuset_common_file_read,
1468
        .write = cpuset_common_file_write,
1469
        .private = FILE_SCHED_LOAD_BALANCE,
1470
};
1471
 
1472
static struct cftype cft_memory_migrate = {
1473
        .name = "memory_migrate",
1474
        .read = cpuset_common_file_read,
1475
        .write = cpuset_common_file_write,
1476
        .private = FILE_MEMORY_MIGRATE,
1477
};
1478
 
1479
static struct cftype cft_memory_pressure_enabled = {
1480
        .name = "memory_pressure_enabled",
1481
        .read = cpuset_common_file_read,
1482
        .write = cpuset_common_file_write,
1483
        .private = FILE_MEMORY_PRESSURE_ENABLED,
1484
};
1485
 
1486
static struct cftype cft_memory_pressure = {
1487
        .name = "memory_pressure",
1488
        .read = cpuset_common_file_read,
1489
        .write = cpuset_common_file_write,
1490
        .private = FILE_MEMORY_PRESSURE,
1491
};
1492
 
1493
static struct cftype cft_spread_page = {
1494
        .name = "memory_spread_page",
1495
        .read = cpuset_common_file_read,
1496
        .write = cpuset_common_file_write,
1497
        .private = FILE_SPREAD_PAGE,
1498
};
1499
 
1500
static struct cftype cft_spread_slab = {
1501
        .name = "memory_spread_slab",
1502
        .read = cpuset_common_file_read,
1503
        .write = cpuset_common_file_write,
1504
        .private = FILE_SPREAD_SLAB,
1505
};
1506
 
1507
static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1508
{
1509
        int err;
1510
 
1511
        if ((err = cgroup_add_file(cont, ss, &cft_cpus)) < 0)
1512
                return err;
1513
        if ((err = cgroup_add_file(cont, ss, &cft_mems)) < 0)
1514
                return err;
1515
        if ((err = cgroup_add_file(cont, ss, &cft_cpu_exclusive)) < 0)
1516
                return err;
1517
        if ((err = cgroup_add_file(cont, ss, &cft_mem_exclusive)) < 0)
1518
                return err;
1519
        if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0)
1520
                return err;
1521
        if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
1522
                return err;
1523
        if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
1524
                return err;
1525
        if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
1526
                return err;
1527
        if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0)
1528
                return err;
1529
        /* memory_pressure_enabled is in root cpuset only */
1530
        if (err == 0 && !cont->parent)
1531
                err = cgroup_add_file(cont, ss,
1532
                                         &cft_memory_pressure_enabled);
1533
        return err;
1534
}
1535
 
1536
/*
1537
 * post_clone() is called at the end of cgroup_clone().
1538
 * 'cgroup' was just created automatically as a result of
1539
 * a cgroup_clone(), and the current task is about to
1540
 * be moved into 'cgroup'.
1541
 *
1542
 * Currently we refuse to set up the cgroup - thereby
1543
 * refusing the task to be entered, and as a result refusing
1544
 * the sys_unshare() or clone() which initiated it - if any
1545
 * sibling cpusets have exclusive cpus or mem.
1546
 *
1547
 * If this becomes a problem for some users who wish to
1548
 * allow that scenario, then cpuset_post_clone() could be
1549
 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1550
 * (and likewise for mems) to the new cgroup.
1551
 */
1552
static void cpuset_post_clone(struct cgroup_subsys *ss,
1553
                              struct cgroup *cgroup)
1554
{
1555
        struct cgroup *parent, *child;
1556
        struct cpuset *cs, *parent_cs;
1557
 
1558
        parent = cgroup->parent;
1559
        list_for_each_entry(child, &parent->children, sibling) {
1560
                cs = cgroup_cs(child);
1561
                if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1562
                        return;
1563
        }
1564
        cs = cgroup_cs(cgroup);
1565
        parent_cs = cgroup_cs(parent);
1566
 
1567
        cs->mems_allowed = parent_cs->mems_allowed;
1568
        cs->cpus_allowed = parent_cs->cpus_allowed;
1569
        return;
1570
}
1571
 
1572
/*
1573
 *      cpuset_create - create a cpuset
1574
 *      ss:     the cgroup subsystem doing the create (cpuset_subsys)
1575
 *      cont:   the new cgroup; its parent cgroup carries the parent cpuset
1577
 *
1578
 *      Must be called with the mutex on the parent inode held
1579
 */
1580
 
1581
static struct cgroup_subsys_state *cpuset_create(
1582
        struct cgroup_subsys *ss,
1583
        struct cgroup *cont)
1584
{
1585
        struct cpuset *cs;
1586
        struct cpuset *parent;
1587
 
1588
        if (!cont->parent) {
1589
                /* This is early initialization for the top cgroup */
1590
                top_cpuset.mems_generation = cpuset_mems_generation++;
1591
                return &top_cpuset.css;
1592
        }
1593
        parent = cgroup_cs(cont->parent);
1594
        cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1595
        if (!cs)
1596
                return ERR_PTR(-ENOMEM);
1597
 
1598
        cpuset_update_task_memory_state();
1599
        cs->flags = 0;
1600
        if (is_spread_page(parent))
1601
                set_bit(CS_SPREAD_PAGE, &cs->flags);
1602
        if (is_spread_slab(parent))
1603
                set_bit(CS_SPREAD_SLAB, &cs->flags);
1604
        set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1605
        cs->cpus_allowed = CPU_MASK_NONE;
1606
        cs->mems_allowed = NODE_MASK_NONE;
1607
        cs->mems_generation = cpuset_mems_generation++;
1608
        fmeter_init(&cs->fmeter);
1609
 
1610
        cs->parent = parent;
1611
        number_of_cpusets++;
1612
        return &cs->css;
1613
}
1614
 
1615
/*
1616
 * Locking note on the strange update_flag() call below:
1617
 *
1618
 * If the cpuset being removed has its flag 'sched_load_balance'
1619
 * enabled, then simulate turning sched_load_balance off, which
1620
 * will call rebuild_sched_domains().  The lock_cpu_hotplug()
1621
 * call in rebuild_sched_domains() must not be made while holding
1622
 * callback_mutex.  Elsewhere the kernel nests callback_mutex inside
1623
 * lock_cpu_hotplug() calls.  So the reverse nesting would risk an
1624
 * ABBA deadlock.
1625
 */
1626
 
1627
static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1628
{
1629
        struct cpuset *cs = cgroup_cs(cont);
1630
 
1631
        cpuset_update_task_memory_state();
1632
 
1633
        if (is_sched_load_balance(cs))
1634
                update_flag(CS_SCHED_LOAD_BALANCE, cs, "0");
1635
 
1636
        number_of_cpusets--;
1637
        kfree(cs);
1638
}
1639
 
1640
struct cgroup_subsys cpuset_subsys = {
1641
        .name = "cpuset",
1642
        .create = cpuset_create,
1643
        .destroy  = cpuset_destroy,
1644
        .can_attach = cpuset_can_attach,
1645
        .attach = cpuset_attach,
1646
        .populate = cpuset_populate,
1647
        .post_clone = cpuset_post_clone,
1648
        .subsys_id = cpuset_subsys_id,
1649
        .early_init = 1,
1650
};
1651
 
1652
/*
1653
 * cpuset_init_early - just enough so that the calls to
1654
 * cpuset_update_task_memory_state() in early init code
1655
 * are harmless.
1656
 */
1657
 
1658
int __init cpuset_init_early(void)
1659
{
1660
        top_cpuset.mems_generation = cpuset_mems_generation++;
1661
        return 0;
1662
}
1663
 
1664
 
1665
/**
1666
 * cpuset_init - initialize cpusets at system boot
1667
 *
1668
 * Description: Initialize top_cpuset and the cpuset internal file system.
1669
 **/
1670
 
1671
int __init cpuset_init(void)
1672
{
1673
        int err = 0;
1674
 
1675
        top_cpuset.cpus_allowed = CPU_MASK_ALL;
1676
        top_cpuset.mems_allowed = NODE_MASK_ALL;
1677
 
1678
        fmeter_init(&top_cpuset.fmeter);
1679
        top_cpuset.mems_generation = cpuset_mems_generation++;
1680
        set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1681
 
1682
        err = register_filesystem(&cpuset_fs_type);
1683
        if (err < 0)
1684
                return err;
1685
 
1686
        number_of_cpusets = 1;
1687
        return 0;
1688
}
1689
 
1690
/*
1691
 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
1692
 * or memory nodes, we need to walk over the cpuset hierarchy,
1693
 * removing that CPU or node from all cpusets.  If this removes the
1694
 * last CPU or node from a cpuset, then the guarantee_online_cpus()
1695
 * or guarantee_online_mems() code will use that emptied cpuset's
1696
 * parent's online CPUs or nodes.  Cpusets that were already empty of
1697
 * CPUs or nodes are left empty.
1698
 *
1699
 * This routine is intentionally inefficient in a couple of regards.
1700
 * It will check all cpusets in a subtree even if the top cpuset of
1701
 * the subtree has no offline CPUs or nodes.  It checks both CPUs and
1702
 * nodes, even though the caller could have been coded to know that
1703
 * only one of CPUs or nodes needed to be checked on a given call.
1704
 * This was done to minimize text size rather than cpu cycles.
1705
 *
1706
 * Call with both manage_mutex and callback_mutex held.
1707
 *
1708
 * Recursive, on depth of cpuset subtree.
1709
 */
1710
 
1711
static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
1712
{
1713
        struct cgroup *cont;
1714
        struct cpuset *c;
1715
 
1716
        /* Each of our child cpusets' cpus and mems must be kept online */
1717
        list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
1718
                c = cgroup_cs(cont);
1719
                guarantee_online_cpus_mems_in_subtree(c);
1720
                if (!cpus_empty(c->cpus_allowed))
1721
                        guarantee_online_cpus(c, &c->cpus_allowed);
1722
                if (!nodes_empty(c->mems_allowed))
1723
                        guarantee_online_mems(c, &c->mems_allowed);
1724
        }
1725
}
1726
 
1727
/*
1728
 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
1729
 * cpu_online_map and node_states[N_HIGH_MEMORY].  Force the top cpuset to
1730
 * track what's online after any CPU or memory node hotplug or unplug
1731
 * event.
1732
 *
1733
 * To ensure that we don't remove a CPU or node from the top cpuset
1734
 * that is currently in use by a child cpuset (which would violate
1735
 * the rule that cpusets must be subsets of their parent), we first
1736
 * call the recursive routine guarantee_online_cpus_mems_in_subtree().
1737
 *
1738
 * Since there are two callers of this routine, one for CPU hotplug
1739
 * events and one for memory node hotplug events, we could have coded
1740
 * two separate routines here.  We code it as a single common routine
1741
 * in order to minimize text size.
1742
 */
1743
 
1744
static void common_cpu_mem_hotplug_unplug(void)
1745
{
1746
        cgroup_lock();
1747
        mutex_lock(&callback_mutex);
1748
 
1749
        guarantee_online_cpus_mems_in_subtree(&top_cpuset);
1750
        top_cpuset.cpus_allowed = cpu_online_map;
1751
        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1752
 
1753
        mutex_unlock(&callback_mutex);
1754
        cgroup_unlock();
1755
}
1756
 
1757
/*
1758
 * The top_cpuset tracks what CPUs and Memory Nodes are online,
1759
 * period.  This is necessary in order to make cpusets transparent
1760
 * (of no effect) on systems that are actively using CPU hotplug
1761
 * but making no active use of cpusets.
1762
 *
1763
 * This routine ensures that top_cpuset.cpus_allowed tracks
1764
 * cpu_online_map on each CPU hotplug (cpuhp) event.
1765
 */
1766
 
1767
static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
1768
                                unsigned long phase, void *unused_cpu)
1769
{
1770
        if (phase == CPU_DYING || phase == CPU_DYING_FROZEN)
1771
                return NOTIFY_DONE;
1772
 
1773
        common_cpu_mem_hotplug_unplug();
1774
        return 0;
1775
}
1776
 
1777
#ifdef CONFIG_MEMORY_HOTPLUG
1778
/*
1779
 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
1780
 * Call this routine anytime after you change
1781
 * node_states[N_HIGH_MEMORY].
1782
 * See also the previous routine cpuset_handle_cpuhp().
1783
 */
1784
 
1785
void cpuset_track_online_nodes(void)
1786
{
1787
        common_cpu_mem_hotplug_unplug();
1788
}
1789
#endif
1790
 
1791
/**
1792
 * cpuset_init_smp - initialize cpus_allowed
1793
 *
1794
 * Description: Finish top cpuset after cpu, node maps are initialized
1795
 **/
1796
 
1797
void __init cpuset_init_smp(void)
1798
{
1799
        top_cpuset.cpus_allowed = cpu_online_map;
1800
        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1801
 
1802
        hotcpu_notifier(cpuset_handle_cpuhp, 0);
1803
}
1804
 
1805
/**
1807
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
1808
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
1809
 *
1810
 * Description: Returns the cpumask_t cpus_allowed of the cpuset
1811
 * attached to the specified @tsk.  Guaranteed to return some non-empty
1812
 * subset of cpu_online_map, even if this means going outside the
1813
 * tasks cpuset.
1814
 **/
1815
 
1816
cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
1817
{
1818
        cpumask_t mask;
1819
 
1820
        mutex_lock(&callback_mutex);
1821
        mask = cpuset_cpus_allowed_locked(tsk);
1822
        mutex_unlock(&callback_mutex);
1823
 
1824
        return mask;
1825
}
1826
 
1827
/**
1828
 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
1829
 * Must be called with callback_mutex held.
1830
 **/
1831
cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk)
1832
{
1833
        cpumask_t mask;
1834
 
1835
        task_lock(tsk);
1836
        guarantee_online_cpus(task_cs(tsk), &mask);
1837
        task_unlock(tsk);
1838
 
1839
        return mask;
1840
}
1841
 
1842
void cpuset_init_current_mems_allowed(void)
1843
{
1844
        current->mems_allowed = NODE_MASK_ALL;
1845
}
1846
 
1847
/**
1848
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
1849
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
1850
 *
1851
 * Description: Returns the nodemask_t mems_allowed of the cpuset
1852
 * attached to the specified @tsk.  Guaranteed to return some non-empty
1853
 * subset of node_states[N_HIGH_MEMORY], even if this means going outside the
1854
 * tasks cpuset.
1855
 **/
1856
 
1857
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
1858
{
1859
        nodemask_t mask;
1860
 
1861
        mutex_lock(&callback_mutex);
1862
        task_lock(tsk);
1863
        guarantee_online_mems(task_cs(tsk), &mask);
1864
        task_unlock(tsk);
1865
        mutex_unlock(&callback_mutex);
1866
 
1867
        return mask;
1868
}
1869
 
1870
/**
1871
 * cpuset_zonelist_valid_mems_allowed - check zonelist vs. current mems_allowed
1872
 * @zl: the zonelist to be checked
1873
 *
1874
 * Are any of the nodes on zonelist zl allowed in current->mems_allowed?
1875
 */
1876
int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
1877
{
1878
        int i;
1879
 
1880
        for (i = 0; zl->zones[i]; i++) {
1881
                int nid = zone_to_nid(zl->zones[i]);
1882
 
1883
                if (node_isset(nid, current->mems_allowed))
1884
                        return 1;
1885
        }
1886
        return 0;
1887
}
1888
 
1889
/*
1890
 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
1891
 * ancestor to the specified cpuset.  Call holding callback_mutex.
1892
 * If no ancestor is mem_exclusive (an unusual configuration), then
1893
 * returns the root cpuset.
1894
 */
1895
static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1896
{
1897
        while (!is_mem_exclusive(cs) && cs->parent)
1898
                cs = cs->parent;
1899
        return cs;
1900
}
1901
 
1902
/**
1903
 * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
1904
 * @z: is this zone on an allowed node?
1905
 * @gfp_mask: memory allocation flags
1906
 *
1907
 * If we're in interrupt, yes, we can always allocate.  If
1908
 * __GFP_THISNODE is set, yes, we can always allocate.  If zone
1909
 * z's node is in our task's mems_allowed, yes.  If it's not a
1910
 * __GFP_HARDWALL request and this zone's node is in the nearest
1911
 * mem_exclusive cpuset ancestor to this task's cpuset, yes.
1912
 * If the task has been OOM killed and has access to memory reserves
1913
 * as specified by the TIF_MEMDIE flag, yes.
1914
 * Otherwise, no.
1915
 *
1916
 * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
1917
 * reduces to cpuset_zone_allowed_hardwall().  Otherwise,
1918
 * cpuset_zone_allowed_softwall() might sleep, and might allow a zone
1919
 * from an enclosing cpuset.
1920
 *
1921
 * cpuset_zone_allowed_hardwall() only handles the simpler case of
1922
 * hardwall cpusets, and never sleeps.
1923
 *
1924
 * The __GFP_THISNODE placement logic is really handled elsewhere,
1925
 * by forcibly using a zonelist starting at a specified node, and by
1926
 * (in get_page_from_freelist()) refusing to consider the zones for
1927
 * any node on the zonelist except the first.  By the time any such
1928
 * calls get to this routine, we should just shut up and say 'yes'.
1929
 *
1930
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
1931
 * and do not allow allocations outside the current task's cpuset
1932
 * unless the task has been OOM killed and is marked TIF_MEMDIE.
1933
 * GFP_KERNEL allocations are not so marked, so can escape to the
1934
 * nearest enclosing mem_exclusive ancestor cpuset.
1935
 *
1936
 * Scanning up parent cpusets requires callback_mutex.  The
1937
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
1938
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
1939
 * current tasks mems_allowed came up empty on the first pass over
1940
 * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
1941
 * cpuset are short of memory, might require taking the callback_mutex
1942
 * mutex.
1943
 *
1944
 * The first call here from mm/page_alloc:get_page_from_freelist()
1945
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
1946
 * so no allocation on a node outside the cpuset is allowed (unless
1947
 * in interrupt, of course).
1948
 *
1949
 * The second pass through get_page_from_freelist() doesn't even call
1950
 * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
1951
 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
1952
 * in alloc_flags.  That logic and the checks below have the combined
1953
 * effect that:
1954
 *      in_interrupt - any node ok (current task context irrelevant)
1955
 *      GFP_ATOMIC   - any node ok
1956
 *      TIF_MEMDIE   - any node ok
1957
 *      GFP_KERNEL   - any node in enclosing mem_exclusive cpuset ok
1958
 *      GFP_USER     - only nodes in current tasks mems allowed ok.
1959
 *
1960
 * Rule:
1961
 *    Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you
1962
 *    pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
1963
 *    the code that might scan up ancestor cpusets and sleep.
1964
 */
1965
 
1966
int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
1967
{
1968
        int node;                       /* node that zone z is on */
1969
        const struct cpuset *cs;        /* current cpuset ancestors */
1970
        int allowed;                    /* is allocation in zone z allowed? */
1971
 
1972
        if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
1973
                return 1;
1974
        node = zone_to_nid(z);
1975
        might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
1976
        if (node_isset(node, current->mems_allowed))
1977
                return 1;
1978
        /*
1979
         * Allow tasks that have access to memory reserves because they have
1980
         * been OOM killed to get memory anywhere.
1981
         */
1982
        if (unlikely(test_thread_flag(TIF_MEMDIE)))
1983
                return 1;
1984
        if (gfp_mask & __GFP_HARDWALL)  /* If hardwall request, stop here */
1985
                return 0;
1986
 
1987
        if (current->flags & PF_EXITING) /* Let dying task have memory */
1988
                return 1;
1989
 
1990
        /* Not hardwall and node outside mems_allowed: scan up cpusets */
1991
        mutex_lock(&callback_mutex);
1992
 
1993
        task_lock(current);
1994
        cs = nearest_exclusive_ancestor(task_cs(current));
1995
        task_unlock(current);
1996
 
1997
        allowed = node_isset(node, cs->mems_allowed);
1998
        mutex_unlock(&callback_mutex);
1999
        return allowed;
2000
}
2001
 
2002
/*
2003
 * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
2004
 * @z: is this zone on an allowed node?
2005
 * @gfp_mask: memory allocation flags
2006
 *
2007
 * If we're in interrupt, yes, we can always allocate.
2008
 * If __GFP_THISNODE is set, yes, we can always allocate.  If zone
2009
 * z's node is in our tasks mems_allowed, yes.   If the task has been
2010
 * OOM killed and has access to memory reserves as specified by the
2011
 * TIF_MEMDIE flag, yes.  Otherwise, no.
2012
 *
2013
 * The __GFP_THISNODE placement logic is really handled elsewhere,
2014
 * by forcibly using a zonelist starting at a specified node, and by
2015
 * (in get_page_from_freelist()) refusing to consider the zones for
2016
 * any node on the zonelist except the first.  By the time any such
2017
 * calls get to this routine, we should just shut up and say 'yes'.
2018
 *
2019
 * Unlike the cpuset_zone_allowed_softwall() variant, above,
2020
 * this variant requires that the zone be in the current tasks
2021
 * mems_allowed or that we're in interrupt.  It does not scan up the
2022
 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
2023
 * It never sleeps.
2024
 */
2025
 
2026
int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
2027
{
2028
        int node;                       /* node that zone z is on */
2029
 
2030
        if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2031
                return 1;
2032
        node = zone_to_nid(z);
2033
        if (node_isset(node, current->mems_allowed))
2034
                return 1;
2035
        /*
2036
         * Allow tasks that have access to memory reserves because they have
2037
         * been OOM killed to get memory anywhere.
2038
         */
2039
        if (unlikely(test_thread_flag(TIF_MEMDIE)))
2040
                return 1;
2041
        return 0;
2042
}
2043
 
2044
/**
2045
 * cpuset_lock - lock out any changes to cpuset structures
2046
 *
2047
 * The out of memory (oom) code needs to lock cpusets against
2048
 * being changed while it scans the tasklist looking for a
2049
 * task in an overlapping cpuset.  Expose callback_mutex via this
2050
 * cpuset_lock() routine, so the oom code can lock it, before
2051
 * locking the task list.  The tasklist_lock is a spinlock, so
2052
 * must be taken inside callback_mutex.
2053
 */
2054
 
2055
void cpuset_lock(void)
2056
{
2057
        mutex_lock(&callback_mutex);
2058
}
2059
 
2060
/**
2061
 * cpuset_unlock - release lock on cpuset changes
2062
 *
2063
 * Undo the lock taken in a previous cpuset_lock() call.
2064
 */
2065
 
2066
void cpuset_unlock(void)
2067
{
2068
        mutex_unlock(&callback_mutex);
2069
}
2070
 
2071
/**
2072
 * cpuset_mem_spread_node() - On which node to begin search for a page
2073
 *
2074
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
2075
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
2076
 * and if the memory allocation used cpuset_mem_spread_node()
2077
 * to determine on which node to start looking, as it will for
2078
 * certain page cache or slab cache pages such as those used for file
2080
 * system buffers and inode caches, then instead of starting the
2081
 * search for a free page on the local node, the starting node is
2082
 * spread around the task's mems_allowed nodes.
2082
 *
2083
 * We don't have to worry about the returned node being offline
2084
 * because "it can't happen", and even if it did, it would be ok.
2085
 *
2086
 * The routines calling guarantee_online_mems() are careful to
2087
 * only set nodes in task->mems_allowed that are online.  So it
2088
 * should not be possible for the following code to return an
2089
 * offline node.  But if it did, that would be ok, as this routine
2090
 * is not returning the node where the allocation must be, only
2091
 * the node where the search should start.  The zonelist passed to
2092
 * __alloc_pages() will include all nodes.  If the slab allocator
2093
 * is passed an offline node, it will fall back to the local node.
2094
 * See kmem_cache_alloc_node().
2095
 */
2096
 
2097
int cpuset_mem_spread_node(void)
2098
{
2099
        int node;
2100
 
2101
        node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
2102
        if (node == MAX_NUMNODES)
2103
                node = first_node(current->mems_allowed);
2104
        current->cpuset_mem_spread_rotor = node;
2105
        return node;
2106
}
2107
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
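
Editorial note: a stand-alone sketch, not kernel code, of the rotor idea behind cpuset_mem_spread_node(): a per-task cursor advances round-robin over the allowed nodes so successive allocations start their search on different nodes. The node set and helper below are illustrative assumptions.

/* Illustration of the spread rotor: return the next allowed node after the
 * cursor, wrapping to the first allowed node at the end, in the spirit of
 * the next_node()/first_node() pair used above.
 * Assumes at least one node is allowed. */
#include <stdio.h>

#define MAX_NODES 8

static int spread_rotor;                        /* stands in for cpuset_mem_spread_rotor */

static int mem_spread_node(const int allowed[MAX_NODES])
{
        int node;

        for (node = spread_rotor + 1; node < MAX_NODES; node++)
                if (allowed[node])
                        goto found;
        for (node = 0; node < MAX_NODES; node++)
                if (allowed[node])
                        goto found;
found:
        spread_rotor = node;
        return node;
}

int main(void)
{
        int allowed[MAX_NODES] = { [0] = 1, [2] = 1, [3] = 1 };  /* pretend nodes 0, 2, 3 are allowed */
        int i;

        for (i = 0; i < 6; i++)
                printf("allocation %d starts its search on node %d\n",
                       i, mem_spread_node(allowed));
        return 0;
}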
2108
 
2109
/**
2110
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
2111
 * @tsk1: pointer to task_struct of some task.
2112
 * @tsk2: pointer to task_struct of some other task.
2113
 *
2114
 * Description: Return true if @tsk1's mems_allowed intersects the
2115
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
2116
 * one of the task's memory usage might impact the memory available
2117
 * to the other.
2118
 **/
2119
 
2120
int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2121
                                   const struct task_struct *tsk2)
2122
{
2123
        return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2124
}
2125
 
2126
/*
2127
 * Collection of memory_pressure is suppressed unless
2128
 * this flag is enabled by writing "1" to the special
2129
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
2130
 */
2131
 
2132
int cpuset_memory_pressure_enabled __read_mostly;
2133
 
2134
/**
2135
 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
2136
 *
2137
 * Keep a running average of the rate of synchronous (direct)
2138
 * page reclaim efforts initiated by tasks in each cpuset.
2139
 *
2140
 * This represents the rate at which some task in the cpuset
2141
 * ran low on memory on all nodes it was allowed to use, and
2142
 * had to enter the kernel's page reclaim code in an effort to
2143
 * create more free memory by tossing clean pages or swapping
2144
 * or writing dirty pages.
2145
 *
2146
 * Display to user space in the per-cpuset read-only file
2147
 * "memory_pressure".  Value displayed is an integer
2148
 * representing the recent rate of entry into the synchronous
2149
 * (direct) page reclaim by any task attached to the cpuset.
2150
 **/
2151
 
2152
void __cpuset_memory_pressure_bump(void)
2153
{
2154
        task_lock(current);
2155
        fmeter_markevent(&task_cs(current)->fmeter);
2156
        task_unlock(current);
2157
}
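
Editorial note: a hypothetical user-space sketch of how this statistic might be consumed: enable collection through the root-only memory_pressure_enabled file, then read a cpuset's memory_pressure file (the value printed by fmeter_getrate() above). The /dev/cpuset mount point and the "mygroup" cpuset are assumptions.

/* Hypothetical example: enable and read per-cpuset memory pressure. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[64];
        ssize_t n;
        int fd;

        /* memory_pressure_enabled exists only in the root cpuset */
        fd = open("/dev/cpuset/memory_pressure_enabled", O_WRONLY);
        if (fd >= 0) {
                if (write(fd, "1", 1) < 0)
                        perror("enable");
                close(fd);
        }

        /* recent direct-reclaim rate for tasks in this cpuset */
        fd = open("/dev/cpuset/mygroup/memory_pressure", O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        n = read(fd, buf, sizeof(buf) - 1);
        if (n >= 0) {
                buf[n] = '\0';
                printf("memory_pressure: %s", buf);
        }
        close(fd);
        return 0;
}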
2158
 
2159
#ifdef CONFIG_PROC_PID_CPUSET
2160
/*
2161
 * proc_cpuset_show()
2162
 *  - Print tasks cpuset path into seq_file.
2163
 *  - Used for /proc/<pid>/cpuset.
2164
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2165
 *    doesn't really matter if tsk->cpuset changes after we read it,
2166
 *    and we take manage_mutex, keeping attach_task() from changing it
2167
 *    anyway.  No need to check that tsk->cpuset != NULL, thanks to
2168
 *    the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks
2169
 *    cpuset to top_cpuset.
2170
 */
2171
static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2172
{
2173
        struct pid *pid;
2174
        struct task_struct *tsk;
2175
        char *buf;
2176
        struct cgroup_subsys_state *css;
2177
        int retval;
2178
 
2179
        retval = -ENOMEM;
2180
        buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2181
        if (!buf)
2182
                goto out;
2183
 
2184
        retval = -ESRCH;
2185
        pid = m->private;
2186
        tsk = get_pid_task(pid, PIDTYPE_PID);
2187
        if (!tsk)
2188
                goto out_free;
2189
 
2190
        retval = -EINVAL;
2191
        cgroup_lock();
2192
        css = task_subsys_state(tsk, cpuset_subsys_id);
2193
        retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2194
        if (retval < 0)
2195
                goto out_unlock;
2196
        seq_puts(m, buf);
2197
        seq_putc(m, '\n');
2198
out_unlock:
2199
        cgroup_unlock();
2200
        put_task_struct(tsk);
2201
out_free:
2202
        kfree(buf);
2203
out:
2204
        return retval;
2205
}
2206
 
2207
static int cpuset_open(struct inode *inode, struct file *file)
2208
{
2209
        struct pid *pid = PROC_I(inode)->pid;
2210
        return single_open(file, proc_cpuset_show, pid);
2211
}
2212
 
2213
const struct file_operations proc_cpuset_operations = {
2214
        .open           = cpuset_open,
2215
        .read           = seq_read,
2216
        .llseek         = seq_lseek,
2217
        .release        = single_release,
2218
};
2219
#endif /* CONFIG_PROC_PID_CPUSET */
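
Editorial note: a minimal user-space sketch (not part of cpuset.c) that prints the path produced by proc_cpuset_show() for the current task; /proc/<pid>/cpuset is only present when CONFIG_PROC_PID_CPUSET is enabled.

/* Hypothetical example: show which cpuset the current task is attached to. */
#include <stdio.h>

int main(void)
{
        char line[4096];
        FILE *f = fopen("/proc/self/cpuset", "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        if (fgets(line, sizeof(line), f))
                fputs(line, stdout);            /* e.g. "/" or "/mygroup" */
        fclose(f);
        return 0;
}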
2220
 
2221
/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
2222
char *cpuset_task_status_allowed(struct task_struct *task, char *buffer)
2223
{
2224
        buffer += sprintf(buffer, "Cpus_allowed:\t");
2225
        buffer += cpumask_scnprintf(buffer, PAGE_SIZE, task->cpus_allowed);
2226
        buffer += sprintf(buffer, "\n");
2227
        buffer += sprintf(buffer, "Mems_allowed:\t");
2228
        buffer += nodemask_scnprintf(buffer, PAGE_SIZE, task->mems_allowed);
2229
        buffer += sprintf(buffer, "\n");
2230
        return buffer;
2231
}
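
Editorial note: a minimal user-space sketch showing the two lines that cpuset_task_status_allowed() contributes to /proc/<pid>/status for the current task.

/* Hypothetical example: print the Cpus_allowed and Mems_allowed status lines. */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/self/status", "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                if (!strncmp(line, "Cpus_allowed", 12) ||
                    !strncmp(line, "Mems_allowed", 12))
                        fputs(line, stdout);
        fclose(f);
        return 0;
}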
