test_project/trunk/linux_sd_driver/mm/mempolicy.c - Blame information for rev 82


Line No. Rev Author Line
1 62 marcus.erl
/*
2
 * Simple NUMA memory policy for the Linux kernel.
3
 *
4
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6
 * Subject to the GNU Public License, version 2.
7
 *
8
 * NUMA policy allows the user to give hints in which node(s) memory should
9
 * be allocated.
10
 *
11
 * Support four policies per VMA and per process:
12
 *
13
 * The VMA policy has priority over the process policy for a page fault.
14
 *
15
 * interleave     Allocate memory interleaved over a set of nodes,
16
 *                with normal fallback if it fails.
17
 *                For VMA based allocations this interleaves based on the
18
 *                offset into the backing object or offset into the mapping
19
 *                for anonymous memory. For process policy a process counter
20
 *                is used.
21
 *
22
 * bind           Only allocate memory on a specific set of nodes,
23
 *                no fallback.
24
 *                FIXME: memory is allocated starting with the first node
25
 *                to the last. It would be better if bind would truly restrict
26
 *                the allocation to memory nodes instead
27
 *
28
 * preferred       Try a specific node first before normal fallback.
29
 *                As a special case node -1 here means do the allocation
30
 *                on the local CPU. This is normally identical to default,
31
 *                but useful to set in a VMA when you have a non default
32
 *                process policy.
33
 *
34
 * default        Allocate on the local node first, or when on a VMA
35
 *                use the process policy. This is what Linux always did
36
 *                in a NUMA aware kernel and still does by, ahem, default.
37
 *
38
 * The process policy is applied for most non-interrupt memory allocations
39
 * in that process' context. Interrupts ignore the policies and always
40
 * try to allocate on the local CPU. The VMA policy is only applied for memory
41
 * allocations for a VMA in the VM.
42
 *
43
 * Currently there are a few corner cases in swapping where the policy
44
 * is not applied, but the majority should be handled. When process policy
45
 * is used it is not remembered over swap outs/swap ins.
46
 *
47
 * Only the highest zone in the zone hierarchy gets policied. Allocations
48
 * requesting a lower zone just use default policy. This implies that
49
 * on systems with highmem, kernel lowmem allocations don't get policied.
50
 * Same with GFP_DMA allocations.
51
 *
52
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53
 * all users and remembered even when nobody has memory mapped.
54
 */
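
/*
 * Illustrative sketch (not part of this file): how user space could request
 * the policies described above.  It assumes the MPOL_* constants and the
 * set_mempolicy()/mbind() wrappers from libnuma's <numaif.h>; the node
 * numbers and mapping size below are made up, and error handling is omitted.
 */
#if 0
#include <numaif.h>
#include <sys/mman.h>

static void example_policy_usage(void)
{
        unsigned long nodes = (1UL << 0) | (1UL << 1);  /* hypothetical nodes 0 and 1 */
        size_t len = 1UL << 20;
        void *buf;

        /* Process policy: interleave this task's future allocations over nodes 0-1. */
        set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8);

        /* VMA policy: bind one mapping to the same nodes; it overrides the above. */
        buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        mbind(buf, len, MPOL_BIND, &nodes, sizeof(nodes) * 8, 0);

        /* Back to the default local-allocation policy for the process. */
        set_mempolicy(MPOL_DEFAULT, NULL, 0);
}
#endif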
55
 
56
/* Notebook:
57
   fix mmap readahead to honour policy and enable policy for any page cache
58
   object
59
   statistics for bigpages
60
   global policy for page cache? currently it uses process policy. Requires
61
   first item above.
62
   handle mremap for shared memory (currently ignored for the policy)
63
   grows down?
64
   make bind policy root only? It can trigger oom much faster and the
65
   kernel does not always handle that gracefully.
66
   could replace all the switch()es with a mempolicy_ops structure.
67
*/
68
 
69
#include <linux/mempolicy.h>
70
#include <linux/mm.h>
71
#include <linux/highmem.h>
72
#include <linux/hugetlb.h>
73
#include <linux/kernel.h>
74
#include <linux/sched.h>
75
#include <linux/nodemask.h>
76
#include <linux/cpuset.h>
77
#include <linux/gfp.h>
78
#include <linux/slab.h>
79
#include <linux/string.h>
80
#include <linux/module.h>
81
#include <linux/nsproxy.h>
82
#include <linux/interrupt.h>
83
#include <linux/init.h>
84
#include <linux/compat.h>
85
#include <linux/swap.h>
86
#include <linux/seq_file.h>
87
#include <linux/proc_fs.h>
88
#include <linux/migrate.h>
89
#include <linux/rmap.h>
90
#include <linux/security.h>
91
#include <linux/syscalls.h>
92
 
93
#include <asm/tlbflush.h>
94
#include <asm/uaccess.h>
95
 
96
/* Internal flags */
97
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for contiguous vmas */
98
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
99
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
100
 
101
static struct kmem_cache *policy_cache;
102
static struct kmem_cache *sn_cache;
103
 
104
/* Highest zone. A specific allocation for a zone below that is not
105
   policied. */
106
enum zone_type policy_zone = 0;
107
 
108
struct mempolicy default_policy = {
109
        .refcnt = ATOMIC_INIT(1), /* never free it */
110
        .policy = MPOL_DEFAULT,
111
};
112
 
113
static void mpol_rebind_policy(struct mempolicy *pol,
114
                               const nodemask_t *newmask);
115
 
116
/* Do sanity checking on a policy */
117
static int mpol_check_policy(int mode, nodemask_t *nodes)
118
{
119
        int empty = nodes_empty(*nodes);
120
 
121
        switch (mode) {
122
        case MPOL_DEFAULT:
123
                if (!empty)
124
                        return -EINVAL;
125
                break;
126
        case MPOL_BIND:
127
        case MPOL_INTERLEAVE:
128
                /* Preferred will only use the first bit, but allow
129
                   more for now. */
130
                if (empty)
131
                        return -EINVAL;
132
                break;
133
        }
134
        return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL;
135
}
136
 
137
/* Generate a custom zonelist for the BIND policy. */
138
static struct zonelist *bind_zonelist(nodemask_t *nodes)
139
{
140
        struct zonelist *zl;
141
        int num, max, nd;
142
        enum zone_type k;
143
 
144
        max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
145
        max++;                  /* space for zlcache_ptr (see mmzone.h) */
146
        zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
147
        if (!zl)
148
                return ERR_PTR(-ENOMEM);
149
        zl->zlcache_ptr = NULL;
150
        num = 0;
151
        /* First put in the highest zones from all nodes, then all the next
152
           lower zones etc. Avoid empty zones because the memory allocator
153
           doesn't like them. If you implement node hot removal you
154
           have to fix that. */
155
        k = MAX_NR_ZONES - 1;
156
        while (1) {
157
                for_each_node_mask(nd, *nodes) {
158
                        struct zone *z = &NODE_DATA(nd)->node_zones[k];
159
                        if (z->present_pages > 0)
160
                                zl->zones[num++] = z;
161
                }
162
                if (k == 0)
163
                        break;
164
                k--;
165
        }
166
        if (num == 0) {
167
                kfree(zl);
168
                return ERR_PTR(-EINVAL);
169
        }
170
        zl->zones[num] = NULL;
171
        return zl;
172
}
173
 
174
/* Create a new policy */
175
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
176
{
177
        struct mempolicy *policy;
178
 
179
        pr_debug("setting mode %d nodes[0] %lx\n",
180
                 mode, nodes ? nodes_addr(*nodes)[0] : -1);
181
 
182
        if (mode == MPOL_DEFAULT)
183
                return NULL;
184
        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
185
        if (!policy)
186
                return ERR_PTR(-ENOMEM);
187
        atomic_set(&policy->refcnt, 1);
188
        switch (mode) {
189
        case MPOL_INTERLEAVE:
190
                policy->v.nodes = *nodes;
191
                nodes_and(policy->v.nodes, policy->v.nodes,
192
                                        node_states[N_HIGH_MEMORY]);
193
                if (nodes_weight(policy->v.nodes) == 0) {
194
                        kmem_cache_free(policy_cache, policy);
195
                        return ERR_PTR(-EINVAL);
196
                }
197
                break;
198
        case MPOL_PREFERRED:
199
                policy->v.preferred_node = first_node(*nodes);
200
                if (policy->v.preferred_node >= MAX_NUMNODES)
201
                        policy->v.preferred_node = -1;
202
                break;
203
        case MPOL_BIND:
204
                policy->v.zonelist = bind_zonelist(nodes);
205
                if (IS_ERR(policy->v.zonelist)) {
206
                        void *error_code = policy->v.zonelist;
207
                        kmem_cache_free(policy_cache, policy);
208
                        return error_code;
209
                }
210
                break;
211
        }
212
        policy->policy = mode;
213
        policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
214
        return policy;
215
}
216
 
217
static void gather_stats(struct page *, void *, int pte_dirty);
218
static void migrate_page_add(struct page *page, struct list_head *pagelist,
219
                                unsigned long flags);
220
 
221
/* Scan through pages checking if pages follow certain conditions. */
222
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
223
                unsigned long addr, unsigned long end,
224
                const nodemask_t *nodes, unsigned long flags,
225
                void *private)
226
{
227
        pte_t *orig_pte;
228
        pte_t *pte;
229
        spinlock_t *ptl;
230
 
231
        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
232
        do {
233
                struct page *page;
234
                int nid;
235
 
236
                if (!pte_present(*pte))
237
                        continue;
238
                page = vm_normal_page(vma, addr, *pte);
239
                if (!page)
240
                        continue;
241
                /*
242
                 * The check for PageReserved here is important to avoid
243
                 * handling zero pages and other pages that may have been
244
                 * marked special by the system.
245
                 *
246
                 * If PageReserved were not checked here then, for example,
247
                 * the location of the zero page could have an influence
248
                 * on MPOL_MF_STRICT, zero pages would be counted for
249
                 * the per node stats, and there would be useless attempts
250
                 * to put zero pages on the migration list.
251
                 */
252
                if (PageReserved(page))
253
                        continue;
254
                nid = page_to_nid(page);
255
                if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
256
                        continue;
257
 
258
                if (flags & MPOL_MF_STATS)
259
                        gather_stats(page, private, pte_dirty(*pte));
260
                else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
261
                        migrate_page_add(page, private, flags);
262
                else
263
                        break;
264
        } while (pte++, addr += PAGE_SIZE, addr != end);
265
        pte_unmap_unlock(orig_pte, ptl);
266
        return addr != end;
267
}
268
 
269
static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
270
                unsigned long addr, unsigned long end,
271
                const nodemask_t *nodes, unsigned long flags,
272
                void *private)
273
{
274
        pmd_t *pmd;
275
        unsigned long next;
276
 
277
        pmd = pmd_offset(pud, addr);
278
        do {
279
                next = pmd_addr_end(addr, end);
280
                if (pmd_none_or_clear_bad(pmd))
281
                        continue;
282
                if (check_pte_range(vma, pmd, addr, next, nodes,
283
                                    flags, private))
284
                        return -EIO;
285
        } while (pmd++, addr = next, addr != end);
286
        return 0;
287
}
288
 
289
static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
290
                unsigned long addr, unsigned long end,
291
                const nodemask_t *nodes, unsigned long flags,
292
                void *private)
293
{
294
        pud_t *pud;
295
        unsigned long next;
296
 
297
        pud = pud_offset(pgd, addr);
298
        do {
299
                next = pud_addr_end(addr, end);
300
                if (pud_none_or_clear_bad(pud))
301
                        continue;
302
                if (check_pmd_range(vma, pud, addr, next, nodes,
303
                                    flags, private))
304
                        return -EIO;
305
        } while (pud++, addr = next, addr != end);
306
        return 0;
307
}
308
 
309
static inline int check_pgd_range(struct vm_area_struct *vma,
310
                unsigned long addr, unsigned long end,
311
                const nodemask_t *nodes, unsigned long flags,
312
                void *private)
313
{
314
        pgd_t *pgd;
315
        unsigned long next;
316
 
317
        pgd = pgd_offset(vma->vm_mm, addr);
318
        do {
319
                next = pgd_addr_end(addr, end);
320
                if (pgd_none_or_clear_bad(pgd))
321
                        continue;
322
                if (check_pud_range(vma, pgd, addr, next, nodes,
323
                                    flags, private))
324
                        return -EIO;
325
        } while (pgd++, addr = next, addr != end);
326
        return 0;
327
}
328
 
329
/*
330
 * Check if all pages in a range are on a set of nodes.
331
 * If pagelist != NULL then isolate pages from the LRU and
332
 * put them on the pagelist.
333
 */
334
static struct vm_area_struct *
335
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
336
                const nodemask_t *nodes, unsigned long flags, void *private)
337
{
338
        int err;
339
        struct vm_area_struct *first, *vma, *prev;
340
 
341
        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
342
 
343
                err = migrate_prep();
344
                if (err)
345
                        return ERR_PTR(err);
346
        }
347
 
348
        first = find_vma(mm, start);
349
        if (!first)
350
                return ERR_PTR(-EFAULT);
351
        prev = NULL;
352
        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
353
                if (!(flags & MPOL_MF_DISCONTIG_OK)) {
354
                        if (!vma->vm_next && vma->vm_end < end)
355
                                return ERR_PTR(-EFAULT);
356
                        if (prev && prev->vm_end < vma->vm_start)
357
                                return ERR_PTR(-EFAULT);
358
                }
359
                if (!is_vm_hugetlb_page(vma) &&
360
                    ((flags & MPOL_MF_STRICT) ||
361
                     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
362
                                vma_migratable(vma)))) {
363
                        unsigned long endvma = vma->vm_end;
364
 
365
                        if (endvma > end)
366
                                endvma = end;
367
                        if (vma->vm_start > start)
368
                                start = vma->vm_start;
369
                        err = check_pgd_range(vma, start, endvma, nodes,
370
                                                flags, private);
371
                        if (err) {
372
                                first = ERR_PTR(err);
373
                                break;
374
                        }
375
                }
376
                prev = vma;
377
        }
378
        return first;
379
}
380
 
381
/* Apply policy to a single VMA */
382
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
383
{
384
        int err = 0;
385
        struct mempolicy *old = vma->vm_policy;
386
 
387
        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
388
                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
389
                 vma->vm_ops, vma->vm_file,
390
                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
391
 
392
        if (vma->vm_ops && vma->vm_ops->set_policy)
393
                err = vma->vm_ops->set_policy(vma, new);
394
        if (!err) {
395
                mpol_get(new);
396
                vma->vm_policy = new;
397
                mpol_free(old);
398
        }
399
        return err;
400
}
401
 
402
/* Step 2: apply policy to a range and do splits. */
403
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
404
                       unsigned long end, struct mempolicy *new)
405
{
406
        struct vm_area_struct *next;
407
        int err;
408
 
409
        err = 0;
410
        for (; vma && vma->vm_start < end; vma = next) {
411
                next = vma->vm_next;
412
                if (vma->vm_start < start)
413
                        err = split_vma(vma->vm_mm, vma, start, 1);
414
                if (!err && vma->vm_end > end)
415
                        err = split_vma(vma->vm_mm, vma, end, 0);
416
                if (!err)
417
                        err = policy_vma(vma, new);
418
                if (err)
419
                        break;
420
        }
421
        return err;
422
}
423
 
424
static int contextualize_policy(int mode, nodemask_t *nodes)
425
{
426
        if (!nodes)
427
                return 0;
428
 
429
        cpuset_update_task_memory_state();
430
        if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
431
                return -EINVAL;
432
        return mpol_check_policy(mode, nodes);
433
}
434
 
435
 
436
/*
437
 * Update task->flags PF_MEMPOLICY bit: set iff non-default
438
 * mempolicy.  Allows more rapid checking of this (combined perhaps
439
 * with other PF_* flag bits) on memory allocation hot code paths.
440
 *
441
 * If called from outside this file, the task 'p' should -only- be
442
 * a newly forked child not yet visible on the task list, because
443
 * manipulating the task flags of a visible task is not safe.
444
 *
445
 * The above limitation is why this routine has the funny name
446
 * mpol_fix_fork_child_flag().
447
 *
448
 * It is also safe to call this with a task pointer of current,
449
 * which the static wrapper mpol_set_task_struct_flag() does,
450
 * for use within this file.
451
 */
452
 
453
void mpol_fix_fork_child_flag(struct task_struct *p)
454
{
455
        if (p->mempolicy)
456
                p->flags |= PF_MEMPOLICY;
457
        else
458
                p->flags &= ~PF_MEMPOLICY;
459
}
460
 
461
static void mpol_set_task_struct_flag(void)
462
{
463
        mpol_fix_fork_child_flag(current);
464
}
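
/*
 * Illustrative sketch (not part of this file): the reason for PF_MEMPOLICY is
 * that an allocation hot path can test a single task flag instead of always
 * dereferencing current->mempolicy.  A hypothetical caller might look like
 * the helper below.
 */
#if 0
static inline int example_wants_policy_alloc(gfp_t gfp)
{
        /* Cheap flag test first; only consult the policy when one is set. */
        return unlikely(current->flags & PF_MEMPOLICY) &&
               !in_interrupt() && !(gfp & __GFP_THISNODE);
}
#endif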
465
 
466
/* Set the process memory policy */
467
static long do_set_mempolicy(int mode, nodemask_t *nodes)
468
{
469
        struct mempolicy *new;
470
 
471
        if (contextualize_policy(mode, nodes))
472
                return -EINVAL;
473
        new = mpol_new(mode, nodes);
474
        if (IS_ERR(new))
475
                return PTR_ERR(new);
476
        mpol_free(current->mempolicy);
477
        current->mempolicy = new;
478
        mpol_set_task_struct_flag();
479
        if (new && new->policy == MPOL_INTERLEAVE)
480
                current->il_next = first_node(new->v.nodes);
481
        return 0;
482
}
483
 
484
/* Fill a zone bitmap for a policy */
485
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
486
{
487
        int i;
488
 
489
        nodes_clear(*nodes);
490
        switch (p->policy) {
491
        case MPOL_BIND:
492
                for (i = 0; p->v.zonelist->zones[i]; i++)
493
                        node_set(zone_to_nid(p->v.zonelist->zones[i]),
494
                                *nodes);
495
                break;
496
        case MPOL_DEFAULT:
497
                break;
498
        case MPOL_INTERLEAVE:
499
                *nodes = p->v.nodes;
500
                break;
501
        case MPOL_PREFERRED:
502
                /* or use current node instead of memory_map? */
503
                if (p->v.preferred_node < 0)
504
                        *nodes = node_states[N_HIGH_MEMORY];
505
                else
506
                        node_set(p->v.preferred_node, *nodes);
507
                break;
508
        default:
509
                BUG();
510
        }
511
}
512
 
513
static int lookup_node(struct mm_struct *mm, unsigned long addr)
514
{
515
        struct page *p;
516
        int err;
517
 
518
        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
519
        if (err >= 0) {
520
                err = page_to_nid(p);
521
                put_page(p);
522
        }
523
        return err;
524
}
525
 
526
/* Retrieve NUMA policy */
527
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
528
                             unsigned long addr, unsigned long flags)
529
{
530
        int err;
531
        struct mm_struct *mm = current->mm;
532
        struct vm_area_struct *vma = NULL;
533
        struct mempolicy *pol = current->mempolicy;
534
 
535
        cpuset_update_task_memory_state();
536
        if (flags &
537
                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
538
                return -EINVAL;
539
 
540
        if (flags & MPOL_F_MEMS_ALLOWED) {
541
                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
542
                        return -EINVAL;
543
                *policy = 0;     /* just so it's initialized */
544
                *nmask  = cpuset_current_mems_allowed;
545
                return 0;
546
        }
547
 
548
        if (flags & MPOL_F_ADDR) {
549
                down_read(&mm->mmap_sem);
550
                vma = find_vma_intersection(mm, addr, addr+1);
551
                if (!vma) {
552
                        up_read(&mm->mmap_sem);
553
                        return -EFAULT;
554
                }
555
                if (vma->vm_ops && vma->vm_ops->get_policy)
556
                        pol = vma->vm_ops->get_policy(vma, addr);
557
                else
558
                        pol = vma->vm_policy;
559
        } else if (addr)
560
                return -EINVAL;
561
 
562
        if (!pol)
563
                pol = &default_policy;
564
 
565
        if (flags & MPOL_F_NODE) {
566
                if (flags & MPOL_F_ADDR) {
567
                        err = lookup_node(mm, addr);
568
                        if (err < 0)
569
                                goto out;
570
                        *policy = err;
571
                } else if (pol == current->mempolicy &&
572
                                pol->policy == MPOL_INTERLEAVE) {
573
                        *policy = current->il_next;
574
                } else {
575
                        err = -EINVAL;
576
                        goto out;
577
                }
578
        } else
579
                *policy = pol->policy;
580
 
581
        if (vma) {
582
                up_read(&current->mm->mmap_sem);
583
                vma = NULL;
584
        }
585
 
586
        err = 0;
587
        if (nmask)
588
                get_zonemask(pol, nmask);
589
 
590
 out:
591
        if (vma)
592
                up_read(&current->mm->mmap_sem);
593
        return err;
594
}
595
 
596
#ifdef CONFIG_MIGRATION
597
/*
598
 * page migration
599
 */
600
static void migrate_page_add(struct page *page, struct list_head *pagelist,
601
                                unsigned long flags)
602
{
603
        /*
604
         * Avoid migrating a page that is shared with others.
605
         */
606
        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
607
                isolate_lru_page(page, pagelist);
608
}
609
 
610
static struct page *new_node_page(struct page *page, unsigned long node, int **x)
611
{
612
        return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
613
}
614
 
615
/*
616
 * Migrate pages from one node to a target node.
617
 * Returns error or the number of pages not migrated.
618
 */
619
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
620
                           int flags)
621
{
622
        nodemask_t nmask;
623
        LIST_HEAD(pagelist);
624
        int err = 0;
625
 
626
        nodes_clear(nmask);
627
        node_set(source, nmask);
628
 
629
        check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
630
                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
631
 
632
        if (!list_empty(&pagelist))
633
                err = migrate_pages(&pagelist, new_node_page, dest);
634
 
635
        return err;
636
}
637
 
638
/*
639
 * Move pages between the two nodesets so as to preserve the physical
640
 * layout as much as possible.
641
 *
642
 * Returns the number of pages that could not be moved.
643
 */
644
int do_migrate_pages(struct mm_struct *mm,
645
        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
646
{
647
        LIST_HEAD(pagelist);
648
        int busy = 0;
649
        int err = 0;
650
        nodemask_t tmp;
651
 
652
        down_read(&mm->mmap_sem);
653
 
654
        err = migrate_vmas(mm, from_nodes, to_nodes, flags);
655
        if (err)
656
                goto out;
657
 
658
/*
659
 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
660
 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
661
 * bit in 'tmp', and return that <source, dest> pair for migration.
662
 * The pair of nodemasks 'to' and 'from' define the map.
663
 *
664
 * If no pair of bits is found that way, fallback to picking some
665
 * pair of 'source' and 'dest' bits that are not the same.  If the
666
 * 'source' and 'dest' bits are the same, this represents a node
667
 * that will be migrating to itself, so no pages need move.
668
 *
669
 * If no bits are left in 'tmp', or if all remaining bits left
670
 * in 'tmp' correspond to the same bit in 'to', return false
671
 * (nothing left to migrate).
672
 *
673
 * This lets us pick a pair of nodes to migrate between, such that
674
 * if possible the dest node is not already occupied by some other
675
 * source node, minimizing the risk of overloading the memory on a
676
 * node that would happen if we migrated incoming memory to a node
677
 * before migrating outgoing memory sourced from that same node.
678
 *
679
 * A single scan of tmp is sufficient.  As we go, we remember the
680
 * most recent <s, d> pair that moved (s != d).  If we find a pair
681
 * that not only moved, but what's better, moved to an empty slot
682
 * (d is not set in tmp), then we break out then, with that pair.
683
 * Otherwise when we finish scanning tmp, we at least have the
684
 * most recent <s, d> pair that moved.  If we get all the way through
685
 * the scan of tmp without finding any node that moved, much less
686
 * moved to an empty node, then there is nothing left worth migrating.
687
 */
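
/*
 * Worked example with hypothetical node numbers: from_nodes = {0,1} and
 * to_nodes = {1,2}, so node_remap() maps 0 -> 1 and 1 -> 2.  On the first
 * pass over tmp = {0,1}, <0,1> moves but its dest is still a source, while
 * <1,2> moves to an empty slot, so 1 -> 2 is migrated first and node 1 is
 * cleared from tmp.  The second pass over tmp = {0} picks <0,1>, which is
 * now safe because node 1 has already been drained.
 */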
688
 
689
        tmp = *from_nodes;
690
        while (!nodes_empty(tmp)) {
691
                int s,d;
692
                int source = -1;
693
                int dest = 0;
694
 
695
                for_each_node_mask(s, tmp) {
696
                        d = node_remap(s, *from_nodes, *to_nodes);
697
                        if (s == d)
698
                                continue;
699
 
700
                        source = s;     /* Node moved. Memorize */
701
                        dest = d;
702
 
703
                        /* dest not in remaining from nodes? */
704
                        if (!node_isset(dest, tmp))
705
                                break;
706
                }
707
                if (source == -1)
708
                        break;
709
 
710
                node_clear(source, tmp);
711
                err = migrate_to_node(mm, source, dest, flags);
712
                if (err > 0)
713
                        busy += err;
714
                if (err < 0)
715
                        break;
716
        }
717
out:
718
        up_read(&mm->mmap_sem);
719
        if (err < 0)
720
                return err;
721
        return busy;
722
 
723
}
724
 
725
/*
726
 * Allocate a new page for page migration based on vma policy.
727
 * Start assuming that page is mapped by vma pointed to by @private.
728
 * Search forward from there, if not.  N.B., this assumes that the
729
 * list of pages handed to migrate_pages()--which is how we get here--
730
 * is in virtual address order.
731
 */
732
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
733
{
734
        struct vm_area_struct *vma = (struct vm_area_struct *)private;
735
        unsigned long uninitialized_var(address);
736
 
737
        while (vma) {
738
                address = page_address_in_vma(page, vma);
739
                if (address != -EFAULT)
740
                        break;
741
                vma = vma->vm_next;
742
        }
743
 
744
        /*
745
         * if !vma, alloc_page_vma() will use task or system default policy
746
         */
747
        return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
748
}
749
#else
750
 
751
static void migrate_page_add(struct page *page, struct list_head *pagelist,
752
                                unsigned long flags)
753
{
754
}
755
 
756
int do_migrate_pages(struct mm_struct *mm,
757
        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
758
{
759
        return -ENOSYS;
760
}
761
 
762
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
763
{
764
        return NULL;
765
}
766
#endif
767
 
768
static long do_mbind(unsigned long start, unsigned long len,
769
                     unsigned long mode, nodemask_t *nmask,
770
                     unsigned long flags)
771
{
772
        struct vm_area_struct *vma;
773
        struct mm_struct *mm = current->mm;
774
        struct mempolicy *new;
775
        unsigned long end;
776
        int err;
777
        LIST_HEAD(pagelist);
778
 
779
        if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
780
                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
781
            || mode > MPOL_MAX)
782
                return -EINVAL;
783
        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
784
                return -EPERM;
785
 
786
        if (start & ~PAGE_MASK)
787
                return -EINVAL;
788
 
789
        if (mode == MPOL_DEFAULT)
790
                flags &= ~MPOL_MF_STRICT;
791
 
792
        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
793
        end = start + len;
794
 
795
        if (end < start)
796
                return -EINVAL;
797
        if (end == start)
798
                return 0;
799
 
800
        if (mpol_check_policy(mode, nmask))
801
                return -EINVAL;
802
 
803
        new = mpol_new(mode, nmask);
804
        if (IS_ERR(new))
805
                return PTR_ERR(new);
806
 
807
        /*
808
         * If we are using the default policy then operation
809
         * on discontinuous address spaces is okay after all
810
         */
811
        if (!new)
812
                flags |= MPOL_MF_DISCONTIG_OK;
813
 
814
        pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
815
                 mode, nmask ? nodes_addr(*nmask)[0] : -1);
816
 
817
        down_write(&mm->mmap_sem);
818
        vma = check_range(mm, start, end, nmask,
819
                          flags | MPOL_MF_INVERT, &pagelist);
820
 
821
        err = PTR_ERR(vma);
822
        if (!IS_ERR(vma)) {
823
                int nr_failed = 0;
824
 
825
                err = mbind_range(vma, start, end, new);
826
 
827
                if (!list_empty(&pagelist))
828
                        nr_failed = migrate_pages(&pagelist, new_vma_page,
829
                                                (unsigned long)vma);
830
 
831
                if (!err && nr_failed && (flags & MPOL_MF_STRICT))
832
                        err = -EIO;
833
        }
834
 
835
        up_write(&mm->mmap_sem);
836
        mpol_free(new);
837
        return err;
838
}
839
 
840
/*
841
 * User space interface with variable sized bitmaps for nodelists.
842
 */
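
/*
 * Illustrative sketch (not part of this file): the raw bitmap layout these
 * syscalls expect.  Nodes are bits in an array of unsigned longs and maxnode
 * is a bit count, not a count of longs.  The node number is made up and
 * <numaif.h> is assumed for the MPOL_* constants and wrappers.
 */
#if 0
#include <numaif.h>

static void example_raw_nodemask(void)
{
        unsigned long mask[2] = { 0, 0 };       /* room for 128 node bits */

        mask[0] |= 1UL << 3;                    /* request node 3 (hypothetical) */
        set_mempolicy(MPOL_BIND, mask, sizeof(mask) * 8);
}
#endif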
843
 
844
/* Copy a node mask from user space. */
845
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
846
                     unsigned long maxnode)
847
{
848
        unsigned long k;
849
        unsigned long nlongs;
850
        unsigned long endmask;
851
 
852
        --maxnode;
853
        nodes_clear(*nodes);
854
        if (maxnode == 0 || !nmask)
855
                return 0;
856
        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
857
                return -EINVAL;
858
 
859
        nlongs = BITS_TO_LONGS(maxnode);
860
        if ((maxnode % BITS_PER_LONG) == 0)
861
                endmask = ~0UL;
862
        else
863
                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
864
 
865
        /* When the user specified more nodes than supported just check
866
           if the unsupported part is all zero. */
867
        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
868
                if (nlongs > PAGE_SIZE/sizeof(long))
869
                        return -EINVAL;
870
                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
871
                        unsigned long t;
872
                        if (get_user(t, nmask + k))
873
                                return -EFAULT;
874
                        if (k == nlongs - 1) {
875
                                if (t & endmask)
876
                                        return -EINVAL;
877
                        } else if (t)
878
                                return -EINVAL;
879
                }
880
                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
881
                endmask = ~0UL;
882
        }
883
 
884
        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
885
                return -EFAULT;
886
        nodes_addr(*nodes)[nlongs-1] &= endmask;
887
        return 0;
888
}
889
 
890
/* Copy a kernel node mask to user space */
891
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
892
                              nodemask_t *nodes)
893
{
894
        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
895
        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
896
 
897
        if (copy > nbytes) {
898
                if (copy > PAGE_SIZE)
899
                        return -EINVAL;
900
                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
901
                        return -EFAULT;
902
                copy = nbytes;
903
        }
904
        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
905
}
906
 
907
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
908
                        unsigned long mode,
909
                        unsigned long __user *nmask, unsigned long maxnode,
910
                        unsigned flags)
911
{
912
        nodemask_t nodes;
913
        int err;
914
 
915
        err = get_nodes(&nodes, nmask, maxnode);
916
        if (err)
917
                return err;
918
#ifdef CONFIG_CPUSETS
919
        /* Restrict the nodes to the allowed nodes in the cpuset */
920
        nodes_and(nodes, nodes, current->mems_allowed);
921
#endif
922
        return do_mbind(start, len, mode, &nodes, flags);
923
}
924
 
925
/* Set the process memory policy */
926
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
927
                unsigned long maxnode)
928
{
929
        int err;
930
        nodemask_t nodes;
931
 
932
        if (mode < 0 || mode > MPOL_MAX)
933
                return -EINVAL;
934
        err = get_nodes(&nodes, nmask, maxnode);
935
        if (err)
936
                return err;
937
        return do_set_mempolicy(mode, &nodes);
938
}
939
 
940
asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
941
                const unsigned long __user *old_nodes,
942
                const unsigned long __user *new_nodes)
943
{
944
        struct mm_struct *mm;
945
        struct task_struct *task;
946
        nodemask_t old;
947
        nodemask_t new;
948
        nodemask_t task_nodes;
949
        int err;
950
 
951
        err = get_nodes(&old, old_nodes, maxnode);
952
        if (err)
953
                return err;
954
 
955
        err = get_nodes(&new, new_nodes, maxnode);
956
        if (err)
957
                return err;
958
 
959
        /* Find the mm_struct */
960
        read_lock(&tasklist_lock);
961
        task = pid ? find_task_by_vpid(pid) : current;
962
        if (!task) {
963
                read_unlock(&tasklist_lock);
964
                return -ESRCH;
965
        }
966
        mm = get_task_mm(task);
967
        read_unlock(&tasklist_lock);
968
 
969
        if (!mm)
970
                return -EINVAL;
971
 
972
        /*
973
         * Check if this process has the right to modify the specified
974
         * process. The right exists if the process has administrative
975
         * capabilities, superuser privileges or the same
976
         * userid as the target process.
977
         */
978
        if ((current->euid != task->suid) && (current->euid != task->uid) &&
979
            (current->uid != task->suid) && (current->uid != task->uid) &&
980
            !capable(CAP_SYS_NICE)) {
981
                err = -EPERM;
982
                goto out;
983
        }
984
 
985
        task_nodes = cpuset_mems_allowed(task);
986
        /* Is the user allowed to access the target nodes? */
987
        if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
988
                err = -EPERM;
989
                goto out;
990
        }
991
 
992
        if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
993
                err = -EINVAL;
994
                goto out;
995
        }
996
 
997
        err = security_task_movememory(task);
998
        if (err)
999
                goto out;
1000
 
1001
        err = do_migrate_pages(mm, &old, &new,
1002
                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1003
out:
1004
        mmput(mm);
1005
        return err;
1006
}
1007
 
1008
 
1009
/* Retrieve NUMA policy */
1010
asmlinkage long sys_get_mempolicy(int __user *policy,
1011
                                unsigned long __user *nmask,
1012
                                unsigned long maxnode,
1013
                                unsigned long addr, unsigned long flags)
1014
{
1015
        int err;
1016
        int uninitialized_var(pval);
1017
        nodemask_t nodes;
1018
 
1019
        if (nmask != NULL && maxnode < MAX_NUMNODES)
1020
                return -EINVAL;
1021
 
1022
        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1023
 
1024
        if (err)
1025
                return err;
1026
 
1027
        if (policy && put_user(pval, policy))
1028
                return -EFAULT;
1029
 
1030
        if (nmask)
1031
                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1032
 
1033
        return err;
1034
}
1035
 
1036
#ifdef CONFIG_COMPAT
1037
 
1038
asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1039
                                     compat_ulong_t __user *nmask,
1040
                                     compat_ulong_t maxnode,
1041
                                     compat_ulong_t addr, compat_ulong_t flags)
1042
{
1043
        long err;
1044
        unsigned long __user *nm = NULL;
1045
        unsigned long nr_bits, alloc_size;
1046
        DECLARE_BITMAP(bm, MAX_NUMNODES);
1047
 
1048
        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1049
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1050
 
1051
        if (nmask)
1052
                nm = compat_alloc_user_space(alloc_size);
1053
 
1054
        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1055
 
1056
        if (!err && nmask) {
1057
                err = copy_from_user(bm, nm, alloc_size);
1058
                /* ensure entire bitmap is zeroed */
1059
                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1060
                err |= compat_put_bitmap(nmask, bm, nr_bits);
1061
        }
1062
 
1063
        return err;
1064
}
1065
 
1066
asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1067
                                     compat_ulong_t maxnode)
1068
{
1069
        long err = 0;
1070
        unsigned long __user *nm = NULL;
1071
        unsigned long nr_bits, alloc_size;
1072
        DECLARE_BITMAP(bm, MAX_NUMNODES);
1073
 
1074
        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1075
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1076
 
1077
        if (nmask) {
1078
                err = compat_get_bitmap(bm, nmask, nr_bits);
1079
                nm = compat_alloc_user_space(alloc_size);
1080
                err |= copy_to_user(nm, bm, alloc_size);
1081
        }
1082
 
1083
        if (err)
1084
                return -EFAULT;
1085
 
1086
        return sys_set_mempolicy(mode, nm, nr_bits+1);
1087
}
1088
 
1089
asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1090
                             compat_ulong_t mode, compat_ulong_t __user *nmask,
1091
                             compat_ulong_t maxnode, compat_ulong_t flags)
1092
{
1093
        long err = 0;
1094
        unsigned long __user *nm = NULL;
1095
        unsigned long nr_bits, alloc_size;
1096
        nodemask_t bm;
1097
 
1098
        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1099
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1100
 
1101
        if (nmask) {
1102
                err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1103
                nm = compat_alloc_user_space(alloc_size);
1104
                err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1105
        }
1106
 
1107
        if (err)
1108
                return -EFAULT;
1109
 
1110
        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1111
}
1112
 
1113
#endif
1114
 
1115
/*
1116
 * get_vma_policy(@task, @vma, @addr)
1117
 * @task - task for fallback if vma policy == default
1118
 * @vma   - virtual memory area whose policy is sought
1119
 * @addr  - address in @vma for shared policy lookup
1120
 *
1121
 * Returns effective policy for a VMA at specified address.
1122
 * Falls back to @task or system default policy, as necessary.
1123
 * Returned policy has extra reference count if shared, vma,
1124
 * or some other task's policy [show_numa_maps() can pass
1125
 * @task != current].  It is the caller's responsibility to
1126
 * free the reference in these cases.
1127
 */
1128
static struct mempolicy * get_vma_policy(struct task_struct *task,
1129
                struct vm_area_struct *vma, unsigned long addr)
1130
{
1131
        struct mempolicy *pol = task->mempolicy;
1132
        int shared_pol = 0;
1133
 
1134
        if (vma) {
1135
                if (vma->vm_ops && vma->vm_ops->get_policy) {
1136
                        pol = vma->vm_ops->get_policy(vma, addr);
1137
                        shared_pol = 1; /* if pol non-NULL, add ref below */
1138
                } else if (vma->vm_policy &&
1139
                                vma->vm_policy->policy != MPOL_DEFAULT)
1140
                        pol = vma->vm_policy;
1141
        }
1142
        if (!pol)
1143
                pol = &default_policy;
1144
        else if (!shared_pol && pol != current->mempolicy)
1145
                mpol_get(pol);  /* vma or other task's policy */
1146
        return pol;
1147
}
1148
 
1149
/* Return a zonelist representing a mempolicy */
1150
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1151
{
1152
        int nd;
1153
 
1154
        switch (policy->policy) {
1155
        case MPOL_PREFERRED:
1156
                nd = policy->v.preferred_node;
1157
                if (nd < 0)
1158
                        nd = numa_node_id();
1159
                break;
1160
        case MPOL_BIND:
1161
                /* Lower zones don't get a policy applied */
1162
                /* Careful: current->mems_allowed might have moved */
1163
                if (gfp_zone(gfp) >= policy_zone)
1164
                        if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1165
                                return policy->v.zonelist;
1166
                /*FALL THROUGH*/
1167
        case MPOL_INTERLEAVE: /* should not happen */
1168
        case MPOL_DEFAULT:
1169
                nd = numa_node_id();
1170
                break;
1171
        default:
1172
                nd = 0;
1173
                BUG();
1174
        }
1175
        return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1176
}
1177
 
1178
/* Do dynamic interleaving for a process */
1179
static unsigned interleave_nodes(struct mempolicy *policy)
1180
{
1181
        unsigned nid, next;
1182
        struct task_struct *me = current;
1183
 
1184
        nid = me->il_next;
1185
        next = next_node(nid, policy->v.nodes);
1186
        if (next >= MAX_NUMNODES)
1187
                next = first_node(policy->v.nodes);
1188
        me->il_next = next;
1189
        return nid;
1190
}
1191
 
1192
/*
1193
 * Depending on the memory policy provide a node from which to allocate the
1194
 * next slab entry.
1195
 */
1196
unsigned slab_node(struct mempolicy *policy)
1197
{
1198
        int pol = policy ? policy->policy : MPOL_DEFAULT;
1199
 
1200
        switch (pol) {
1201
        case MPOL_INTERLEAVE:
1202
                return interleave_nodes(policy);
1203
 
1204
        case MPOL_BIND:
1205
                /*
1206
                 * Follow bind policy behavior and start allocation at the
1207
                 * first node.
1208
                 */
1209
                return zone_to_nid(policy->v.zonelist->zones[0]);
1210
 
1211
        case MPOL_PREFERRED:
1212
                if (policy->v.preferred_node >= 0)
1213
                        return policy->v.preferred_node;
1214
                /* Fall through */
1215
 
1216
        default:
1217
                return numa_node_id();
1218
        }
1219
}
1220
 
1221
/* Do static interleaving for a VMA with known offset. */
1222
static unsigned offset_il_node(struct mempolicy *pol,
1223
                struct vm_area_struct *vma, unsigned long off)
1224
{
1225
        unsigned nnodes = nodes_weight(pol->v.nodes);
1226
        unsigned target = (unsigned)off % nnodes;
1227
        int c;
1228
        int nid = -1;
1229
 
1230
        c = 0;
1231
        do {
1232
                nid = next_node(nid, pol->v.nodes);
1233
                c++;
1234
        } while (c <= target);
1235
        return nid;
1236
}
1237
 
1238
/* Determine a node number for interleave */
1239
static inline unsigned interleave_nid(struct mempolicy *pol,
1240
                 struct vm_area_struct *vma, unsigned long addr, int shift)
1241
{
1242
        if (vma) {
1243
                unsigned long off;
1244
 
1245
                /*
1246
                 * for small pages, there is no difference between
1247
                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1248
                 * for huge pages, since vm_pgoff is in units of small
1249
                 * pages, we need to shift off the always 0 bits to get
1250
                 * a useful offset.
1251
                 */
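                /*
                 * Worked example with made-up values: PAGE_SHIFT == 12 and
                 * shift == 21 (2MB huge pages).  A vma with vm_pgoff == 0x300
                 * starts 0x300 >> 9 == 1 huge page into the object, and an
                 * addr one huge page past vm_start adds one more, so the
                 * interleave offset computed below is 2.
                 */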
1252
                BUG_ON(shift < PAGE_SHIFT);
1253
                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1254
                off += (addr - vma->vm_start) >> shift;
1255
                return offset_il_node(pol, vma, off);
1256
        } else
1257
                return interleave_nodes(pol);
1258
}
1259
 
1260
#ifdef CONFIG_HUGETLBFS
1261
/*
1262
 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1263
 * @vma = virtual memory area whose policy is sought
1264
 * @addr = address in @vma for shared policy lookup and interleave policy
1265
 * @gfp_flags = for requested zone
1266
 * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
1267
 *
1268
 * Returns a zonelist suitable for a huge page allocation.
1269
 * If the effective policy is 'BIND, returns pointer to policy's zonelist.
1270
 * If it is also a policy for which get_vma_policy() returns an extra
1271
 * reference, we must hold that reference until after allocation.
1272
 * In that case, return policy via @mpol so hugetlb allocation can drop
1273
 * the reference.  For non-'BIND referenced policies, we can/do drop the
1274
 * reference here, so the caller doesn't need to know about the special case
1275
 * for default and current task policy.
1276
 */
1277
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1278
                                gfp_t gfp_flags, struct mempolicy **mpol)
1279
{
1280
        struct mempolicy *pol = get_vma_policy(current, vma, addr);
1281
        struct zonelist *zl;
1282
 
1283
        *mpol = NULL;           /* probably no unref needed */
1284
        if (pol->policy == MPOL_INTERLEAVE) {
1285
                unsigned nid;
1286
 
1287
                nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1288
                __mpol_free(pol);               /* finished with pol */
1289
                return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
1290
        }
1291
 
1292
        zl = zonelist_policy(GFP_HIGHUSER, pol);
1293
        if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1294
                if (pol->policy != MPOL_BIND)
1295
                        __mpol_free(pol);       /* finished with pol */
1296
                else
1297
                        *mpol = pol;    /* unref needed after allocation */
1298
        }
1299
        return zl;
1300
}
1301
#endif
1302
 
1303
/* Allocate a page in interleaved policy.
1304
   Own path because it needs to do special accounting. */
1305
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1306
                                        unsigned nid)
1307
{
1308
        struct zonelist *zl;
1309
        struct page *page;
1310
 
1311
        zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1312
        page = __alloc_pages(gfp, order, zl);
1313
        if (page && page_zone(page) == zl->zones[0])
1314
                inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1315
        return page;
1316
}
1317
 
1318
/**
1319
 *      alloc_page_vma  - Allocate a page for a VMA.
1320
 *
1321
 *      @gfp:
1322
 *      %GFP_USER    user allocation.
1323
 *      %GFP_KERNEL  kernel allocations,
1324
 *      %GFP_HIGHMEM highmem/user allocations,
1325
 *      %GFP_FS      allocation should not call back into a file system.
1326
 *      %GFP_ATOMIC  don't sleep.
1327
 *
1328
 *      @vma:  Pointer to VMA or NULL if not available.
1329
 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1330
 *
1331
 *      This function allocates a page from the kernel page pool and applies
1332
 *      a NUMA policy associated with the VMA or the current process.
1333
 *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1334
 *      mm_struct of the VMA to prevent it from going away. Should be used for
1335
 *      all allocations for pages that will be mapped into
1336
 *      user space. Returns NULL when no page can be allocated.
1337
 *
1338
 *      Should be called with the mmap_sem of the vma held.
1339
 */
1340
struct page *
1341
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1342
{
1343
        struct mempolicy *pol = get_vma_policy(current, vma, addr);
1344
        struct zonelist *zl;
1345
 
1346
        cpuset_update_task_memory_state();
1347
 
1348
        if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1349
                unsigned nid;
1350
 
1351
                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1352
                return alloc_page_interleave(gfp, 0, nid);
1353
        }
1354
        zl = zonelist_policy(gfp, pol);
1355
        if (pol != &default_policy && pol != current->mempolicy) {
1356
                /*
1357
                 * slow path: ref counted policy -- shared or vma
1358
                 */
1359
                struct page *page =  __alloc_pages(gfp, 0, zl);
1360
                __mpol_free(pol);
1361
                return page;
1362
        }
1363
        /*
1364
         * fast path:  default or task policy
1365
         */
1366
        return __alloc_pages(gfp, 0, zl);
1367
}
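
/*
 * Illustrative sketch (not part of this file): the typical calling pattern
 * for alloc_page_vma() from a fault-style path, where the vma's mmap_sem is
 * already held for read.  The function name is hypothetical.
 */
#if 0
static struct page *example_anon_fault_alloc(struct vm_area_struct *vma,
                                             unsigned long addr)
{
        /* Policy-aware allocation for the page about to be mapped at addr. */
        struct page *page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);

        if (page)
                clear_user_highpage(page, addr);
        return page;    /* NULL tells the caller to report VM_FAULT_OOM */
}
#endif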
1368
 
1369
/**
1370
 *      alloc_pages_current - Allocate pages.
1371
 *
1372
 *      @gfp:
1373
 *              %GFP_USER   user allocation,
1374
 *              %GFP_KERNEL kernel allocation,
1375
 *              %GFP_HIGHMEM highmem allocation,
1376
 *              %GFP_FS     don't call back into a file system.
1377
 *              %GFP_ATOMIC don't sleep.
1378
 *      @order: Power of two of allocation size in pages. 0 is a single page.
1379
 *
1380
 *      Allocate a page from the kernel page pool.  When not in
1381
 *      interrupt context, apply the current process' NUMA policy.
1382
 *      Returns NULL when no page can be allocated.
1383
 *
1384
 *      Don't call cpuset_update_task_memory_state() unless
1385
 *      1) it's ok to take cpuset_sem (can WAIT), and
1386
 *      2) allocating for current task (not interrupt).
1387
 */
1388
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1389
{
1390
        struct mempolicy *pol = current->mempolicy;
1391
 
1392
        if ((gfp & __GFP_WAIT) && !in_interrupt())
1393
                cpuset_update_task_memory_state();
1394
        if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1395
                pol = &default_policy;
1396
        if (pol->policy == MPOL_INTERLEAVE)
1397
                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1398
        return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1399
}
1400
EXPORT_SYMBOL(alloc_pages_current);
1401
 
1402
/*
1403
 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1404
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1405
 * with the mems_allowed returned by cpuset_mems_allowed().  This
1406
 * keeps mempolicies cpuset relative after its cpuset moves.  See
1407
 * further kernel/cpuset.c update_nodemask().
1408
 */
1409
 
1410
/* Slow path of a mempolicy copy */
1411
struct mempolicy *__mpol_copy(struct mempolicy *old)
1412
{
1413
        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1414
 
1415
        if (!new)
1416
                return ERR_PTR(-ENOMEM);
1417
        if (current_cpuset_is_being_rebound()) {
1418
                nodemask_t mems = cpuset_mems_allowed(current);
1419
                mpol_rebind_policy(old, &mems);
1420
        }
1421
        *new = *old;
1422
        atomic_set(&new->refcnt, 1);
1423
        if (new->policy == MPOL_BIND) {
1424
                int sz = ksize(old->v.zonelist);
1425
                new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1426
                if (!new->v.zonelist) {
1427
                        kmem_cache_free(policy_cache, new);
1428
                        return ERR_PTR(-ENOMEM);
1429
                }
1430
        }
1431
        return new;
1432
}
1433
 
1434
/* Slow path of a mempolicy comparison */
1435
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1436
{
1437
        if (!a || !b)
1438
                return 0;
1439
        if (a->policy != b->policy)
1440
                return 0;
1441
        switch (a->policy) {
1442
        case MPOL_DEFAULT:
1443
                return 1;
1444
        case MPOL_INTERLEAVE:
1445
                return nodes_equal(a->v.nodes, b->v.nodes);
1446
        case MPOL_PREFERRED:
1447
                return a->v.preferred_node == b->v.preferred_node;
1448
        case MPOL_BIND: {
1449
                int i;
1450
                for (i = 0; a->v.zonelist->zones[i]; i++)
1451
                        if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1452
                                return 0;
1453
                return b->v.zonelist->zones[i] == NULL;
1454
        }
1455
        default:
1456
                BUG();
1457
                return 0;
1458
        }
1459
}
1460
 
1461
/* Slow path of a mpol destructor. */
1462
void __mpol_free(struct mempolicy *p)
1463
{
1464
        if (!atomic_dec_and_test(&p->refcnt))
1465
                return;
1466
        if (p->policy == MPOL_BIND)
1467
                kfree(p->v.zonelist);
1468
        p->policy = MPOL_DEFAULT;
1469
        kmem_cache_free(policy_cache, p);
1470
}
1471
 
1472
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

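/*
 * Illustrative user-space sketch (not part of this file): an mbind(2) call
 * on part of a shared mapping is what gets recorded, per page-offset range,
 * in the shared policy tree below.  Assumes libnuma's <numaif.h> and that
 * node 0 is online; hypothetical code, shown only as a sketch.
 */
#if 0
#include <numaif.h>             /* mbind(), MPOL_BIND */
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
        size_t len = 8UL << 20;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        unsigned long nodemask = 1UL << 0;      /* node 0 only */

        if (p == MAP_FAILED)
                return 1;
        /* Bind the first half of the mapping; the rest keeps the default. */
        if (mbind(p, len / 2, MPOL_BIND, &nodemask, sizeof(nodemask) * 8, 0))
                perror("mbind");
        return 0;
}
#endif
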
/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
        struct rb_node *n = sp->root.rb_node;

        while (n) {
                struct sp_node *p = rb_entry(n, struct sp_node, nd);

                if (start >= p->end)
                        n = n->rb_right;
                else if (end <= p->start)
                        n = n->rb_left;
                else
                        break;
        }
        if (!n)
                return NULL;
        for (;;) {
                struct sp_node *w = NULL;
                struct rb_node *prev = rb_prev(n);
                if (!prev)
                        break;
                w = rb_entry(prev, struct sp_node, nd);
                if (w->end <= start)
                        break;
                n = prev;
        }
        return rb_entry(n, struct sp_node, nd);
}

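/*
 * Note on the backward walk above: the intersecting node found by the binary
 * search is not necessarily the leftmost one, so sp_lookup() steps through
 * rb_prev() until the previous range ends at or before 'start', returning the
 * lowest-offset node that still intersects [start, end).
 */
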
/* Insert a new shared policy into the tree. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
        struct rb_node **p = &sp->root.rb_node;
        struct rb_node *parent = NULL;
        struct sp_node *nd;

        while (*p) {
                parent = *p;
                nd = rb_entry(parent, struct sp_node, nd);
                if (new->start < nd->start)
                        p = &(*p)->rb_left;
                else if (new->end > nd->end)
                        p = &(*p)->rb_right;
                else
                        BUG();
        }
        rb_link_node(&new->nd, parent, p);
        rb_insert_color(&new->nd, &sp->root);
        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
                 new->policy ? new->policy->policy : 0);
}

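/*
 * sp_insert() relies on shared_policy_replace() having already removed or
 * trimmed any overlapping ranges; reaching the BUG() above would mean the
 * new range nests inside an existing node.
 */
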
/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
        struct mempolicy *pol = NULL;
        struct sp_node *sn;

        if (!sp->root.rb_node)
                return NULL;
        spin_lock(&sp->lock);
        sn = sp_lookup(sp, idx, idx+1);
        if (sn) {
                mpol_get(sn->policy);
                pol = sn->policy;
        }
        spin_unlock(&sp->lock);
        return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
        pr_debug("deleting %lx-%lx\n", n->start, n->end);
        rb_erase(&n->nd, &sp->root);
        mpol_free(n->policy);
        kmem_cache_free(sn_cache, n);
}

static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
                                struct mempolicy *pol)
{
        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

        if (!n)
                return NULL;
        n->start = start;
        n->end = end;
        mpol_get(pol);
        n->policy = pol;
        return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n, *new2 = NULL;

restart:
        spin_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                if (!new2) {
                                        spin_unlock(&sp->lock);
                                        new2 = sp_alloc(end, n->end, n->policy);
                                        if (!new2)
                                                return -ENOMEM;
                                        goto restart;
                                }
                                n->end = start;
                                sp_insert(sp, new2);
                                new2 = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        spin_unlock(&sp->lock);
        if (new2) {
                mpol_free(new2->policy);
                kmem_cache_free(sn_cache, new2);
        }
        return 0;
}

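/*
 * Illustrative example of the splitting above: with one node covering
 * offsets [0, 100) and a replacement of [40, 60), the old node is trimmed
 * to [0, 40), new2 is inserted for [60, 100) carrying the old policy, and
 * the caller's 'new' node ends up covering [40, 60).
 */
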
void mpol_shared_policy_init(struct shared_policy *info, int policy,
                                nodemask_t *policy_nodes)
{
        info->root = RB_ROOT;
        spin_lock_init(&info->lock);

        if (policy != MPOL_DEFAULT) {
                struct mempolicy *newpol;

                /* Falls back to MPOL_DEFAULT on any error */
                newpol = mpol_new(policy, policy_nodes);
                if (!IS_ERR(newpol)) {
                        /* Create pseudo-vma that contains just the policy */
                        struct vm_area_struct pvma;

                        memset(&pvma, 0, sizeof(struct vm_area_struct));
                        /* Policy covers entire file */
                        pvma.vm_end = TASK_SIZE;
                        mpol_set_shared_policy(info, &pvma, newpol);
                        mpol_free(newpol);
                }
        }
}

int mpol_set_shared_policy(struct shared_policy *info,
                        struct vm_area_struct *vma, struct mempolicy *npol)
{
        int err;
        struct sp_node *new = NULL;
        unsigned long sz = vma_pages(vma);

        pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
                 vma->vm_pgoff,
                 sz, npol? npol->policy : -1,
                 npol ? nodes_addr(npol->v.nodes)[0] : -1);

        if (npol) {
                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
                if (!new)
                        return -ENOMEM;
        }
        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
        if (err && new)
                kmem_cache_free(sn_cache, new);
        return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
        struct sp_node *n;
        struct rb_node *next;

        if (!p->root.rb_node)
                return;
        spin_lock(&p->lock);
        next = rb_first(&p->root);
        while (next) {
                n = rb_entry(next, struct sp_node, nd);
                next = rb_next(&n->nd);
                rb_erase(&n->nd, &p->root);
                mpol_free(n->policy);
                kmem_cache_free(sn_cache, n);
        }
        spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
        nodemask_t interleave_nodes;
        unsigned long largest = 0;
        int nid, prefer = 0;

        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL);

        /*
         * Set interleaving policy for system init. Interleaving is only
         * enabled across suitably sized nodes (at least 16MB of present
         * memory); if they are all smaller, fall back to the largest node.
         */
        nodes_clear(interleave_nodes);
        for_each_node_state(nid, N_HIGH_MEMORY) {
                unsigned long total_pages = node_present_pages(nid);

                /* Preserve the largest node */
                if (largest < total_pages) {
                        largest = total_pages;
                        prefer = nid;
                }

                /* Interleave this node? */
                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
                        node_set(nid, interleave_nodes);
        }

        /* All too small, use the largest */
        if (unlikely(nodes_empty(interleave_nodes)))
                node_set(prefer, interleave_nodes);

        if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
                printk("numa_policy_init: interleaving failed\n");
}

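/*
 * For reference: with 4KB pages, the 16MB threshold above corresponds to
 * (16 << 20) >> PAGE_SHIFT = 4096 present pages per node.
 */
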
/* Reset policy of current process to default */
void numa_default_policy(void)
{
        do_set_mempolicy(MPOL_DEFAULT, NULL);
}

/* Migrate a policy to a different set of nodes */
static void mpol_rebind_policy(struct mempolicy *pol,
                               const nodemask_t *newmask)
{
        nodemask_t *mpolmask;
        nodemask_t tmp;

        if (!pol)
                return;
        mpolmask = &pol->cpuset_mems_allowed;
        if (nodes_equal(*mpolmask, *newmask))
                return;

        switch (pol->policy) {
        case MPOL_DEFAULT:
                break;
        case MPOL_INTERLEAVE:
                nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
                pol->v.nodes = tmp;
                *mpolmask = *newmask;
                current->il_next = node_remap(current->il_next,
                                                *mpolmask, *newmask);
                break;
        case MPOL_PREFERRED:
                pol->v.preferred_node = node_remap(pol->v.preferred_node,
                                                *mpolmask, *newmask);
                *mpolmask = *newmask;
                break;
        case MPOL_BIND: {
                nodemask_t nodes;
                struct zone **z;
                struct zonelist *zonelist;

                nodes_clear(nodes);
                for (z = pol->v.zonelist->zones; *z; z++)
                        node_set(zone_to_nid(*z), nodes);
                nodes_remap(tmp, nodes, *mpolmask, *newmask);
                nodes = tmp;

                zonelist = bind_zonelist(&nodes);

                /* If bind_zonelist() failed (e.g. no memory in the remapped
                 * node set), keep the old zonelist.  If that old zonelist
                 * has no remaining mems_allowed nodes, then zonelist_policy()
                 * will "FALL THROUGH" to MPOL_DEFAULT.
                 */

                if (!IS_ERR(zonelist)) {
                        /* Good - got mem - substitute new zonelist */
                        kfree(pol->v.zonelist);
                        pol->v.zonelist = zonelist;
                }
                *mpolmask = *newmask;
                break;
        }
        default:
                BUG();
                break;
        }
}

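/*
 * Example of the remap performed above (illustrative): a task whose cpuset
 * moves from nodes 1-2 to nodes 3-4 and whose interleave mask was {1} ends
 * up interleaving over {3}, since nodes_remap() maps each node to the node
 * at the same position in the new mask.
 */
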
/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
        mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
        struct vm_area_struct *vma;

        down_write(&mm->mmap_sem);
        for (vma = mm->mmap; vma; vma = vma->vm_next)
                mpol_rebind_policy(vma->vm_policy, new);
        up_write(&mm->mmap_sem);
}

/*
 * Display pages allocated per node and memory policy via /proc.
 */

static const char * const policy_types[] =
        { "default", "prefer", "bind", "interleave" };

/*
 * Convert a mempolicy into a string.
 * Returns the number of characters in buffer (if positive)
 * or an error (negative)
 */
static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
        char *p = buffer;
        int l;
        nodemask_t nodes;
        int mode = pol ? pol->policy : MPOL_DEFAULT;

        switch (mode) {
        case MPOL_DEFAULT:
                nodes_clear(nodes);
                break;

        case MPOL_PREFERRED:
                nodes_clear(nodes);
                node_set(pol->v.preferred_node, nodes);
                break;

        case MPOL_BIND:
                get_zonemask(pol, &nodes);
                break;

        case MPOL_INTERLEAVE:
                nodes = pol->v.nodes;
                break;

        default:
                BUG();
                return -EFAULT;
        }

        l = strlen(policy_types[mode]);
        if (buffer + maxlen < p + l + 1)
                return -ENOSPC;

        strcpy(p, policy_types[mode]);
        p += l;

        if (!nodes_empty(nodes)) {
                if (buffer + maxlen < p + 2)
                        return -ENOSPC;
                *p++ = '=';
                p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
        }
        return p - buffer;
}

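/*
 * Examples of the strings produced above (illustrative): "default",
 * "prefer=1", "bind=0-1", "interleave=0-3".  The part after '=' is
 * whatever nodelist_scnprintf() emits for the computed nodemask.
 */
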
struct numa_maps {
        unsigned long pages;
        unsigned long anon;
        unsigned long active;
        unsigned long writeback;
        unsigned long mapcount_max;
        unsigned long dirty;
        unsigned long swapcache;
        unsigned long node[MAX_NUMNODES];
};

static void gather_stats(struct page *page, void *private, int pte_dirty)
{
        struct numa_maps *md = private;
        int count = page_mapcount(page);

        md->pages++;
        if (pte_dirty || PageDirty(page))
                md->dirty++;

        if (PageSwapCache(page))
                md->swapcache++;

        if (PageActive(page))
                md->active++;

        if (PageWriteback(page))
                md->writeback++;

        if (PageAnon(page))
                md->anon++;

        if (count > md->mapcount_max)
                md->mapcount_max = count;

        md->node[page_to_nid(page)]++;
}

#ifdef CONFIG_HUGETLB_PAGE
static void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
        unsigned long addr;
        struct page *page;

        for (addr = start; addr < end; addr += HPAGE_SIZE) {
                pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
                pte_t pte;

                if (!ptep)
                        continue;

                pte = *ptep;
                if (pte_none(pte))
                        continue;

                page = pte_page(pte);
                if (!page)
                        continue;

                gather_stats(page, md, pte_dirty(*ptep));
        }
}
#else
static inline void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
}
#endif

int show_numa_map(struct seq_file *m, void *v)
{
        struct proc_maps_private *priv = m->private;
        struct vm_area_struct *vma = v;
        struct numa_maps *md;
        struct file *file = vma->vm_file;
        struct mm_struct *mm = vma->vm_mm;
        struct mempolicy *pol;
        int n;
        char buffer[50];

        if (!mm)
                return 0;

        md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
        if (!md)
                return 0;

        pol = get_vma_policy(priv->task, vma, vma->vm_start);
        mpol_to_str(buffer, sizeof(buffer), pol);
        /*
         * unref shared or other task's mempolicy
         */
        if (pol != &default_policy && pol != current->mempolicy)
                __mpol_free(pol);

        seq_printf(m, "%08lx %s", vma->vm_start, buffer);

        if (file) {
                seq_printf(m, " file=");
                seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
        } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
                seq_printf(m, " heap");
        } else if (vma->vm_start <= mm->start_stack &&
                        vma->vm_end >= mm->start_stack) {
                seq_printf(m, " stack");
        }

        if (is_vm_hugetlb_page(vma)) {
                check_huge_range(vma, vma->vm_start, vma->vm_end, md);
                seq_printf(m, " huge");
        } else {
                check_pgd_range(vma, vma->vm_start, vma->vm_end,
                        &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
        }

        if (!md->pages)
                goto out;

        if (md->anon)
                seq_printf(m, " anon=%lu", md->anon);

        if (md->dirty)
                seq_printf(m, " dirty=%lu", md->dirty);

        if (md->pages != md->anon && md->pages != md->dirty)
                seq_printf(m, " mapped=%lu", md->pages);

        if (md->mapcount_max > 1)
                seq_printf(m, " mapmax=%lu", md->mapcount_max);

        if (md->swapcache)
                seq_printf(m, " swapcache=%lu", md->swapcache);

        if (md->active < md->pages && !is_vm_hugetlb_page(vma))
                seq_printf(m, " active=%lu", md->active);

        if (md->writeback)
                seq_printf(m, " writeback=%lu", md->writeback);

        for_each_node_state(n, N_HIGH_MEMORY)
                if (md->node[n])
                        seq_printf(m, " N%d=%lu", n, md->node[n]);
out:
        seq_putc(m, '\n');
        kfree(md);

        if (m->count < m->size)
                m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
        return 0;
}
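
/*
 * Illustrative /proc/<pid>/numa_maps line produced by show_numa_map() above
 * (the values and path are made up):
 *
 *   2aaaaac00000 interleave=0-3 file=/lib/libfoo.so mapped=312 mapmax=16 N0=78 N1=78 N2=78 N3=78
 */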
