2 * Simple NUMA memory policy for the Linux kernel.
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * Subject to the GNU Public License, version 2.
7 * NUMA policy allows the user to give hints in which node(s) memory should
10 * Support four policies per VMA and per process:
12 * The VMA policy has priority over the process policy for a page fault.
14 * interleave Allocate memory interleaved over a set of nodes,
15 * with normal fallback if it fails.
16 * For VMA based allocations this interleaves based on the
17 * offset into the backing object or offset into the mapping
18 * for anonymous memory. For process policy a per-process counter
20 * bind Only allocate memory on a specific set of nodes,
22 * preferred Try a specific node first before normal fallback.
23 * As a special case node -1 here means do the allocation
24 * on the local CPU. This is normally identical to default,
25 * but useful to set in a VMA when you have a non-default
27 * default Allocate on the local node first, or when on a VMA
28 * use the process policy. This is what Linux always did
29 * in a NUMA aware kernel and still does by, ahem, default.
31 * The process policy is applied for most non-interrupt memory allocations
32 * in that process' context. Interrupts ignore the policies and always
33 * try to allocate on the local CPU. The VMA policy is only applied for memory
34 * allocations for a VMA in the VM.
36 * Currently there are a few corner cases in swapping where the policy
37 * is not applied, but the majority should be handled. When process policy
38 * is used it is not remembered over swap outs/swap ins.
40 * Only the highest zone in the zone hierarchy gets policied. Allocations
41 * requesting a lower zone just use default policy. This implies that
42 * on systems with highmem kernel lowmem allocations don't get policied.
43 * Same with GFP_DMA allocations.
45 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
46 * all users and remembered even when nobody has memory mapped.
50 fix mmap readahead to honour policy and enable policy for any page cache
52 statistics for bigpages
53 global policy for page cache? currently it uses process policy. Requires
55 handle mremap for shared memory (currently ignored for the policy)
57 make bind policy root only? It can trigger oom much faster and the
58 kernel is not always graceful with that.
59 could replace all the switch()es with a mempolicy_ops structure.
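/*
 * Illustrative userspace sketch (not part of this file): one way to exercise
 * the policies described above, assuming the numaif.h wrappers shipped with
 * libnuma. The raw syscalls take a node bitmask plus the number of bits in
 * that mask.
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1, assumed online
 *
 *	// Interleave all further allocations of this process over nodes 0,1.
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	// Bind an existing mapping to node 0 only; MPOL_MF_STRICT makes the
 *	// call report an error if pages already live on other nodes.
 *	unsigned long node0 = 1UL << 0;
 *	mbind(addr, len, MPOL_BIND, &node0, sizeof(node0) * 8, MPOL_MF_STRICT);
 *
 * Error handling is omitted; see sys_set_mempolicy() and sys_mbind() below
 * for the exact argument checking.
 */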
62 #include <linux/mempolicy.h>
64 #include <linux/highmem.h>
65 #include <linux/hugetlb.h>
66 #include <linux/kernel.h>
67 #include <linux/sched.h>
69 #include <linux/nodemask.h>
70 #include <linux/cpuset.h>
71 #include <linux/gfp.h>
72 #include <linux/slab.h>
73 #include <linux/string.h>
74 #include <linux/module.h>
75 #include <linux/interrupt.h>
76 #include <linux/init.h>
77 #include <linux/compat.h>
79 #include <asm/tlbflush.h>
80 #include <asm/uaccess.h>
82 static kmem_cache_t *policy_cache;
83 static kmem_cache_t *sn_cache;
85 #define PDprintk(fmt...)
87 /* Highest zone. A specific allocation for a zone below that is not
89 static int policy_zone;
91 static struct mempolicy default_policy = {
92 .refcnt = ATOMIC_INIT(1), /* never free it */
93 .policy = MPOL_DEFAULT,
96 /* Check if all specified nodes are online */
97 static int nodes_online(unsigned long *nodes)
99 DECLARE_BITMAP(online2, MAX_NUMNODES);
101 bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
102 if (bitmap_empty(online2, MAX_NUMNODES))
104 if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
109 /* Do sanity checking on a policy */
110 static int mpol_check_policy(int mode, unsigned long *nodes)
112 int empty = bitmap_empty(nodes, MAX_NUMNODES);
120 case MPOL_INTERLEAVE:
121 /* Preferred will only use the first bit, but allow
127 return nodes_online(nodes);
130 /* Copy a node mask from user space. */
131 static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
132 unsigned long maxnode, int mode)
135 unsigned long nlongs;
136 unsigned long endmask;
139 bitmap_zero(nodes, MAX_NUMNODES);
140 if (maxnode == 0 || !nmask)
143 nlongs = BITS_TO_LONGS(maxnode);
144 if ((maxnode % BITS_PER_LONG) == 0)
147 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
149 /* When the user specifies more nodes than supported just check
150 that the unsupported part is all zero. */
151 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
152 if (nlongs > PAGE_SIZE/sizeof(long))
154 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
156 if (get_user(t, nmask + k))
158 if (k == nlongs - 1) {
164 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
168 if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
170 nodes[nlongs-1] &= endmask;
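/*
 * Worked example (64-bit longs, illustrative): maxnode = 5 gives nlongs = 1
 * and endmask = 0x1f, so only bits 0-4 of the word just copied survive the
 * masking above; stray bits above maxnode are silently discarded rather
 * than rejected.
 */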
171 /* Update current mems_allowed */
172 cpuset_update_current_mems_allowed();
173 /* Ignore nodes not set in current->mems_allowed */
174 cpuset_restrict_to_mems_allowed(nodes);
175 return mpol_check_policy(mode, nodes);
178 /* Generate a custom zonelist for the BIND policy. */
179 static struct zonelist *bind_zonelist(unsigned long *nodes)
184 max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
185 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
189 for (nd = find_first_bit(nodes, MAX_NUMNODES);
191 nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
193 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
194 struct zone *z = &NODE_DATA(nd)->node_zones[k];
195 if (!z->present_pages)
197 zl->zones[num++] = z;
203 zl->zones[num] = NULL;
207 /* Create a new policy */
208 static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
210 struct mempolicy *policy;
212 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
213 if (mode == MPOL_DEFAULT)
215 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
217 return ERR_PTR(-ENOMEM);
218 atomic_set(&policy->refcnt, 1);
220 case MPOL_INTERLEAVE:
221 bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
224 policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
225 if (policy->v.preferred_node >= MAX_NUMNODES)
226 policy->v.preferred_node = -1;
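/* An empty nodemask leaves find_first_bit() at MAX_NUMNODES, so -1 here
   means "allocate on the local node", matching the preferred-policy
   description in the header comment. */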
229 policy->v.zonelist = bind_zonelist(nodes);
230 if (policy->v.zonelist == NULL) {
231 kmem_cache_free(policy_cache, policy);
232 return ERR_PTR(-ENOMEM);
236 policy->policy = mode;
240 /* Ensure all existing pages follow the policy. */
242 verify_pages(struct mm_struct *mm,
243 unsigned long addr, unsigned long end, unsigned long *nodes)
247 spin_lock(&mm->page_table_lock);
254 pgd = pgd_offset(mm, addr);
255 if (pgd_none(*pgd)) {
256 unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK;
262 pud = pud_offset(pgd, addr);
263 if (pud_none(*pud)) {
264 addr = (addr + PUD_SIZE) & PUD_MASK;
267 pmd = pmd_offset(pud, addr);
268 if (pmd_none(*pmd)) {
269 addr = (addr + PMD_SIZE) & PMD_MASK;
273 pte = pte_offset_map(pmd, addr);
274 if (pte_present(*pte)) {
275 unsigned long pfn = pte_pfn(*pte);
277 p = pfn_to_page(pfn);
281 unsigned nid = page_to_nid(p);
282 if (!test_bit(nid, nodes)) {
289 spin_unlock(&mm->page_table_lock);
293 /* Step 1: check the range */
294 static struct vm_area_struct *
295 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
296 unsigned long *nodes, unsigned long flags)
299 struct vm_area_struct *first, *vma, *prev;
301 first = find_vma(mm, start);
303 return ERR_PTR(-EFAULT);
305 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
306 if (!vma->vm_next && vma->vm_end < end)
307 return ERR_PTR(-EFAULT);
308 if (prev && prev->vm_end < vma->vm_start)
309 return ERR_PTR(-EFAULT);
310 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
311 err = verify_pages(vma->vm_mm,
312 vma->vm_start, vma->vm_end, nodes);
314 first = ERR_PTR(err);
323 /* Apply policy to a single VMA */
324 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
327 struct mempolicy *old = vma->vm_policy;
329 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
330 vma->vm_start, vma->vm_end, vma->vm_pgoff,
331 vma->vm_ops, vma->vm_file,
332 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
334 if (vma->vm_ops && vma->vm_ops->set_policy)
335 err = vma->vm_ops->set_policy(vma, new);
338 vma->vm_policy = new;
344 /* Step 2: apply policy to a range and do splits. */
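/*
 * Illustrative example: applying a policy to the middle of a single VMA
 * [A,D) with start = B and end = C first splits off [A,B), then [C,D),
 * and finally applies the new policy to the remaining [B,C) piece.
 */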
345 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
346 unsigned long end, struct mempolicy *new)
348 struct vm_area_struct *next;
352 for (; vma && vma->vm_start < end; vma = next) {
354 if (vma->vm_start < start)
355 err = split_vma(vma->vm_mm, vma, start, 1);
356 if (!err && vma->vm_end > end)
357 err = split_vma(vma->vm_mm, vma, end, 0);
359 err = policy_vma(vma, new);
366 /* Change policy for a memory range */
367 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
369 unsigned long __user *nmask, unsigned long maxnode,
372 struct vm_area_struct *vma;
373 struct mm_struct *mm = current->mm;
374 struct mempolicy *new;
376 DECLARE_BITMAP(nodes, MAX_NUMNODES);
379 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
381 if (start & ~PAGE_MASK)
383 if (mode == MPOL_DEFAULT)
384 flags &= ~MPOL_MF_STRICT;
385 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
392 err = get_nodes(nodes, nmask, maxnode, mode);
396 new = mpol_new(mode, nodes);
400 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
403 down_write(&mm->mmap_sem);
404 vma = check_range(mm, start, end, nodes, flags);
407 err = mbind_range(vma, start, end, new);
408 up_write(&mm->mmap_sem);
413 /* Set the process memory policy */
414 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
415 unsigned long maxnode)
418 struct mempolicy *new;
419 DECLARE_BITMAP(nodes, MAX_NUMNODES);
423 err = get_nodes(nodes, nmask, maxnode, mode);
426 new = mpol_new(mode, nodes);
429 mpol_free(current->mempolicy);
430 current->mempolicy = new;
431 if (new && new->policy == MPOL_INTERLEAVE)
432 current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
436 /* Fill a zone bitmap for a policy */
437 static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
441 bitmap_zero(nodes, MAX_NUMNODES);
444 for (i = 0; p->v.zonelist->zones[i]; i++)
445 __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
449 case MPOL_INTERLEAVE:
450 bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
453 /* or use current node instead of online map? */
454 if (p->v.preferred_node < 0)
455 bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
457 __set_bit(p->v.preferred_node, nodes);
464 static int lookup_node(struct mm_struct *mm, unsigned long addr)
469 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
471 err = page_to_nid(p);
477 /* Copy a kernel node mask to user space */
478 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
479 void *nodes, unsigned nbytes)
481 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
484 if (copy > PAGE_SIZE)
486 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
490 return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
493 /* Retrieve NUMA policy */
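/*
 * Illustrative userspace call (assuming the numaif.h wrapper, not part of
 * this file): to ask which node currently backs an address,
 *
 *	int node;
 *	get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 *
 * returns a node id in 'node' instead of a policy mode.
 */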
494 asmlinkage long sys_get_mempolicy(int __user *policy,
495 unsigned long __user *nmask,
496 unsigned long maxnode,
497 unsigned long addr, unsigned long flags)
500 struct mm_struct *mm = current->mm;
501 struct vm_area_struct *vma = NULL;
502 struct mempolicy *pol = current->mempolicy;
504 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
506 if (nmask != NULL && maxnode < MAX_NUMNODES)
508 if (flags & MPOL_F_ADDR) {
509 down_read(&mm->mmap_sem);
510 vma = find_vma_intersection(mm, addr, addr+1);
512 up_read(&mm->mmap_sem);
515 if (vma->vm_ops && vma->vm_ops->get_policy)
516 pol = vma->vm_ops->get_policy(vma, addr);
518 pol = vma->vm_policy;
523 pol = &default_policy;
525 if (flags & MPOL_F_NODE) {
526 if (flags & MPOL_F_ADDR) {
527 err = lookup_node(mm, addr);
531 } else if (pol == current->mempolicy &&
532 pol->policy == MPOL_INTERLEAVE) {
533 pval = current->il_next;
542 up_read(&current->mm->mmap_sem);
546 if (policy && put_user(pval, policy))
551 DECLARE_BITMAP(nodes, MAX_NUMNODES);
552 get_zonemask(pol, nodes);
553 err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
558 up_read(&current->mm->mmap_sem);
564 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
565 compat_ulong_t __user *nmask,
566 compat_ulong_t maxnode,
567 compat_ulong_t addr, compat_ulong_t flags)
570 unsigned long __user *nm = NULL;
571 unsigned long nr_bits, alloc_size;
572 DECLARE_BITMAP(bm, MAX_NUMNODES);
574 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
575 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
578 nm = compat_alloc_user_space(alloc_size);
580 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
583 err = copy_from_user(bm, nm, alloc_size);
584 /* ensure entire bitmap is zeroed */
585 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
586 err |= compat_put_bitmap(nmask, bm, nr_bits);
592 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
593 compat_ulong_t maxnode)
596 unsigned long __user *nm = NULL;
597 unsigned long nr_bits, alloc_size;
598 DECLARE_BITMAP(bm, MAX_NUMNODES);
600 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
601 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
604 err = compat_get_bitmap(bm, nmask, nr_bits);
605 nm = compat_alloc_user_space(alloc_size);
606 err |= copy_to_user(nm, bm, alloc_size);
612 return sys_set_mempolicy(mode, nm, nr_bits+1);
615 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
616 compat_ulong_t mode, compat_ulong_t __user *nmask,
617 compat_ulong_t maxnode, compat_ulong_t flags)
620 unsigned long __user *nm = NULL;
621 unsigned long nr_bits, alloc_size;
622 DECLARE_BITMAP(bm, MAX_NUMNODES);
624 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
625 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
628 err = compat_get_bitmap(bm, nmask, nr_bits);
629 nm = compat_alloc_user_space(alloc_size);
630 err |= copy_to_user(nm, bm, alloc_size);
636 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
641 /* Return effective policy for a VMA */
642 static struct mempolicy *
643 get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
645 struct mempolicy *pol = current->mempolicy;
648 if (vma->vm_ops && vma->vm_ops->get_policy)
649 pol = vma->vm_ops->get_policy(vma, addr);
650 else if (vma->vm_policy &&
651 vma->vm_policy->policy != MPOL_DEFAULT)
652 pol = vma->vm_policy;
655 pol = &default_policy;
659 /* Return a zonelist representing a mempolicy */
660 static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy)
664 switch (policy->policy) {
666 nd = policy->v.preferred_node;
671 /* Lower zones don't get a policy applied */
672 /* Careful: current->mems_allowed might have moved */
673 if ((gfp & GFP_ZONEMASK) >= policy_zone)
674 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
675 return policy->v.zonelist;
677 case MPOL_INTERLEAVE: /* should not happen */
685 return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
688 /* Do dynamic interleaving for a process */
689 static unsigned interleave_nodes(struct mempolicy *policy)
692 struct task_struct *me = current;
695 BUG_ON(nid >= MAX_NUMNODES);
696 next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
697 if (next >= MAX_NUMNODES)
698 next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
703 /* Do static interleaving for a VMA with known offset. */
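/*
 * Worked example (illustrative): with pol->v.nodes = {0,2,3} and off = 7,
 * nnodes = 3 and target = 7 % 3 = 1, so the loop below stops on the second
 * set bit and returns node 2. The same offset therefore always maps to the
 * same node, keeping a given page of the backing object stable across faults.
 */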
704 static unsigned offset_il_node(struct mempolicy *pol,
705 struct vm_area_struct *vma, unsigned long off)
707 unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
708 unsigned target = (unsigned)off % nnodes;
714 nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
716 } while (c <= target);
717 BUG_ON(nid >= MAX_NUMNODES);
718 BUG_ON(!test_bit(nid, pol->v.nodes));
722 /* Allocate a page in interleaved policy.
723 Own path because it needs to do special accounting. */
724 static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid)
729 BUG_ON(!node_online(nid));
730 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
731 page = __alloc_pages(gfp, order, zl);
732 if (page && page_zone(page) == zl->zones[0]) {
733 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
740 * alloc_page_vma - Allocate a page for a VMA.
743 * %GFP_USER user allocation.
744 * %GFP_KERNEL kernel allocations,
745 * %GFP_HIGHMEM highmem/user allocations,
746 * %GFP_FS allocation should not call back into a file system.
747 * %GFP_ATOMIC don't sleep.
749 * @vma: Pointer to VMA or NULL if not available.
750 * @addr: Virtual Address of the allocation. Must be inside the VMA.
752 * This function allocates a page from the kernel page pool and applies
753 * a NUMA policy associated with the VMA or the current process.
754 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
755 * mm_struct of the VMA to prevent it from going away. Should be used for
756 * all allocations for pages that will be mapped into
757 * user space. Returns NULL when no page can be allocated.
759 * Should be called with the mmap_sem of the vma held.
762 alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr)
764 struct mempolicy *pol = get_vma_policy(vma, addr);
766 cpuset_update_current_mems_allowed();
768 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
772 BUG_ON(addr >= vma->vm_end);
773 BUG_ON(addr < vma->vm_start);
775 off += (addr - vma->vm_start) >> PAGE_SHIFT;
776 nid = offset_il_node(pol, vma, off);
778 /* fall back to process interleaving */
779 nid = interleave_nodes(pol);
781 return alloc_page_interleave(gfp, 0, nid);
783 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
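/*
 * Caller sketch (illustrative, not from this file): a fault path would
 * typically allocate with the mmap_sem already held for read, e.g.
 *
 *	struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 *
 * so the VMA, and any VMA policy hanging off it, cannot go away underneath
 * the policy lookup above.
 */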
787 * alloc_pages_current - Allocate pages.
790 * %GFP_USER user allocation,
791 * %GFP_KERNEL kernel allocation,
792 * %GFP_HIGHMEM highmem allocation,
793 * %GFP_FS don't call back into a file system.
794 * %GFP_ATOMIC don't sleep.
795 * @order: Power of two of allocation size in pages. 0 is a single page.
797 * Allocate a page from the kernel page pool, applying the current
798 * process' NUMA policy when not in interrupt context.
799 * Returns NULL when no page can be allocated.
801 * Don't call cpuset_update_current_mems_allowed() unless
802 * 1) it's ok to take cpuset_sem (can WAIT), and
803 * 2) allocating for current task (not interrupt).
805 struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order)
807 struct mempolicy *pol = current->mempolicy;
809 if ((gfp & __GFP_WAIT) && !in_interrupt())
810 cpuset_update_current_mems_allowed();
811 if (!pol || in_interrupt())
812 pol = &default_policy;
813 if (pol->policy == MPOL_INTERLEAVE)
814 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
815 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
817 EXPORT_SYMBOL(alloc_pages_current);
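/*
 * Note (assumption about the generic headers, not enforced here): on NUMA
 * kernels the plain alloc_pages()/alloc_page() helpers are expected to
 * resolve to alloc_pages_current(), so ordinary kernel allocations pick up
 * the process policy without any caller changes.
 */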
819 /* Slow path of a mempolicy copy */
820 struct mempolicy *__mpol_copy(struct mempolicy *old)
822 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
825 return ERR_PTR(-ENOMEM);
827 atomic_set(&new->refcnt, 1);
828 if (new->policy == MPOL_BIND) {
829 int sz = ksize(old->v.zonelist);
830 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
831 if (!new->v.zonelist) {
832 kmem_cache_free(policy_cache, new);
833 return ERR_PTR(-ENOMEM);
835 memcpy(new->v.zonelist, old->v.zonelist, sz);
840 /* Slow path of a mempolicy comparison */
841 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
845 if (a->policy != b->policy)
850 case MPOL_INTERLEAVE:
851 return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
853 return a->v.preferred_node == b->v.preferred_node;
856 for (i = 0; a->v.zonelist->zones[i]; i++)
857 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
859 return b->v.zonelist->zones[i] == NULL;
867 /* Slow path of a mpol destructor. */
868 void __mpol_free(struct mempolicy *p)
870 if (!atomic_dec_and_test(&p->refcnt))
872 if (p->policy == MPOL_BIND)
873 kfree(p->v.zonelist);
874 p->policy = MPOL_DEFAULT;
875 kmem_cache_free(policy_cache, p);
879 * Hugetlb policy. Same as above, just works with node numbers instead of
883 /* Find first node suitable for an allocation */
884 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
886 struct mempolicy *pol = get_vma_policy(vma, addr);
888 switch (pol->policy) {
890 return numa_node_id();
892 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
893 case MPOL_INTERLEAVE:
894 return interleave_nodes(pol);
896 return pol->v.preferred_node >= 0 ?
897 pol->v.preferred_node : numa_node_id();
903 /* Find secondary valid nodes for an allocation */
904 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
906 struct mempolicy *pol = get_vma_policy(vma, addr);
908 switch (pol->policy) {
911 case MPOL_INTERLEAVE:
915 for (z = pol->v.zonelist->zones; *z; z++)
916 if ((*z)->zone_pgdat->node_id == nid)
927 * Shared memory backing store policy support.
929 * Remember policies even when nobody has shared memory mapped.
930 * The policies are kept in a Red-Black tree linked from the inode.
931 * They are protected by the sp->lock spinlock, which should be held
932 * for any accesses to the tree.
935 /* lookup first element intersecting start-end */
936 /* Caller holds sp->lock */
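/*
 * Worked example (illustrative): with sp_nodes covering pages [0,4) and
 * [4,8), sp_lookup(sp, 2, 6) walks back from whichever node the rb-tree
 * descent hit and returns the [0,4) node, i.e. the first policy that
 * intersects the requested range; callers then iterate forward with
 * rb_next() as in shared_policy_replace() below.
 */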
937 static struct sp_node *
938 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
940 struct rb_node *n = sp->root.rb_node;
943 struct sp_node *p = rb_entry(n, struct sp_node, nd);
947 else if (end <= p->start)
955 struct sp_node *w = NULL;
956 struct rb_node *prev = rb_prev(n);
959 w = rb_entry(prev, struct sp_node, nd);
964 return rb_entry(n, struct sp_node, nd);
967 /* Insert a new shared policy into the list. */
968 /* Caller holds sp->lock */
969 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
971 struct rb_node **p = &sp->root.rb_node;
972 struct rb_node *parent = NULL;
977 nd = rb_entry(parent, struct sp_node, nd);
978 if (new->start < nd->start)
980 else if (new->end > nd->end)
985 rb_link_node(&new->nd, parent, p);
986 rb_insert_color(&new->nd, &sp->root);
987 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
988 new->policy ? new->policy->policy : 0);
991 /* Find shared policy intersecting idx */
993 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
995 struct mempolicy *pol = NULL;
998 if (!sp->root.rb_node)
1000 spin_lock(&sp->lock);
1001 sn = sp_lookup(sp, idx, idx+1);
1003 mpol_get(sn->policy);
1006 spin_unlock(&sp->lock);
1010 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1012 PDprintk("deleting %lx-l%x\n", n->start, n->end);
1013 rb_erase(&n->nd, &sp->root);
1014 mpol_free(n->policy);
1015 kmem_cache_free(sn_cache, n);
1019 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1021 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1032 /* Replace a policy range. */
1033 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1034 unsigned long end, struct sp_node *new)
1036 struct sp_node *n, *new2 = NULL;
1039 spin_lock(&sp->lock);
1040 n = sp_lookup(sp, start, end);
1041 /* Take care of old policies in the same range. */
1042 while (n && n->start < end) {
1043 struct rb_node *next = rb_next(&n->nd);
1044 if (n->start >= start) {
1050 /* Old policy spanning whole new range. */
1053 spin_unlock(&sp->lock);
1054 new2 = sp_alloc(end, n->end, n->policy);
1060 sp_insert(sp, new2);
1068 n = rb_entry(next, struct sp_node, nd);
1072 spin_unlock(&sp->lock);
1074 mpol_free(new2->policy);
1075 kmem_cache_free(sn_cache, new2);
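/*
 * Usage sketch (illustrative; the actual hooks live in the filesystem, not
 * here): a tmpfs-like filesystem is expected to wire this store into its
 * vm_ops, roughly
 *
 *	.set_policy = shmem_set_policy,	  // calls mpol_set_shared_policy()
 *	.get_policy = shmem_get_policy,	  // calls mpol_shared_policy_lookup()
 *
 * so that mbind() on a mapping of the file updates the per-inode policy
 * tree rather than the VMA, and the policy outlives the mapping.
 */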
1080 int mpol_set_shared_policy(struct shared_policy *info,
1081 struct vm_area_struct *vma, struct mempolicy *npol)
1084 struct sp_node *new = NULL;
1085 unsigned long sz = vma_pages(vma);
1087 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1089 sz, npol? npol->policy : -1,
1090 npol ? npol->v.nodes[0] : -1);
1093 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1097 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1099 kmem_cache_free(sn_cache, new);
1103 /* Free a backing policy store on inode delete. */
1104 void mpol_free_shared_policy(struct shared_policy *p)
1107 struct rb_node *next;
1109 if (!p->root.rb_node)
1111 spin_lock(&p->lock);
1112 next = rb_first(&p->root);
1114 n = rb_entry(next, struct sp_node, nd);
1115 next = rb_next(&n->nd);
1116 mpol_free(n->policy);
1117 kmem_cache_free(sn_cache, n);
1119 spin_unlock(&p->lock);
1123 /* assumes fs == KERNEL_DS */
1124 void __init numa_policy_init(void)
1126 policy_cache = kmem_cache_create("numa_policy",
1127 sizeof(struct mempolicy),
1128 0, SLAB_PANIC, NULL, NULL);
1130 sn_cache = kmem_cache_create("shared_policy_node",
1131 sizeof(struct sp_node),
1132 0, SLAB_PANIC, NULL, NULL);
1134 /* Set interleaving policy for system init. This way not all
1135 the data structures allocated at system boot end up in node zero. */
1137 if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1139 printk("numa_policy_init: interleaving failed\n");
1142 /* Reset policy of current process to default.
1143 * Assumes fs == KERNEL_DS */
1144 void numa_default_policy(void)
1146 sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);