mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case NUMA_NO_NODE here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66 */
  67
  68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70 #include <linux/mempolicy.h>
  71 #include <linux/mm.h>
  72 #include <linux/highmem.h>
  73 #include <linux/hugetlb.h>
  74 #include <linux/kernel.h>
  75 #include <linux/sched.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/slab.h>
  79 #include <linux/string.h>
  80 #include <linux/export.h>
  81 #include <linux/nsproxy.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/swap.h>
  86 #include <linux/seq_file.h>
  87 #include <linux/proc_fs.h>
  88 #include <linux/migrate.h>
  89 #include <linux/ksm.h>
  90 #include <linux/rmap.h>
  91 #include <linux/security.h>
  92 #include <linux/syscalls.h>
  93 #include <linux/ctype.h>
  94 #include <linux/mm_inline.h>
  95 #include <linux/mmu_notifier.h>
  96 #include <linux/printk.h>
  97
  98 #include <asm/tlbflush.h>
  99 #include <asm/uaccess.h>
 100 #include <linux/random.h>
 101
 102 #include "internal.h"
 103
 104 /* Internal flags */
 105 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 106 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 107
 108 static struct kmem_cache *policy_cache;
 109 static struct kmem_cache *sn_cache;
 110
 111 /* Highest zone. An specific allocation for a zone below that is not
 112    policied. */
 113 enum zone_type policy_zone = 0;
 114
 115 /*
 116  * run-time system-wide default policy => local allocation
 117  */
 118 static struct mempolicy default_policy = {
 119         .refcnt = ATOMIC_INIT(1), /* never free it */
 120         .mode = MPOL_PREFERRED,
 121         .flags = MPOL_F_LOCAL,
 122 };
 123
 124 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 125
 126 struct mempolicy *get_task_policy(struct task_struct *p)
 127 {
 128         struct mempolicy *pol = p->mempolicy;
 129         int node;
 130
 131         if (pol)
 132                 return pol;
 133
 134         node = numa_node_id();
 135         if (node != NUMA_NO_NODE) {
 136                 pol = &preferred_node_policy[node];
 137                 /* preferred_node_policy is not initialised early in boot */
 138                 if (pol->mode)
 139                         return pol;
 140         }
 141
 142         return &default_policy;
 143 }
 144
 145 static const struct mempolicy_operations {
 146         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 147         /*
 148          * If read-side task has no lock to protect task->mempolicy, write-side
 149          * task will rebind the task->mempolicy by two step. The first step is
 150          * setting all the newly nodes, and the second step is cleaning all the
 151          * disallowed nodes. In this way, we can avoid finding no node to alloc
 152          * page.
 153          * If we have a lock to protect task->mempolicy in read-side, we do
 154          * rebind directly.
 155          *
 156          * step:
 157          *      MPOL_REBIND_ONCE - do rebind work at once
 158          *      MPOL_REBIND_STEP1 - set all the newly nodes
 159          *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 160          */
 161         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
 162                         enum mpol_rebind_step step);
 163 } mpol_ops[MPOL_MAX];
 164
 165 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 166 {
 167         return pol->flags & MPOL_MODE_FLAGS;
 168 }
 169
 170 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 171                                    const nodemask_t *rel)
 172 {
 173         nodemask_t tmp;
 174         nodes_fold(tmp, *orig, nodes_weight(*rel));
 175         nodes_onto(*ret, tmp, *rel);
 176 }
 177
 178 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 179 {
 180         if (nodes_empty(*nodes))
 181                 return -EINVAL;
 182         pol->v.nodes = *nodes;
 183         return 0;
 184 }
 185
 186 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 187 {
 188         if (!nodes)
 189                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 190         else if (nodes_empty(*nodes))
 191                 return -EINVAL;                 /*  no allowed nodes */
 192         else
 193                 pol->v.preferred_node = first_node(*nodes);
 194         return 0;
 195 }
 196
 197 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 198 {
 199         if (nodes_empty(*nodes))
 200                 return -EINVAL;
 201         pol->v.nodes = *nodes;
 202         return 0;
 203 }
 204
 205 /*
 206  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 207  * any, for the new policy.  mpol_new() has already validated the nodes
 208  * parameter with respect to the policy mode and flags.  But, we need to
 209  * handle an empty nodemask with MPOL_PREFERRED here.
 210  *
 211  * Must be called holding task's alloc_lock to protect task's mems_allowed
 212  * and mempolicy.  May also be called holding the mmap_semaphore for write.
 213  */
 214 static int mpol_set_nodemask(struct mempolicy *pol,
 215                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
 216 {
 217         int ret;
 218
 219         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 220         if (pol == NULL)
 221                 return 0;
 222         /* Check N_MEMORY */
 223         nodes_and(nsc->mask1,
 224                   cpuset_current_mems_allowed, node_states[N_MEMORY]);
 225
 226         VM_BUG_ON(!nodes);
 227         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 228                 nodes = NULL;   /* explicit local allocation */
 229         else {
 230                 if (pol->flags & MPOL_F_RELATIVE_NODES)
 231                         mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 232                 else
 233                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
 234
 235                 if (mpol_store_user_nodemask(pol))
 236                         pol->w.user_nodemask = *nodes;
 237                 else
 238                         pol->w.cpuset_mems_allowed =
 239                                                 cpuset_current_mems_allowed;
 240         }
 241
 242         if (nodes)
 243                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 244         else
 245                 ret = mpol_ops[pol->mode].create(pol, NULL);
 246         return ret;
 247 }
 248
 249 /*
 250  * This function just creates a new policy, does some check and simple
 251  * initialization. You must invoke mpol_set_nodemask() to set nodes.
 252  */
 253 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 254                                   nodemask_t *nodes)
 255 {
 256         struct mempolicy *policy;
 257
 258         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 259                  mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 260
 261         if (mode == MPOL_DEFAULT) {
 262                 if (nodes && !nodes_empty(*nodes))
 263                         return ERR_PTR(-EINVAL);
 264                 return NULL;
 265         }
 266         VM_BUG_ON(!nodes);
 267
 268         /*
 269          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 270          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 271          * All other modes require a valid pointer to a non-empty nodemask.
 272          */
 273         if (mode == MPOL_PREFERRED) {
 274                 if (nodes_empty(*nodes)) {
 275                         if (((flags & MPOL_F_STATIC_NODES) ||
 276                              (flags & MPOL_F_RELATIVE_NODES)))
 277                                 return ERR_PTR(-EINVAL);
 278                 }
 279         } else if (mode == MPOL_LOCAL) {
 280                 if (!nodes_empty(*nodes))
 281                         return ERR_PTR(-EINVAL);
 282                 mode = MPOL_PREFERRED;
 283         } else if (nodes_empty(*nodes))
 284                 return ERR_PTR(-EINVAL);
 285         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 286         if (!policy)
 287                 return ERR_PTR(-ENOMEM);
 288         atomic_set(&policy->refcnt, 1);
 289         policy->mode = mode;
 290         policy->flags = flags;
 291
 292         return policy;
 293 }
 294
 295 /* Slow path of a mpol destructor. */
 296 void __mpol_put(struct mempolicy *p)
 297 {
 298         if (!atomic_dec_and_test(&p->refcnt))
 299                 return;
 300         kmem_cache_free(policy_cache, p);
 301 }
 302
 303 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
 304                                 enum mpol_rebind_step step)
 305 {
 306 }
 307
 308 /*
 309  * step:
 310  *      MPOL_REBIND_ONCE  - do rebind work at once
 311  *      MPOL_REBIND_STEP1 - set all the newly nodes
 312  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 313  */
 314 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 315                                  enum mpol_rebind_step step)
 316 {
 317         nodemask_t tmp;
 318
 319         if (pol->flags & MPOL_F_STATIC_NODES)
 320                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 321         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 322                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 323         else {
 324                 /*
 325                  * if step == 1, we use ->w.cpuset_mems_allowed to cache the
 326                  * result
 327                  */
 328                 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
 329                         nodes_remap(tmp, pol->v.nodes,
 330                                         pol->w.cpuset_mems_allowed, *nodes);
 331                         pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
 332                 } else if (step == MPOL_REBIND_STEP2) {
 333                         tmp = pol->w.cpuset_mems_allowed;
 334                         pol->w.cpuset_mems_allowed = *nodes;
 335                 } else
 336                         BUG();
 337         }
 338
 339         if (nodes_empty(tmp))
 340                 tmp = *nodes;
 341
 342         if (step == MPOL_REBIND_STEP1)
 343                 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
 344         else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
 345                 pol->v.nodes = tmp;
 346         else
 347                 BUG();
 348
 349         if (!node_isset(current->il_next, tmp)) {
 350                 current->il_next = next_node(current->il_next, tmp);
 351                 if (current->il_next >= MAX_NUMNODES)
 352                         current->il_next = first_node(tmp);
 353                 if (current->il_next >= MAX_NUMNODES)
 354                         current->il_next = numa_node_id();
 355         }
 356 }
 357
 358 static void mpol_rebind_preferred(struct mempolicy *pol,
 359                                   const nodemask_t *nodes,
 360                                   enum mpol_rebind_step step)
 361 {
 362         nodemask_t tmp;
 363
 364         if (pol->flags & MPOL_F_STATIC_NODES) {
 365                 int node = first_node(pol->w.user_nodemask);
 366
 367                 if (node_isset(node, *nodes)) {
 368                         pol->v.preferred_node = node;
 369                         pol->flags &= ~MPOL_F_LOCAL;
 370                 } else
 371                         pol->flags |= MPOL_F_LOCAL;
 372         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 373                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 374                 pol->v.preferred_node = first_node(tmp);
 375         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 376                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 377                                                    pol->w.cpuset_mems_allowed,
 378                                                    *nodes);
 379                 pol->w.cpuset_mems_allowed = *nodes;
 380         }
 381 }
 382
 383 /*
 384  * mpol_rebind_policy - Migrate a policy to a different set of nodes
 385  *
 386  * If read-side task has no lock to protect task->mempolicy, write-side
 387  * task will rebind the task->mempolicy by two step. The first step is
 388  * setting all the newly nodes, and the second step is cleaning all the
 389  * disallowed nodes. In this way, we can avoid finding no node to alloc
 390  * page.
 391  * If we have a lock to protect task->mempolicy in read-side, we do
 392  * rebind directly.
 393  *
 394  * step:
 395  *      MPOL_REBIND_ONCE  - do rebind work at once
 396  *      MPOL_REBIND_STEP1 - set all the newly nodes
 397  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 398  */
 399 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 400                                 enum mpol_rebind_step step)
 401 {
 402         if (!pol)
 403                 return;
 404         if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 405             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 406                 return;
 407
 408         if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
 409                 return;
 410
 411         if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
 412                 BUG();
 413
 414         if (step == MPOL_REBIND_STEP1)
 415                 pol->flags |= MPOL_F_REBINDING;
 416         else if (step == MPOL_REBIND_STEP2)
 417                 pol->flags &= ~MPOL_F_REBINDING;
 418         else if (step >= MPOL_REBIND_NSTEP)
 419                 BUG();
 420
 421         mpol_ops[pol->mode].rebind(pol, newmask, step);
 422 }
 423
 424 /*
 425  * Wrapper for mpol_rebind_policy() that just requires task
 426  * pointer, and updates task mempolicy.
 427  *
 428  * Called with task's alloc_lock held.
 429  */
 430
 431 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 432                         enum mpol_rebind_step step)
 433 {
 434         mpol_rebind_policy(tsk->mempolicy, new, step);
 435 }
 436
 437 /*
 438  * Rebind each vma in mm to new nodemask.
 439  *
 440  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 441  */
 442
 443 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 444 {
 445         struct vm_area_struct *vma;
 446
 447         down_write(&mm->mmap_sem);
 448         for (vma = mm->mmap; vma; vma = vma->vm_next)
 449                 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
 450         up_write(&mm->mmap_sem);
 451 }
 452
 453 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 454         [MPOL_DEFAULT] = {
 455                 .rebind = mpol_rebind_default,
 456         },
 457         [MPOL_INTERLEAVE] = {
 458                 .create = mpol_new_interleave,
 459                 .rebind = mpol_rebind_nodemask,
 460         },
 461         [MPOL_PREFERRED] = {
 462                 .create = mpol_new_preferred,
 463                 .rebind = mpol_rebind_preferred,
 464         },
 465         [MPOL_BIND] = {
 466                 .create = mpol_new_bind,
 467                 .rebind = mpol_rebind_nodemask,
 468         },
 469 };
 470
 471 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 472                                 unsigned long flags);
 473
 474 /*
 475  * Scan through pages checking if pages follow certain conditions,
 476  * and move them to the pagelist if they do.
 477  */
 478 static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 479                 unsigned long addr, unsigned long end,
 480                 const nodemask_t *nodes, unsigned long flags,
 481                 void *private)
 482 {
 483         pte_t *orig_pte;
 484         pte_t *pte;
 485         spinlock_t *ptl;
 486
 487         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 488         do {
 489                 struct page *page;
 490                 int nid;
 491
 492                 if (!pte_present(*pte))
 493                         continue;
 494                 page = vm_normal_page(vma, addr, *pte);
 495                 if (!page)
 496                         continue;
 497                 /*
 498                  * vm_normal_page() filters out zero pages, but there might
 499                  * still be PageReserved pages to skip, perhaps in a VDSO.
 500                  */
 501                 if (PageReserved(page))
 502                         continue;
 503                 nid = page_to_nid(page);
 504                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 505                         continue;
 506
 507                 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 508                         migrate_page_add(page, private, flags);
 509                 else
 510                         break;
 511         } while (pte++, addr += PAGE_SIZE, addr != end);
 512         pte_unmap_unlock(orig_pte, ptl);
 513         return addr != end;
 514 }
 515
 516 static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
 517                 pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
 518                                     void *private)
 519 {
 520 #ifdef CONFIG_HUGETLB_PAGE
 521         int nid;
 522         struct page *page;
 523         spinlock_t *ptl;
 524         pte_t entry;
 525
 526         ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
 527         entry = huge_ptep_get((pte_t *)pmd);
 528         if (!pte_present(entry))
 529                 goto unlock;
 530         page = pte_page(entry);
 531         nid = page_to_nid(page);
 532         if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 533                 goto unlock;
 534         /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 535         if (flags & (MPOL_MF_MOVE_ALL) ||
 536             (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 537                 isolate_huge_page(page, private);
 538 unlock:
 539         spin_unlock(ptl);
 540 #else
 541         BUG();
 542 #endif
 543 }
 544
 545 static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 546                 unsigned long addr, unsigned long end,
 547                 const nodemask_t *nodes, unsigned long flags,
 548                 void *private)
 549 {
 550         pmd_t *pmd;
 551         unsigned long next;
 552
 553         pmd = pmd_offset(pud, addr);
 554         do {
 555                 next = pmd_addr_end(addr, end);
 556                 if (!pmd_present(*pmd))
 557                         continue;
 558                 if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
 559                         queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
 560                                                 flags, private);
 561                         continue;
 562                 }
 563                 split_huge_page_pmd(vma, addr, pmd);
 564                 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 565                         continue;
 566                 if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
 567                                     flags, private))
 568                         return -EIO;
 569         } while (pmd++, addr = next, addr != end);
 570         return 0;
 571 }
 572
 573 static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 574                 unsigned long addr, unsigned long end,
 575                 const nodemask_t *nodes, unsigned long flags,
 576                 void *private)
 577 {
 578         pud_t *pud;
 579         unsigned long next;
 580
 581         pud = pud_offset(pgd, addr);
 582         do {
 583                 next = pud_addr_end(addr, end);
 584                 if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
 585                         continue;
 586                 if (pud_none_or_clear_bad(pud))
 587                         continue;
 588                 if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
 589                                     flags, private))
 590                         return -EIO;
 591         } while (pud++, addr = next, addr != end);
 592         return 0;
 593 }
 594
 595 static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
 596                 unsigned long addr, unsigned long end,
 597                 const nodemask_t *nodes, unsigned long flags,
 598                 void *private)
 599 {
 600         pgd_t *pgd;
 601         unsigned long next;
 602
 603         pgd = pgd_offset(vma->vm_mm, addr);
 604         do {
 605                 next = pgd_addr_end(addr, end);
 606                 if (pgd_none_or_clear_bad(pgd))
 607                         continue;
 608                 if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
 609                                     flags, private))
 610                         return -EIO;
 611         } while (pgd++, addr = next, addr != end);
 612         return 0;
 613 }
 614
 615 #ifdef CONFIG_NUMA_BALANCING
 616 /*
 617  * This is used to mark a range of virtual addresses to be inaccessible.
 618  * These are later cleared by a NUMA hinting fault. Depending on these
 619  * faults, pages may be migrated for better NUMA placement.
 620  *
 621  * This is assuming that NUMA faults are handled using PROT_NONE. If
 622  * an architecture makes a different choice, it will need further
 623  * changes to the core.
 624  */
 625 unsigned long change_prot_numa(struct vm_area_struct *vma,
 626                         unsigned long addr, unsigned long end)
 627 {
 628         int nr_updated;
 629
 630         nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
 631         if (nr_updated)
 632                 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 633
 634         return nr_updated;
 635 }
 636 #else
 637 static unsigned long change_prot_numa(struct vm_area_struct *vma,
 638                         unsigned long addr, unsigned long end)
 639 {
 640         return 0;
 641 }
 642 #endif /* CONFIG_NUMA_BALANCING */
 643
 644 /*
 645  * Walk through page tables and collect pages to be migrated.
 646  *
 647  * If pages found in a given range are on a set of nodes (determined by
 648  * @nodes and @flags,) it's isolated and queued to the pagelist which is
 649  * passed via @private.)
 650  */
 651 static int
 652 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 653                 const nodemask_t *nodes, unsigned long flags, void *private)
 654 {
 655         int err = 0;
 656         struct vm_area_struct *vma, *prev;
 657
 658         vma = find_vma(mm, start);
 659         if (!vma)
 660                 return -EFAULT;
 661         prev = NULL;
 662         for (; vma && vma->vm_start < end; vma = vma->vm_next) {
 663                 unsigned long endvma = vma->vm_end;
 664
 665                 if (endvma > end)
 666                         endvma = end;
 667                 if (vma->vm_start > start)
 668                         start = vma->vm_start;
 669
 670                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 671                         if (!vma->vm_next && vma->vm_end < end)
 672                                 return -EFAULT;
 673                         if (prev && prev->vm_end < vma->vm_start)
 674                                 return -EFAULT;
 675                 }
 676
 677                 if (flags & MPOL_MF_LAZY) {
 678                         /* Similar to task_numa_work, skip inaccessible VMAs */
 679                         if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
 680                                 change_prot_numa(vma, start, endvma);
 681                         goto next;
 682                 }
 683
 684                 if ((flags & MPOL_MF_STRICT) ||
 685                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 686                       vma_migratable(vma))) {
 687
 688                         err = queue_pages_pgd_range(vma, start, endvma, nodes,
 689                                                 flags, private);
 690                         if (err)
 691                                 break;
 692                 }
 693 next:
 694                 prev = vma;
 695         }
 696         return err;
 697 }
 698
 699 /*
 700  * Apply policy to a single VMA
 701  * This must be called with the mmap_sem held for writing.
 702  */
 703 static int vma_replace_policy(struct vm_area_struct *vma,
 704                                                 struct mempolicy *pol)
 705 {
 706         int err;
 707         struct mempolicy *old;
 708         struct mempolicy *new;
 709
 710         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 711                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 712                  vma->vm_ops, vma->vm_file,
 713                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 714
 715         new = mpol_dup(pol);
 716         if (IS_ERR(new))
 717                 return PTR_ERR(new);
 718
 719         if (vma->vm_ops && vma->vm_ops->set_policy) {
 720                 err = vma->vm_ops->set_policy(vma, new);
 721                 if (err)
 722                         goto err_out;
 723         }
 724
 725         old = vma->vm_policy;
 726         vma->vm_policy = new; /* protected by mmap_sem */
 727         mpol_put(old);
 728
 729         return 0;
 730  err_out:
 731         mpol_put(new);
 732         return err;
 733 }
 734
 735 /* Step 2: apply policy to a range and do splits. */
 736 static int mbind_range(struct mm_struct *mm, unsigned long start,
 737                        unsigned long end, struct mempolicy *new_pol)
 738 {
 739         struct vm_area_struct *next;
 740         struct vm_area_struct *prev;
 741         struct vm_area_struct *vma;
 742         int err = 0;
 743         pgoff_t pgoff;
 744         unsigned long vmstart;
 745         unsigned long vmend;
 746
 747         vma = find_vma(mm, start);
 748         if (!vma || vma->vm_start > start)
 749                 return -EFAULT;
 750
 751         prev = vma->vm_prev;
 752         if (start > vma->vm_start)
 753                 prev = vma;
 754
 755         for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 756                 next = vma->vm_next;
 757                 vmstart = max(start, vma->vm_start);
 758                 vmend   = min(end, vma->vm_end);
 759
 760                 if (mpol_equal(vma_policy(vma), new_pol))
 761                         continue;
 762
 763                 pgoff = vma->vm_pgoff +
 764                         ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 765                 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 766                                   vma->anon_vma, vma->vm_file, pgoff,
 767                                   new_pol);
 768                 if (prev) {
 769                         vma = prev;
 770                         next = vma->vm_next;
 771                         if (mpol_equal(vma_policy(vma), new_pol))
 772                                 continue;
 773                         /* vma_merge() joined vma && vma->next, case 8 */
 774                         goto replace;
 775                 }
 776                 if (vma->vm_start != vmstart) {
 777                         err = split_vma(vma->vm_mm, vma, vmstart, 1);
 778                         if (err)
 779                                 goto out;
 780                 }
 781                 if (vma->vm_end != vmend) {
 782                         err = split_vma(vma->vm_mm, vma, vmend, 0);
 783                         if (err)
 784                                 goto out;
 785                 }
 786  replace:
 787                 err = vma_replace_policy(vma, new_pol);
 788                 if (err)
 789                         goto out;
 790         }
 791
 792  out:
 793         return err;
 794 }
 795
 796 /* Set the process memory policy */
 797 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 798                              nodemask_t *nodes)
 799 {
 800         struct mempolicy *new, *old;
 801         NODEMASK_SCRATCH(scratch);
 802         int ret;
 803
 804         if (!scratch)
 805                 return -ENOMEM;
 806
 807         new = mpol_new(mode, flags, nodes);
 808         if (IS_ERR(new)) {
 809                 ret = PTR_ERR(new);
 810                 goto out;
 811         }
 812
 813         task_lock(current);
 814         ret = mpol_set_nodemask(new, nodes, scratch);
 815         if (ret) {
 816                 task_unlock(current);
 817                 mpol_put(new);
 818                 goto out;
 819         }
 820         old = current->mempolicy;
 821         current->mempolicy = new;
 822         if (new && new->mode == MPOL_INTERLEAVE &&
 823             nodes_weight(new->v.nodes))
 824                 current->il_next = first_node(new->v.nodes);
 825         task_unlock(current);
 826         mpol_put(old);
 827         ret = 0;
 828 out:
 829         NODEMASK_SCRATCH_FREE(scratch);
 830         return ret;
 831 }
 832
 833 /*
 834  * Return nodemask for policy for get_mempolicy() query
 835  *
 836  * Called with task's alloc_lock held
 837  */
 838 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 839 {
 840         nodes_clear(*nodes);
 841         if (p == &default_policy)
 842                 return;
 843
 844         switch (p->mode) {
 845         case MPOL_BIND:
 846                 /* Fall through */
 847         case MPOL_INTERLEAVE:
 848                 *nodes = p->v.nodes;
 849                 break;
 850         case MPOL_PREFERRED:
 851                 if (!(p->flags & MPOL_F_LOCAL))
 852                         node_set(p->v.preferred_node, *nodes);
 853                 /* else return empty node mask for local allocation */
 854                 break;
 855         default:
 856                 BUG();
 857         }
 858 }
 859
 860 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 861 {
 862         struct page *p;
 863         int err;
 864
 865         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 866         if (err >= 0) {
 867                 err = page_to_nid(p);
 868                 put_page(p);
 869         }
 870         return err;
 871 }
 872
 873 /* Retrieve NUMA policy */
 874 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 875                              unsigned long addr, unsigned long flags)
 876 {
 877         int err;
 878         struct mm_struct *mm = current->mm;
 879         struct vm_area_struct *vma = NULL;
 880         struct mempolicy *pol = current->mempolicy;
 881
 882         if (flags &
 883                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 884                 return -EINVAL;
 885
 886         if (flags & MPOL_F_MEMS_ALLOWED) {
 887                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 888                         return -EINVAL;
 889                 *policy = 0;    /* just so it's initialized */
 890                 task_lock(current);
 891                 *nmask  = cpuset_current_mems_allowed;
 892                 task_unlock(current);
 893                 return 0;
 894         }
 895
 896         if (flags & MPOL_F_ADDR) {
 897                 /*
 898                  * Do NOT fall back to task policy if the
 899                  * vma/shared policy at addr is NULL.  We
 900                  * want to return MPOL_DEFAULT in this case.
 901                  */
 902                 down_read(&mm->mmap_sem);
 903                 vma = find_vma_intersection(mm, addr, addr+1);
 904                 if (!vma) {
 905                         up_read(&mm->mmap_sem);
 906                         return -EFAULT;
 907                 }
 908                 if (vma->vm_ops && vma->vm_ops->get_policy)
 909                         pol = vma->vm_ops->get_policy(vma, addr);
 910                 else
 911                         pol = vma->vm_policy;
 912         } else if (addr)
 913                 return -EINVAL;
 914
 915         if (!pol)
 916                 pol = &default_policy;  /* indicates default behavior */
 917
 918         if (flags & MPOL_F_NODE) {
 919                 if (flags & MPOL_F_ADDR) {
 920                         err = lookup_node(mm, addr);
 921                         if (err < 0)
 922                                 goto out;
 923                         *policy = err;
 924                 } else if (pol == current->mempolicy &&
 925                                 pol->mode == MPOL_INTERLEAVE) {
 926                         *policy = current->il_next;
 927                 } else {
 928                         err = -EINVAL;
 929                         goto out;
 930                 }
 931         } else {
 932                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 933                                                 pol->mode;
 934                 /*
 935                  * Internal mempolicy flags must be masked off before exposing
 936                  * the policy to userspace.
 937                  */
 938                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 939         }
 940
 941         if (vma) {
 942                 up_read(&current->mm->mmap_sem);
 943                 vma = NULL;
 944         }
 945
 946         err = 0;
 947         if (nmask) {
 948                 if (mpol_store_user_nodemask(pol)) {
 949                         *nmask = pol->w.user_nodemask;
 950                 } else {
 951                         task_lock(current);
 952                         get_policy_nodemask(pol, nmask);
 953                         task_unlock(current);
 954                 }
 955         }
 956
 957  out:
 958         mpol_cond_put(pol);
 959         if (vma)
 960                 up_read(&current->mm->mmap_sem);
 961         return err;
 962 }
 963
 964 #ifdef CONFIG_MIGRATION
 965 /*
 966  * page migration
 967  */
 968 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 969                                 unsigned long flags)
 970 {
 971         /*
 972          * Avoid migrating a page that is shared with others.
 973          */
 974         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 975                 if (!isolate_lru_page(page)) {
 976                         list_add_tail(&page->lru, pagelist);
 977                         inc_zone_page_state(page, NR_ISOLATED_ANON +
 978                                             page_is_file_cache(page));
 979                 }
 980         }
 981 }
 982
 983 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 984 {
 985         if (PageHuge(page))
 986                 return alloc_huge_page_node(page_hstate(compound_head(page)),
 987                                         node);
 988         else
 989                 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
 990 }
 991
 992 /*
 993  * Migrate pages from one node to a target node.
 994  * Returns error or the number of pages not migrated.
 995  */
 996 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 997                            int flags)
 998 {
 999         nodemask_t nmask;
1000         LIST_HEAD(pagelist);
1001         int err = 0;
1002
1003         nodes_clear(nmask);
1004         node_set(source, nmask);
1005
1006         /*
1007          * This does not "check" the range but isolates all pages that
1008          * need migration.  Between passing in the full user address
1009          * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1010          */
1011         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1012         queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1013                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1014
1015         if (!list_empty(&pagelist)) {
1016                 err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1017                                         MIGRATE_SYNC, MR_SYSCALL);
1018                 if (err)
1019                         putback_movable_pages(&pagelist);
1020         }
1021
1022         return err;
1023 }
1024
1025 /*
1026  * Move pages between the two nodesets so as to preserve the physical
1027  * layout as much as possible.
1028  *
1029  * Returns the number of page that could not be moved.
1030  */
1031 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1032                      const nodemask_t *to, int flags)
1033 {
1034         int busy = 0;
1035         int err;
1036         nodemask_t tmp;
1037
1038         err = migrate_prep();
1039         if (err)
1040                 return err;
1041
1042         down_read(&mm->mmap_sem);
1043
1044         /*
1045          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1046          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1047          * bit in 'tmp', and return that <source, dest> pair for migration.
1048          * The pair of nodemasks 'to' and 'from' define the map.
1049          *
1050          * If no pair of bits is found that way, fallback to picking some
1051          * pair of 'source' and 'dest' bits that are not the same.  If the
1052          * 'source' and 'dest' bits are the same, this represents a node
1053          * that will be migrating to itself, so no pages need move.
1054          *
1055          * If no bits are left in 'tmp', or if all remaining bits left
1056          * in 'tmp' correspond to the same bit in 'to', return false
1057          * (nothing left to migrate).
1058          *
1059          * This lets us pick a pair of nodes to migrate between, such that
1060          * if possible the dest node is not already occupied by some other
1061          * source node, minimizing the risk of overloading the memory on a
1062          * node that would happen if we migrated incoming memory to a node
1063          * before migrating outgoing memory source that same node.
1064          *
1065          * A single scan of tmp is sufficient.  As we go, we remember the
1066          * most recent <s, d> pair that moved (s != d).  If we find a pair
1067          * that not only moved, but what's better, moved to an empty slot
1068          * (d is not set in tmp), then we break out then, with that pair.
1069          * Otherwise when we finish scanning from_tmp, we at least have the
1070          * most recent <s, d> pair that moved.  If we get all the way through
1071          * the scan of tmp without finding any node that moved, much less
1072          * moved to an empty node, then there is nothing left worth migrating.
1073          */
1074
1075         tmp = *from;
1076         while (!nodes_empty(tmp)) {
1077                 int s,d;
1078                 int source = NUMA_NO_NODE;
1079                 int dest = 0;
1080
1081                 for_each_node_mask(s, tmp) {
1082
1083                         /*
1084                          * do_migrate_pages() tries to maintain the relative
1085                          * node relationship of the pages established between
1086                          * threads and memory areas.
1087                          *
1088                          * However if the number of source nodes is not equal to
1089                          * the number of destination nodes we can not preserve
1090                          * this node relative relationship.  In that case, skip
1091                          * copying memory from a node that is in the destination
1092                          * mask.
1093                          *
1094                          * Example: [2,3,4] -> [3,4,5] moves everything.
1095                          *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1096                          */
1097
1098                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
1099                                                 (node_isset(s, *to)))
1100                                 continue;
1101
1102                         d = node_remap(s, *from, *to);
1103                         if (s == d)
1104                                 continue;
1105
1106                         source = s;     /* Node moved. Memorize */
1107                         dest = d;
1108
1109                         /* dest not in remaining from nodes? */
1110                         if (!node_isset(dest, tmp))
1111                                 break;
1112                 }
1113                 if (source == NUMA_NO_NODE)
1114                         break;
1115
1116                 node_clear(source, tmp);
1117                 err = migrate_to_node(mm, source, dest, flags);
1118                 if (err > 0)
1119                         busy += err;
1120                 if (err < 0)
1121                         break;
1122         }
1123         up_read(&mm->mmap_sem);
1124         if (err < 0)
1125                 return err;
1126         return busy;
1127
1128 }
1129
1130 /*
1131  * Allocate a new page for page migration based on vma policy.
1132  * Start by assuming the page is mapped by the same vma as contains @start.
1133  * Search forward from there, if not.  N.B., this assumes that the
1134  * list of pages handed to migrate_pages()--which is how we get here--
1135  * is in virtual address order.
1136  */
1137 static struct page *new_page(struct page *page, unsigned long start, int **x)
1138 {
1139         struct vm_area_struct *vma;
1140         unsigned long uninitialized_var(address);
1141
1142         vma = find_vma(current->mm, start);
1143         while (vma) {
1144                 address = page_address_in_vma(page, vma);
1145                 if (address != -EFAULT)
1146                         break;
1147                 vma = vma->vm_next;
1148         }
1149
1150         if (PageHuge(page)) {
1151                 BUG_ON(!vma);
1152                 return alloc_huge_page_noerr(vma, address, 1);
1153         }
1154         /*
1155          * if !vma, alloc_page_vma() will use task or system default policy
1156          */
1157         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1158 }
1159 #else
1160
/* Stub for builds without CONFIG_MIGRATION: no page is ever queued. */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
}
1165
1166 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1167                      const nodemask_t *to, int flags)
1168 {
1169         return -ENOSYS;
1170 }
1171
1172 static struct page *new_page(struct page *page, unsigned long start, int **x)
1173 {
1174         return NULL;
1175 }
1176 #endif
1177
1178 static long do_mbind(unsigned long start, unsigned long len,
1179                      unsigned short mode, unsigned short mode_flags,
1180                      nodemask_t *nmask, unsigned long flags)
1181 {
1182         struct mm_struct *mm = current->mm;
1183         struct mempolicy *new;
1184         unsigned long end;
1185         int err;
1186         LIST_HEAD(pagelist);
1187
1188         if (flags & ~(unsigned long)MPOL_MF_VALID)
1189                 return -EINVAL;
1190         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1191                 return -EPERM;
1192
1193         if (start & ~PAGE_MASK)
1194                 return -EINVAL;
1195
1196         if (mode == MPOL_DEFAULT)
1197                 flags &= ~MPOL_MF_STRICT;
1198
1199         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1200         end = start + len;
1201
1202         if (end < start)
1203                 return -EINVAL;
1204         if (end == start)
1205                 return 0;
1206
1207         new = mpol_new(mode, mode_flags, nmask);
1208         if (IS_ERR(new))
1209                 return PTR_ERR(new);
1210
1211         if (flags & MPOL_MF_LAZY)
1212                 new->flags |= MPOL_F_MOF;
1213
1214         /*
1215          * If we are using the default policy then operation
1216          * on discontinuous address spaces is okay after all
1217          */
1218         if (!new)
1219                 flags |= MPOL_MF_DISCONTIG_OK;
1220
1221         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1222                  start, start + len, mode, mode_flags,
1223                  nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1224
1225         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1226
1227                 err = migrate_prep();
1228                 if (err)
1229                         goto mpol_out;
1230         }
1231         {
1232                 NODEMASK_SCRATCH(scratch);
1233                 if (scratch) {
1234                         down_write(&mm->mmap_sem);
1235                         task_lock(current);
1236                         err = mpol_set_nodemask(new, nmask, scratch);
1237                         task_unlock(current);
1238                         if (err)
1239                                 up_write(&mm->mmap_sem);
1240                 } else
1241                         err = -ENOMEM;
1242                 NODEMASK_SCRATCH_FREE(scratch);
1243         }
1244         if (err)
1245                 goto mpol_out;
1246
1247         err = queue_pages_range(mm, start, end, nmask,
1248                           flags | MPOL_MF_INVERT, &pagelist);
1249         if (!err)
1250                 err = mbind_range(mm, start, end, new);
1251
1252         if (!err) {
1253                 int nr_failed = 0;
1254
1255                 if (!list_empty(&pagelist)) {
1256                         WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1257                         nr_failed = migrate_pages(&pagelist, new_page, NULL,
1258                                 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1259                         if (nr_failed)
1260                                 putback_movable_pages(&pagelist);
1261                 }
1262
1263                 if (nr_failed && (flags & MPOL_MF_STRICT))
1264                         err = -EIO;
1265         } else
1266                 putback_movable_pages(&pagelist);
1267
1268         up_write(&mm->mmap_sem);
1269  mpol_out:
1270         mpol_put(new);
1271         return err;
1272 }
1273
1274 /*
1275  * User space interface with variable sized bitmaps for nodelists.
1276  */
1277
1278 /* Copy a node mask from user space. */
1279 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1280                      unsigned long maxnode)
1281 {
1282         unsigned long k;
1283         unsigned long nlongs;
1284         unsigned long endmask;
1285
1286         --maxnode;
1287         nodes_clear(*nodes);
1288         if (maxnode == 0 || !nmask)
1289                 return 0;
1290         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1291                 return -EINVAL;
1292
1293         nlongs = BITS_TO_LONGS(maxnode);
1294         if ((maxnode % BITS_PER_LONG) == 0)
1295                 endmask = ~0UL;
1296         else
1297                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1298
1299         /* When the user specified more nodes than supported just check
1300            if the non supported part is all zero. */
1301         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1302                 if (nlongs > PAGE_SIZE/sizeof(long))
1303                         return -EINVAL;
1304                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1305                         unsigned long t;
1306                         if (get_user(t, nmask + k))
1307                                 return -EFAULT;
1308                         if (k == nlongs - 1) {
1309                                 if (t & endmask)
1310                                         return -EINVAL;
1311                         } else if (t)
1312                                 return -EINVAL;
1313                 }
1314                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1315                 endmask = ~0UL;
1316         }
1317
1318         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1319                 return -EFAULT;
1320         nodes_addr(*nodes)[nlongs-1] &= endmask;
1321         return 0;
1322 }
1323
1324 /* Copy a kernel node mask to user space */
1325 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1326                               nodemask_t *nodes)
1327 {
1328         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1329         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1330
1331         if (copy > nbytes) {
1332                 if (copy > PAGE_SIZE)
1333                         return -EINVAL;
1334                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1335                         return -EFAULT;
1336                 copy = nbytes;
1337         }
1338         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1339 }
1340
1341 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1342                 unsigned long, mode, const unsigned long __user *, nmask,
1343                 unsigned long, maxnode, unsigned, flags)
1344 {
1345         nodemask_t nodes;
1346         int err;
1347         unsigned short mode_flags;
1348
1349         mode_flags = mode & MPOL_MODE_FLAGS;
1350         mode &= ~MPOL_MODE_FLAGS;
1351         if (mode >= MPOL_MAX)
1352                 return -EINVAL;
1353         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1354             (mode_flags & MPOL_F_RELATIVE_NODES))
1355                 return -EINVAL;
1356         err = get_nodes(&nodes, nmask, maxnode);
1357         if (err)
1358                 return err;
1359         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1360 }
1361
1362 /* Set the process memory policy */
1363 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1364                 unsigned long, maxnode)
1365 {
1366         int err;
1367         nodemask_t nodes;
1368         unsigned short flags;
1369
1370         flags = mode & MPOL_MODE_FLAGS;
1371         mode &= ~MPOL_MODE_FLAGS;
1372         if ((unsigned int)mode >= MPOL_MAX)
1373                 return -EINVAL;
1374         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1375                 return -EINVAL;
1376         err = get_nodes(&nodes, nmask, maxnode);
1377         if (err)
1378                 return err;
1379         return do_set_mempolicy(mode, flags, &nodes);
1380 }
1381
1382 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1383                 const unsigned long __user *, old_nodes,
1384                 const unsigned long __user *, new_nodes)
1385 {
1386         const struct cred *cred = current_cred(), *tcred;
1387         struct mm_struct *mm = NULL;
1388         struct task_struct *task;
1389         nodemask_t task_nodes;
1390         int err;
1391         nodemask_t *old;
1392         nodemask_t *new;
1393         NODEMASK_SCRATCH(scratch);
1394
1395         if (!scratch)
1396                 return -ENOMEM;
1397
1398         old = &scratch->mask1;
1399         new = &scratch->mask2;
1400
1401         err = get_nodes(old, old_nodes, maxnode);
1402         if (err)
1403                 goto out;
1404
1405         err = get_nodes(new, new_nodes, maxnode);
1406         if (err)
1407                 goto out;
1408
1409         /* Find the mm_struct */
1410         rcu_read_lock();
1411         task = pid ? find_task_by_vpid(pid) : current;
1412         if (!task) {
1413                 rcu_read_unlock();
1414                 err = -ESRCH;
1415                 goto out;
1416         }
1417         get_task_struct(task);
1418
1419         err = -EINVAL;
1420
1421         /*
1422          * Check if this process has the right to modify the specified
1423          * process. The right exists if the process has administrative
1424          * capabilities, superuser privileges or the same
1425          * userid as the target process.
1426          */
1427         tcred = __task_cred(task);
1428         if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1429             !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1430             !capable(CAP_SYS_NICE)) {
1431                 rcu_read_unlock();
1432                 err = -EPERM;
1433                 goto out_put;
1434         }
1435         rcu_read_unlock();
1436
1437         task_nodes = cpuset_mems_allowed(task);
1438         /* Is the user allowed to access the target nodes? */
1439         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1440                 err = -EPERM;
1441                 goto out_put;
1442         }
1443
1444         if (!nodes_subset(*new, node_states[N_MEMORY])) {
1445                 err = -EINVAL;
1446                 goto out_put;
1447         }
1448
1449         err = security_task_movememory(task);
1450         if (err)
1451                 goto out_put;
1452
1453         mm = get_task_mm(task);
1454         put_task_struct(task);
1455
1456         if (!mm) {
1457                 err = -EINVAL;
1458                 goto out;
1459         }
1460
1461         err = do_migrate_pages(mm, old, new,
1462                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1463
1464         mmput(mm);
1465 out:
1466         NODEMASK_SCRATCH_FREE(scratch);
1467
1468         return err;
1469
1470 out_put:
1471         put_task_struct(task);
1472         goto out;
1473
1474 }
1475
1476
1477 /* Retrieve NUMA policy */
1478 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1479                 unsigned long __user *, nmask, unsigned long, maxnode,
1480                 unsigned long, addr, unsigned long, flags)
1481 {
1482         int err;
1483         int uninitialized_var(pval);
1484         nodemask_t nodes;
1485
1486         if (nmask != NULL && maxnode < MAX_NUMNODES)
1487                 return -EINVAL;
1488
1489         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1490
1491         if (err)
1492                 return err;
1493
1494         if (policy && put_user(pval, policy))
1495                 return -EFAULT;
1496
1497         if (nmask)
1498                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1499
1500         return err;
1501 }
1502
1503 #ifdef CONFIG_COMPAT
1504
1505 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1506                        compat_ulong_t __user *, nmask,
1507                        compat_ulong_t, maxnode,
1508                        compat_ulong_t, addr, compat_ulong_t, flags)
1509 {
1510         long err;
1511         unsigned long __user *nm = NULL;
1512         unsigned long nr_bits, alloc_size;
1513         DECLARE_BITMAP(bm, MAX_NUMNODES);
1514
1515         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1516         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1517
1518         if (nmask)
1519                 nm = compat_alloc_user_space(alloc_size);
1520
1521         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1522
1523         if (!err && nmask) {
1524                 unsigned long copy_size;
1525                 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1526                 err = copy_from_user(bm, nm, copy_size);
1527                 /* ensure entire bitmap is zeroed */
1528                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1529                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1530         }
1531
1532         return err;
1533 }
1534
1535 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1536                        compat_ulong_t, maxnode)
1537 {
1538         long err = 0;
1539         unsigned long __user *nm = NULL;
1540         unsigned long nr_bits, alloc_size;
1541         DECLARE_BITMAP(bm, MAX_NUMNODES);
1542
1543         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1544         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1545
1546         if (nmask) {
1547                 err = compat_get_bitmap(bm, nmask, nr_bits);
1548                 nm = compat_alloc_user_space(alloc_size);
1549                 err |= copy_to_user(nm, bm, alloc_size);
1550         }
1551
1552         if (err)
1553                 return -EFAULT;
1554
1555         return sys_set_mempolicy(mode, nm, nr_bits+1);
1556 }
1557
1558 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1559                        compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1560                        compat_ulong_t, maxnode, compat_ulong_t, flags)
1561 {
1562         long err = 0;
1563         unsigned long __user *nm = NULL;
1564         unsigned long nr_bits, alloc_size;
1565         nodemask_t bm;
1566
1567         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1568         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1569
1570         if (nmask) {
1571                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1572                 nm = compat_alloc_user_space(alloc_size);
1573                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1574         }
1575
1576         if (err)
1577                 return -EFAULT;
1578
1579         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1580 }
1581
1582 #endif
1583
1584 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1585                                                 unsigned long addr)
1586 {
1587         struct mempolicy *pol = NULL;
1588
1589         if (vma) {
1590                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1591                         pol = vma->vm_ops->get_policy(vma, addr);
1592                 } else if (vma->vm_policy) {
1593                         pol = vma->vm_policy;
1594
1595                         /*
1596                          * shmem_alloc_page() passes MPOL_F_SHARED policy with
1597                          * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1598                          * count on these policies which will be dropped by
1599                          * mpol_cond_put() later
1600                          */
1601                         if (mpol_needs_cond_ref(pol))
1602                                 mpol_get(pol);
1603                 }
1604         }
1605
1606         return pol;
1607 }
1608
1609 /*
1610  * get_vma_policy(@vma, @addr)
1611  * @vma: virtual memory area whose policy is sought
1612  * @addr: address in @vma for shared policy lookup
1613  *
1614  * Returns effective policy for a VMA at specified address.
1615  * Falls back to current->mempolicy or system default policy, as necessary.
1616  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1617  * count--added by the get_policy() vm_op, as appropriate--to protect against
1618  * freeing by another task.  It is the caller's responsibility to free the
1619  * extra reference for shared policies.
1620  */
1621 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1622                                                 unsigned long addr)
1623 {
1624         struct mempolicy *pol = __get_vma_policy(vma, addr);
1625
1626         if (!pol)
1627                 pol = get_task_policy(current);
1628
1629         return pol;
1630 }
1631
1632 bool vma_policy_mof(struct vm_area_struct *vma)
1633 {
1634         struct mempolicy *pol;
1635
1636         if (vma->vm_ops && vma->vm_ops->get_policy) {
1637                 bool ret = false;
1638
1639                 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1640                 if (pol && (pol->flags & MPOL_F_MOF))
1641                         ret = true;
1642                 mpol_cond_put(pol);
1643
1644                 return ret;
1645         }
1646
1647         pol = vma->vm_policy;
1648         if (!pol)
1649                 pol = get_task_policy(current);
1650
1651         return pol->flags & MPOL_F_MOF;
1652 }
1653
1654 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1655 {
1656         enum zone_type dynamic_policy_zone = policy_zone;
1657
1658         BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1659
1660         /*
1661          * if policy->v.nodes has movable memory only,
1662          * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1663          *
1664          * policy->v.nodes is intersect with node_states[N_MEMORY].
1665          * so if the following test faile, it implies
1666          * policy->v.nodes has movable memory only.
1667          */
1668         if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1669                 dynamic_policy_zone = ZONE_MOVABLE;
1670
1671         return zone >= dynamic_policy_zone;
1672 }
1673
1674 /*
1675  * Return a nodemask representing a mempolicy for filtering nodes for
1676  * page allocation
1677  */
1678 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1679 {
1680         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1681         if (unlikely(policy->mode == MPOL_BIND) &&
1682                         apply_policy_zone(policy, gfp_zone(gfp)) &&
1683                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1684                 return &policy->v.nodes;
1685
1686         return NULL;
1687 }
1688
1689 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1690 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1691         int nd)
1692 {
1693         switch (policy->mode) {
1694         case MPOL_PREFERRED:
1695                 if (!(policy->flags & MPOL_F_LOCAL))
1696                         nd = policy->v.preferred_node;
1697                 break;
1698         case MPOL_BIND:
1699                 /*
1700                  * Normally, MPOL_BIND allocations are node-local within the
1701                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1702                  * current node isn't part of the mask, we use the zonelist for
1703                  * the first node in the mask instead.
1704                  */
1705                 if (unlikely(gfp & __GFP_THISNODE) &&
1706                                 unlikely(!node_isset(nd, policy->v.nodes)))
1707                         nd = first_node(policy->v.nodes);
1708                 break;
1709         default:
1710                 BUG();
1711         }
1712         return node_zonelist(nd, gfp);
1713 }
1714
1715 /* Do dynamic interleaving for a process */
1716 static unsigned interleave_nodes(struct mempolicy *policy)
1717 {
1718         unsigned nid, next;
1719         struct task_struct *me = current;
1720
1721         nid = me->il_next;
1722         next = next_node(nid, policy->v.nodes);
1723         if (next >= MAX_NUMNODES)
1724                 next = first_node(policy->v.nodes);
1725         if (next < MAX_NUMNODES)
1726                 me->il_next = next;
1727         return nid;
1728 }
1729
1730 /*
1731  * Depending on the memory policy provide a node from which to allocate the
1732  * next slab entry.
1733  */
1734 unsigned int mempolicy_slab_node(void)
1735 {
1736         struct mempolicy *policy;
1737         int node = numa_mem_id();
1738
1739         if (in_interrupt())
1740                 return node;
1741
1742         policy = current->mempolicy;
1743         if (!policy || policy->flags & MPOL_F_LOCAL)
1744                 return node;
1745
1746         switch (policy->mode) {
1747         case MPOL_PREFERRED:
1748                 /*
1749                  * handled MPOL_F_LOCAL above
1750                  */
1751                 return policy->v.preferred_node;
1752
1753         case MPOL_INTERLEAVE:
1754                 return interleave_nodes(policy);
1755
1756         case MPOL_BIND: {
1757                 /*
1758                  * Follow bind policy behavior and start allocation at the
1759                  * first node.
1760                  */
1761                 struct zonelist *zonelist;
1762                 struct zone *zone;
1763                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1764                 zonelist = &NODE_DATA(node)->node_zonelists[0];
1765                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1766                                                         &policy->v.nodes,
1767                                                         &zone);
1768                 return zone ? zone->node : node;
1769         }
1770
1771         default:
1772                 BUG();
1773         }
1774 }
1775
1776 /* Do static interleaving for a VMA with known offset. */
1777 static unsigned offset_il_node(struct mempolicy *pol,
1778                 struct vm_area_struct *vma, unsigned long off)
1779 {
1780         unsigned nnodes = nodes_weight(pol->v.nodes);
1781         unsigned target;
1782         int c;
1783         int nid = NUMA_NO_NODE;
1784
1785         if (!nnodes)
1786                 return numa_node_id();
1787         target = (unsigned int)off % nnodes;
1788         c = 0;
1789         do {
1790                 nid = next_node(nid, pol->v.nodes);
1791                 c++;
1792         } while (c <= target);
1793         return nid;
1794 }
1795
1796 /* Determine a node number for interleave */
1797 static inline unsigned interleave_nid(struct mempolicy *pol,
1798                  struct vm_area_struct *vma, unsigned long addr, int shift)
1799 {
1800         if (vma) {
1801                 unsigned long off;
1802
1803                 /*
1804                  * for small pages, there is no difference between
1805                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1806                  * for huge pages, since vm_pgoff is in units of small
1807                  * pages, we need to shift off the always 0 bits to get
1808                  * a useful offset.
1809                  */
1810                 BUG_ON(shift < PAGE_SHIFT);
1811                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1812                 off += (addr - vma->vm_start) >> shift;
1813                 return offset_il_node(pol, vma, off);
1814         } else
1815                 return interleave_nodes(pol);
1816 }
1817
1818 /*
1819  * Return the bit number of a random bit set in the nodemask.
1820  * (returns NUMA_NO_NODE if nodemask is empty)
1821  */
1822 int node_random(const nodemask_t *maskp)
1823 {
1824         int w, bit = NUMA_NO_NODE;
1825
1826         w = nodes_weight(*maskp);
1827         if (w)
1828                 bit = bitmap_ord_to_pos(maskp->bits,
1829                         get_random_int() % w, MAX_NUMNODES);
1830         return bit;
1831 }
1832
1833 #ifdef CONFIG_HUGETLBFS
1834 /*
1835  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1836  * @vma: virtual memory area whose policy is sought
1837  * @addr: address in @vma for shared policy lookup and interleave policy
1838  * @gfp_flags: for requested zone
1839  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1840  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1841  *
1842  * Returns a zonelist suitable for a huge page allocation and a pointer
1843  * to the struct mempolicy for conditional unref after allocation.
1844  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1845  * @nodemask for filtering the zonelist.
1846  *
1847  * Must be protected by read_mems_allowed_begin()
1848  */
1849 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1850                                 gfp_t gfp_flags, struct mempolicy **mpol,
1851                                 nodemask_t **nodemask)
1852 {
1853         struct zonelist *zl;
1854
1855         *mpol = get_vma_policy(vma, addr);
1856         *nodemask = NULL;       /* assume !MPOL_BIND */
1857
1858         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1859                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1860                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1861         } else {
1862                 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1863                 if ((*mpol)->mode == MPOL_BIND)
1864                         *nodemask = &(*mpol)->v.nodes;
1865         }
1866         return zl;
1867 }
1868
1869 /*
1870  * init_nodemask_of_mempolicy
1871  *
1872  * If the current task's mempolicy is "default" [NULL], return 'false'
1873  * to indicate default policy.  Otherwise, extract the policy nodemask
1874  * for 'bind' or 'interleave' policy into the argument nodemask, or
1875  * initialize the argument nodemask to contain the single node for
1876  * 'preferred' or 'local' policy and return 'true' to indicate presence
1877  * of non-default mempolicy.
1878  *
1879  * We don't bother with reference counting the mempolicy [mpol_get/put]
1880  * because the current task is examining it's own mempolicy and a task's
1881  * mempolicy is only ever changed by the task itself.
1882  *
1883  * N.B., it is the caller's responsibility to free a returned nodemask.
1884  */
1885 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1886 {
1887         struct mempolicy *mempolicy;
1888         int nid;
1889
1890         if (!(mask && current->mempolicy))
1891                 return false;
1892
1893         task_lock(current);
1894         mempolicy = current->mempolicy;
1895         switch (mempolicy->mode) {
1896         case MPOL_PREFERRED:
1897                 if (mempolicy->flags & MPOL_F_LOCAL)
1898                         nid = numa_node_id();
1899                 else
1900                         nid = mempolicy->v.preferred_node;
1901                 init_nodemask_of_node(mask, nid);
1902                 break;
1903
1904         case MPOL_BIND:
1905                 /* Fall through */
1906         case MPOL_INTERLEAVE:
1907                 *mask =  mempolicy->v.nodes;
1908                 break;
1909
1910         default:
1911                 BUG();
1912         }
1913         task_unlock(current);
1914
1915         return true;
1916 }
1917 #endif
1918
1919 /*
1920  * mempolicy_nodemask_intersects
1921  *
1922  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1923  * policy.  Otherwise, check for intersection between mask and the policy
1924  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
1925  * policy, always return true since it may allocate elsewhere on fallback.
1926  *
1927  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1928  */
1929 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1930                                         const nodemask_t *mask)
1931 {
1932         struct mempolicy *mempolicy;
1933         bool ret = true;
1934
1935         if (!mask)
1936                 return ret;
1937         task_lock(tsk);
1938         mempolicy = tsk->mempolicy;
1939         if (!mempolicy)
1940                 goto out;
1941
1942         switch (mempolicy->mode) {
1943         case MPOL_PREFERRED:
1944                 /*
1945                  * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1946                  * allocate from, they may fallback to other nodes when oom.
1947                  * Thus, it's possible for tsk to have allocated memory from
1948                  * nodes in mask.
1949                  */
1950                 break;
1951         case MPOL_BIND:
1952         case MPOL_INTERLEAVE:
1953                 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1954                 break;
1955         default:
1956                 BUG();
1957         }
1958 out:
1959         task_unlock(tsk);
1960         return ret;
1961 }
1962
1963 /* Allocate a page in interleaved policy.
1964    Own path because it needs to do special accounting. */
1965 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1966                                         unsigned nid)
1967 {
1968         struct zonelist *zl;
1969         struct page *page;
1970
1971         zl = node_zonelist(nid, gfp);
1972         page = __alloc_pages(gfp, order, zl);
1973         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1974                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1975         return page;
1976 }
1977
1978 /**
1979  *      alloc_pages_vma - Allocate a page for a VMA.
1980  *
1981  *      @gfp:
1982  *      %GFP_USER    user allocation.
1983  *      %GFP_KERNEL  kernel allocations,
1984  *      %GFP_HIGHMEM highmem/user allocations,
1985  *      %GFP_FS      allocation should not call back into a file system.
1986  *      %GFP_ATOMIC  don't sleep.
1987  *
1988  *      @order:Order of the GFP allocation.
1989  *      @vma:  Pointer to VMA or NULL if not available.
1990  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1991  *
1992  *      This function allocates a page from the kernel page pool and applies
1993  *      a NUMA policy associated with the VMA or the current process.
1994  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1995  *      mm_struct of the VMA to prevent it from going away. Should be used for
1996  *      all allocations for pages that will be mapped into
1997  *      user space. Returns NULL when no page can be allocated.
1998  *
1999  *      Should be called with the mm_sem of the vma hold.
2000  */
2001 struct page *
2002 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2003                 unsigned long addr, int node)
2004 {
2005         struct mempolicy *pol;
2006         struct page *page;
2007         unsigned int cpuset_mems_cookie;
2008
2009 retry_cpuset:
2010         pol = get_vma_policy(vma, addr);
2011         cpuset_mems_cookie = read_mems_allowed_begin();
2012
2013         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
2014                 unsigned nid;
2015
2016                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2017                 mpol_cond_put(pol);
2018                 page = alloc_page_interleave(gfp, order, nid);
2019                 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2020                         goto retry_cpuset;
2021
2022                 return page;
2023         }
2024         page = __alloc_pages_nodemask(gfp, order,
2025                                       policy_zonelist(gfp, pol, node),
2026                                       policy_nodemask(gfp, pol));
2027         mpol_cond_put(pol);
2028         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2029                 goto retry_cpuset;
2030         return page;
2031 }
2032
2033 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2034 /**
2035  * alloc_hugepage_vma: Allocate a hugepage for a VMA
2036  * @gfp:
2037  *   %GFP_USER    user allocation.
2038  *   %GFP_KERNEL  kernel allocations,
2039  *   %GFP_HIGHMEM highmem/user allocations,
2040  *   %GFP_FS      allocation should not call back into a file system.
2041  *   %GFP_ATOMIC  don't sleep.
2042  *
2043  * @vma:   Pointer to VMA or NULL if not available.
2044  * @addr:  Virtual Address of the allocation. Must be inside the VMA.
2045  * @order: Order of the hugepage for gfp allocation.
2046  *
2047  * This functions allocate a huge page from the kernel page pool and applies
2048  * a NUMA policy associated with the VMA or the current process.
2049  * For policy other than %MPOL_INTERLEAVE, we make sure we allocate hugepage
2050  * only from the current node if the current node is part of the node mask.
2051  * If we can't allocate a hugepage we fail the allocation and don' try to fallback
2052  * to other nodes in the node mask. If the current node is not part of node mask
2053  * or if the NUMA policy is MPOL_INTERLEAVE we use the allocator that can
2054  * fallback to nodes in the policy node mask.
2055  *
2056  * When VMA is not NULL caller must hold down_read on the mmap_sem of the
2057  * mm_struct of the VMA to prevent it from going away. Should be used for
2058  * all allocations for pages that will be mapped into
2059  * user space. Returns NULL when no page can be allocated.
2060  *
2061  * Should be called with vma->vm_mm->mmap_sem held.
2062  *
2063  */
2064 struct page *alloc_hugepage_vma(gfp_t gfp, struct vm_area_struct *vma,
2065                                 unsigned long addr, int order)
2066 {
2067         struct page *page;
2068         nodemask_t *nmask;
2069         struct mempolicy *pol;
2070         int node = numa_node_id();
2071         unsigned int cpuset_mems_cookie;
2072
2073 retry_cpuset:
2074         pol = get_vma_policy(vma, addr);
2075         cpuset_mems_cookie = read_mems_allowed_begin();
2076         /*
2077          * For interleave policy, we don't worry about
2078          * current node. Otherwise if current node is
2079          * in nodemask, try to allocate hugepage from
2080          * the current node. Don't fall back to other nodes
2081          * for THP.
2082          */
2083         if (unlikely(pol->mode == MPOL_INTERLEAVE))
2084                 goto alloc_with_fallback;
2085         nmask = policy_nodemask(gfp, pol);
2086         if (!nmask || node_isset(node, *nmask)) {
2087                 mpol_cond_put(pol);
2088                 page = alloc_pages_exact_node(node, gfp, order);
2089                 if (unlikely(!page &&
2090                              read_mems_allowed_retry(cpuset_mems_cookie)))
2091                         goto retry_cpuset;
2092                 return page;
2093         }
2094 alloc_with_fallback:
2095         mpol_cond_put(pol);
2096         /*
2097          * if current node is not part of node mask, try
2098          * the allocation from any node, and we can do retry
2099          * in that case.
2100          */
2101         return alloc_pages_vma(gfp, order, vma, addr, node);
2102 }
2103 #endif
2104
2105 /**
2106  *      alloc_pages_current - Allocate pages.
2107  *
2108  *      @gfp:
2109  *              %GFP_USER   user allocation,
2110  *              %GFP_KERNEL kernel allocation,
2111  *              %GFP_HIGHMEM highmem allocation,
2112  *              %GFP_FS     don't call back into a file system.
2113  *              %GFP_ATOMIC don't sleep.
2114  *      @order: Power of two of allocation size in pages. 0 is a single page.
2115  *
2116  *      Allocate a page from the kernel page pool.  When not in
2117  *      interrupt context and apply the current process NUMA policy.
2118  *      Returns NULL when no page can be allocated.
2119  *
2120  *      Don't call cpuset_update_task_memory_state() unless
2121  *      1) it's ok to take cpuset_sem (can WAIT), and
2122  *      2) allocating for current task (not interrupt).
2123  */
2124 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2125 {
2126         struct mempolicy *pol = &default_policy;
2127         struct page *page;
2128         unsigned int cpuset_mems_cookie;
2129
2130         if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2131                 pol = get_task_policy(current);
2132
2133 retry_cpuset:
2134         cpuset_mems_cookie = read_mems_allowed_begin();
2135
2136         /*
2137          * No reference counting needed for current->mempolicy
2138          * nor system default_policy
2139          */
2140         if (pol->mode == MPOL_INTERLEAVE)
2141                 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2142         else
2143                 page = __alloc_pages_nodemask(gfp, order,
2144                                 policy_zonelist(gfp, pol, numa_node_id()),
2145                                 policy_nodemask(gfp, pol));
2146
2147         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2148                 goto retry_cpuset;
2149
2150         return page;
2151 }
2152 EXPORT_SYMBOL(alloc_pages_current);
2153
2154 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2155 {
2156         struct mempolicy *pol = mpol_dup(vma_policy(src));
2157
2158         if (IS_ERR(pol))
2159                 return PTR_ERR(pol);
2160         dst->vm_policy = pol;
2161         return 0;
2162 }
2163
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that
 * changes the cpuset's mems), so we needn't do rebind work for the
 * current task.
 */
2174
2175 /* Slow path of a mempolicy duplicate */
2176 struct mempolicy *__mpol_dup(struct mempolicy *old)
2177 {
2178         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2179
2180         if (!new)
2181                 return ERR_PTR(-ENOMEM);
2182
2183         /* task's mempolicy is protected by alloc_lock */
2184         if (old == current->mempolicy) {
2185                 task_lock(current);
2186                 *new = *old;
2187                 task_unlock(current);
2188         } else
2189                 *new = *old;
2190
2191         if (current_cpuset_is_being_rebound()) {
2192                 nodemask_t mems = cpuset_mems_allowed(current);
2193                 if (new->flags & MPOL_F_REBINDING)
2194                         mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2195                 else
2196                         mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2197         }
2198         atomic_set(&new->refcnt, 1);
2199         return new;
2200 }
2201
2202 /* Slow path of a mempolicy comparison */
2203 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2204 {
2205         if (!a || !b)
2206                 return false;
2207         if (a->mode != b->mode)
2208                 return false;
2209         if (a->flags != b->flags)
2210                 return false;
2211         if (mpol_store_user_nodemask(a))
2212                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2213                         return false;
2214
2215         switch (a->mode) {
2216         case MPOL_BIND:
2217                 /* Fall through */
2218         case MPOL_INTERLEAVE:
2219                 return !!nodes_equal(a->v.nodes, b->v.nodes);
2220         case MPOL_PREFERRED:
2221                 return a->v.preferred_node == b->v.preferred_node;
2222         default:
2223                 BUG();
2224                 return false;
2225         }
2226 }
2227
2228 /*
2229  * Shared memory backing store policy support.
2230  *
2231  * Remember policies even when nobody has shared memory mapped.
2232  * The policies are kept in Red-Black tree linked from the inode.
2233  * They are protected by the sp->lock spinlock, which should be held
2234  * for any accesses to the tree.
2235  */
2236
2237 /* lookup first element intersecting start-end */
2238 /* Caller holds sp->lock */
2239 static struct sp_node *
2240 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2241 {
2242         struct rb_node *n = sp->root.rb_node;
2243
2244         while (n) {
2245                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2246
2247                 if (start >= p->end)
2248                         n = n->rb_right;
2249                 else if (end <= p->start)
2250                         n = n->rb_left;
2251                 else
2252                         break;
2253         }
2254         if (!n)
2255                 return NULL;
2256         for (;;) {
2257                 struct sp_node *w = NULL;
2258                 struct rb_node *prev = rb_prev(n);
2259                 if (!prev)
2260                         break;
2261                 w = rb_entry(prev, struct sp_node, nd);
2262                 if (w->end <= start)
2263                         break;
2264                 n = prev;
2265         }
2266         return rb_entry(n, struct sp_node, nd);
2267 }
2268
2269 /* Insert a new shared policy into the list. */
2270 /* Caller holds sp->lock */
2271 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2272 {
2273         struct rb_node **p = &sp->root.rb_node;
2274         struct rb_node *parent = NULL;
2275         struct sp_node *nd;
2276
2277         while (*p) {
2278                 parent = *p;
2279                 nd = rb_entry(parent, struct sp_node, nd);
2280                 if (new->start < nd->start)
2281                         p = &(*p)->rb_left;
2282                 else if (new->end > nd->end)
2283                         p = &(*p)->rb_right;
2284                 else
2285                         BUG();
2286         }
2287         rb_link_node(&new->nd, parent, p);
2288         rb_insert_color(&new->nd, &sp->root);
2289         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2290                  new->policy ? new->policy->mode : 0);
2291 }
2292
2293 /* Find shared policy intersecting idx */
2294 struct mempolicy *
2295 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2296 {
2297         struct mempolicy *pol = NULL;
2298         struct sp_node *sn;
2299
2300         if (!sp->root.rb_node)
2301                 return NULL;
2302         spin_lock(&sp->lock);
2303         sn = sp_lookup(sp, idx, idx+1);
2304         if (sn) {
2305                 mpol_get(sn->policy);
2306                 pol = sn->policy;
2307         }
2308         spin_unlock(&sp->lock);
2309         return pol;
2310 }
2311
2312 static void sp_free(struct sp_node *n)
2313 {
2314         mpol_put(n->policy);
2315         kmem_cache_free(sn_cache, n);
2316 }
2317
2318 /**
2319  * mpol_misplaced - check whether current page node is valid in policy
2320  *
2321  * @page: page to be checked
2322  * @vma: vm area where page mapped
2323  * @addr: virtual address where page mapped
2324  *
2325  * Lookup current policy node id for vma,addr and "compare to" page's
2326  * node id.
2327  *
2328  * Returns:
2329  *      -1      - not misplaced, page is in the right node
2330  *      node    - node id where the page should be
2331  *
2332  * Policy determination "mimics" alloc_page_vma().
2333  * Called from fault path where we know the vma and faulting address.
2334  */
2335 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2336 {
2337         struct mempolicy *pol;
2338         struct zone *zone;
2339         int curnid = page_to_nid(page);
2340         unsigned long pgoff;
2341         int thiscpu = raw_smp_processor_id();
2342         int thisnid = cpu_to_node(thiscpu);
2343         int polnid = -1;
2344         int ret = -1;
2345
2346         BUG_ON(!vma);
2347
2348         pol = get_vma_policy(vma, addr);
2349         if (!(pol->flags & MPOL_F_MOF))
2350                 goto out;
2351
2352         switch (pol->mode) {
2353         case MPOL_INTERLEAVE:
2354                 BUG_ON(addr >= vma->vm_end);
2355                 BUG_ON(addr < vma->vm_start);
2356
2357                 pgoff = vma->vm_pgoff;
2358                 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2359                 polnid = offset_il_node(pol, vma, pgoff);
2360                 break;
2361
2362         case MPOL_PREFERRED:
2363                 if (pol->flags & MPOL_F_LOCAL)
2364                         polnid = numa_node_id();
2365                 else
2366                         polnid = pol->v.preferred_node;
2367                 break;
2368
2369         case MPOL_BIND:
2370                 /*
2371                  * allows binding to multiple nodes.
2372                  * use current page if in policy nodemask,
2373                  * else select nearest allowed node, if any.
2374                  * If no allowed nodes, use current [!misplaced].
2375                  */
2376                 if (node_isset(curnid, pol->v.nodes))
2377                         goto out;
2378                 (void)first_zones_zonelist(
2379                                 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2380                                 gfp_zone(GFP_HIGHUSER),
2381                                 &pol->v.nodes, &zone);
2382                 polnid = zone->node;
2383                 break;
2384
2385         default:
2386                 BUG();
2387         }
2388
2389         /* Migrate the page towards the node whose CPU is referencing it */
2390         if (pol->flags & MPOL_F_MORON) {
2391                 polnid = thisnid;
2392
2393                 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2394                         goto out;
2395         }
2396
2397         if (curnid != polnid)
2398                 ret = polnid;
2399 out:
2400         mpol_cond_put(pol);
2401
2402         return ret;
2403 }
2404
2405 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2406 {
2407         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2408         rb_erase(&n->nd, &sp->root);
2409         sp_free(n);
2410 }
2411
2412 static void sp_node_init(struct sp_node *node, unsigned long start,
2413                         unsigned long end, struct mempolicy *pol)
2414 {
2415         node->start = start;
2416         node->end = end;
2417         node->policy = pol;
2418 }
2419
2420 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2421                                 struct mempolicy *pol)
2422 {
2423         struct sp_node *n;
2424         struct mempolicy *newpol;
2425
2426         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2427         if (!n)
2428                 return NULL;
2429
2430         newpol = mpol_dup(pol);
2431         if (IS_ERR(newpol)) {
2432                 kmem_cache_free(sn_cache, n);
2433                 return NULL;
2434         }
2435         newpol->flags |= MPOL_F_SHARED;
2436         sp_node_init(n, start, end, newpol);
2437
2438         return n;
2439 }
2440
2441 /* Replace a policy range. */
2442 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2443                                  unsigned long end, struct sp_node *new)
2444 {
2445         struct sp_node *n;
2446         struct sp_node *n_new = NULL;
2447         struct mempolicy *mpol_new = NULL;
2448         int ret = 0;
2449
2450 restart:
2451         spin_lock(&sp->lock);
2452         n = sp_lookup(sp, start, end);
2453         /* Take care of old policies in the same range. */
2454         while (n && n->start < end) {
2455                 struct rb_node *next = rb_next(&n->nd);
2456                 if (n->start >= start) {
2457                         if (n->end <= end)
2458                                 sp_delete(sp, n);
2459                         else
2460                                 n->start = end;
2461                 } else {
2462                         /* Old policy spanning whole new range. */
2463                         if (n->end > end) {
2464                                 if (!n_new)
2465                                         goto alloc_new;
2466
2467                                 *mpol_new = *n->policy;
2468                                 atomic_set(&mpol_new->refcnt, 1);
2469                                 sp_node_init(n_new, end, n->end, mpol_new);
2470                                 n->end = start;
2471                                 sp_insert(sp, n_new);
2472                                 n_new = NULL;
2473                                 mpol_new = NULL;
2474                                 break;
2475                         } else
2476                                 n->end = start;
2477                 }
2478                 if (!next)
2479                         break;
2480                 n = rb_entry(next, struct sp_node, nd);
2481         }
2482         if (new)
2483                 sp_insert(sp, new);
2484         spin_unlock(&sp->lock);
2485         ret = 0;
2486
2487 err_out:
2488         if (mpol_new)
2489                 mpol_put(mpol_new);
2490         if (n_new)
2491                 kmem_cache_free(sn_cache, n_new);
2492
2493         return ret;
2494
2495 alloc_new:
2496         spin_unlock(&sp->lock);
2497         ret = -ENOMEM;
2498         n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2499         if (!n_new)
2500                 goto err_out;
2501         mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2502         if (!mpol_new)
2503                 goto err_out;
2504         goto restart;
2505 }
2506
2507 /**
2508  * mpol_shared_policy_init - initialize shared policy for inode
2509  * @sp: pointer to inode shared policy
2510  * @mpol:  struct mempolicy to install
2511  *
2512  * Install non-NULL @mpol in inode's shared policy rb-tree.
2513  * On entry, the current task has a reference on a non-NULL @mpol.
2514  * This must be released on exit.
2515  * This is called at get_inode() calls and we can use GFP_KERNEL.
2516  */
2517 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2518 {
2519         int ret;
2520
2521         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2522         spin_lock_init(&sp->lock);
2523
2524         if (mpol) {
2525                 struct vm_area_struct pvma;
2526                 struct mempolicy *new;
2527                 NODEMASK_SCRATCH(scratch);
2528
2529                 if (!scratch)
2530                         goto put_mpol;
2531                 /* contextualize the tmpfs mount point mempolicy */
2532                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2533                 if (IS_ERR(new))
2534                         goto free_scratch; /* no valid nodemask intersection */
2535
2536                 task_lock(current);
2537                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2538                 task_unlock(current);
2539                 if (ret)
2540                         goto put_new;
2541
2542                 /* Create pseudo-vma that contains just the policy */
2543                 memset(&pvma, 0, sizeof(struct vm_area_struct));
2544                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2545                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2546
2547 put_new:
2548                 mpol_put(new);                  /* drop initial ref */
2549 free_scratch:
2550                 NODEMASK_SCRATCH_FREE(scratch);
2551 put_mpol:
2552                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2553         }
2554 }
2555
2556 int mpol_set_shared_policy(struct shared_policy *info,
2557                         struct vm_area_struct *vma, struct mempolicy *npol)
2558 {
2559         int err;
2560         struct sp_node *new = NULL;
2561         unsigned long sz = vma_pages(vma);
2562
2563         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2564                  vma->vm_pgoff,
2565                  sz, npol ? npol->mode : -1,
2566                  npol ? npol->flags : -1,
2567                  npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2568
2569         if (npol) {
2570                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2571                 if (!new)
2572                         return -ENOMEM;
2573         }
2574         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2575         if (err && new)
2576                 sp_free(new);
2577         return err;
2578 }
2579
2580 /* Free a backing policy store on inode delete. */
2581 void mpol_free_shared_policy(struct shared_policy *p)
2582 {
2583         struct sp_node *n;
2584         struct rb_node *next;
2585
2586         if (!p->root.rb_node)
2587                 return;
2588         spin_lock(&p->lock);
2589         next = rb_first(&p->root);
2590         while (next) {
2591                 n = rb_entry(next, struct sp_node, nd);
2592                 next = rb_next(&n->nd);
2593                 sp_delete(p, n);
2594         }
2595         spin_unlock(&p->lock);
2596 }
2597
2598 #ifdef CONFIG_NUMA_BALANCING
2599 static int __initdata numabalancing_override;
2600
2601 static void __init check_numabalancing_enable(void)
2602 {
2603         bool numabalancing_default = false;
2604
2605         if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2606                 numabalancing_default = true;
2607
2608         /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2609         if (numabalancing_override)
2610                 set_numabalancing_state(numabalancing_override == 1);
2611
2612         if (nr_node_ids > 1 && !numabalancing_override) {
2613                 pr_info("%s automatic NUMA balancing. "
2614                         "Configure with numa_balancing= or the "
2615                         "kernel.numa_balancing sysctl",
2616                         numabalancing_default ? "Enabling" : "Disabling");
2617                 set_numabalancing_state(numabalancing_default);
2618         }
2619 }
2620
2621 static int __init setup_numabalancing(char *str)
2622 {
2623         int ret = 0;
2624         if (!str)
2625                 goto out;
2626
2627         if (!strcmp(str, "enable")) {
2628                 numabalancing_override = 1;
2629                 ret = 1;
2630         } else if (!strcmp(str, "disable")) {
2631                 numabalancing_override = -1;
2632                 ret = 1;
2633         }
2634 out:
2635         if (!ret)
2636                 pr_warn("Unable to parse numa_balancing=\n");
2637
2638         return ret;
2639 }
2640 __setup("numa_balancing=", setup_numabalancing);
2641 #else
2642 static inline void __init check_numabalancing_enable(void)
2643 {
2644 }
2645 #endif /* CONFIG_NUMA_BALANCING */
2646
2647 /* assumes fs == KERNEL_DS */
2648 void __init numa_policy_init(void)
2649 {
2650         nodemask_t interleave_nodes;
2651         unsigned long largest = 0;
2652         int nid, prefer = 0;
2653
2654         policy_cache = kmem_cache_create("numa_policy",
2655                                          sizeof(struct mempolicy),
2656                                          0, SLAB_PANIC, NULL);
2657
2658         sn_cache = kmem_cache_create("shared_policy_node",
2659                                      sizeof(struct sp_node),
2660                                      0, SLAB_PANIC, NULL);
2661
2662         for_each_node(nid) {
2663                 preferred_node_policy[nid] = (struct mempolicy) {
2664                         .refcnt = ATOMIC_INIT(1),
2665                         .mode = MPOL_PREFERRED,
2666                         .flags = MPOL_F_MOF | MPOL_F_MORON,
2667                         .v = { .preferred_node = nid, },
2668                 };
2669         }
2670
2671         /*
2672          * Set interleaving policy for system init. Interleaving is only
2673          * enabled across suitably sized nodes (default is >= 16MB), or
2674          * fall back to the largest node if they're all smaller.
2675          */
2676         nodes_clear(interleave_nodes);
2677         for_each_node_state(nid, N_MEMORY) {
2678                 unsigned long total_pages = node_present_pages(nid);
2679
2680                 /* Preserve the largest node */
2681                 if (largest < total_pages) {
2682                         largest = total_pages;
2683                         prefer = nid;
2684                 }
2685
2686                 /* Interleave this node? */
2687                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2688                         node_set(nid, interleave_nodes);
2689         }
2690
2691         /* All too small, use the largest */
2692         if (unlikely(nodes_empty(interleave_nodes)))
2693                 node_set(prefer, interleave_nodes);
2694
2695         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2696                 pr_err("%s: interleaving failed\n", __func__);
2697
2698         check_numabalancing_enable();
2699 }
2700
2701 /* Reset policy of current process to default */
2702 void numa_default_policy(void)
2703 {
2704         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2705 }
2706
2707 /*
2708  * Parse and format mempolicy from/to strings
2709  */
2710
2711 /*
2712  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2713  */
2714 static const char * const policy_modes[] =
2715 {
2716         [MPOL_DEFAULT]    = "default",
2717         [MPOL_PREFERRED]  = "prefer",
2718         [MPOL_BIND]       = "bind",
2719         [MPOL_INTERLEAVE] = "interleave",
2720         [MPOL_LOCAL]      = "local",
2721 };
2722
2723
2724 #ifdef CONFIG_TMPFS
2725 /**
2726  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2727  * @str:  string containing mempolicy to parse
2728  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2729  *
2730  * Format of input:
2731  *      <mode>[=<flags>][:<nodelist>]
2732  *
2733  * On success, returns 0, else 1
2734  */
2735 int mpol_parse_str(char *str, struct mempolicy **mpol)
2736 {
2737         struct mempolicy *new = NULL;
2738         unsigned short mode;
2739         unsigned short mode_flags;
2740         nodemask_t nodes;
2741         char *nodelist = strchr(str, ':');
2742         char *flags = strchr(str, '=');
2743         int err = 1;
2744
2745         if (nodelist) {
2746                 /* NUL-terminate mode or flags string */
2747                 *nodelist++ = '\0';
2748                 if (nodelist_parse(nodelist, nodes))
2749                         goto out;
2750                 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2751                         goto out;
2752         } else
2753                 nodes_clear(nodes);
2754
2755         if (flags)
2756                 *flags++ = '\0';        /* terminate mode string */
2757
2758         for (mode = 0; mode < MPOL_MAX; mode++) {
2759                 if (!strcmp(str, policy_modes[mode])) {
2760                         break;
2761                 }
2762         }
2763         if (mode >= MPOL_MAX)
2764                 goto out;
2765
2766         switch (mode) {
2767         case MPOL_PREFERRED:
2768                 /*
2769                  * Insist on a nodelist of one node only
2770                  */
2771                 if (nodelist) {
2772                         char *rest = nodelist;
2773                         while (isdigit(*rest))
2774                                 rest++;
2775                         if (*rest)
2776                                 goto out;
2777                 }
2778                 break;
2779         case MPOL_INTERLEAVE:
2780                 /*
2781                  * Default to online nodes with memory if no nodelist
2782                  */
2783                 if (!nodelist)
2784                         nodes = node_states[N_MEMORY];
2785                 break;
2786         case MPOL_LOCAL:
2787                 /*
2788                  * Don't allow a nodelist;  mpol_new() checks flags
2789                  */
2790                 if (nodelist)
2791                         goto out;
2792                 mode = MPOL_PREFERRED;
2793                 break;
2794         case MPOL_DEFAULT:
2795                 /*
2796                  * Insist on a empty nodelist
2797                  */
2798                 if (!nodelist)
2799                         err = 0;
2800                 goto out;
2801         case MPOL_BIND:
2802                 /*
2803                  * Insist on a nodelist
2804                  */
2805                 if (!nodelist)
2806                         goto out;
2807         }
2808
2809         mode_flags = 0;
2810         if (flags) {
2811                 /*
2812                  * Currently, we only support two mutually exclusive
2813                  * mode flags.
2814                  */
2815                 if (!strcmp(flags, "static"))
2816                         mode_flags |= MPOL_F_STATIC_NODES;
2817                 else if (!strcmp(flags, "relative"))
2818                         mode_flags |= MPOL_F_RELATIVE_NODES;
2819                 else
2820                         goto out;
2821         }
2822
2823         new = mpol_new(mode, mode_flags, &nodes);
2824         if (IS_ERR(new))
2825                 goto out;
2826
2827         /*
2828          * Save nodes for mpol_to_str() to show the tmpfs mount options
2829          * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2830          */
2831         if (mode != MPOL_PREFERRED)
2832                 new->v.nodes = nodes;
2833         else if (nodelist)
2834                 new->v.preferred_node = first_node(nodes);
2835         else
2836                 new->flags |= MPOL_F_LOCAL;
2837
2838         /*
2839          * Save nodes for contextualization: this will be used to "clone"
2840          * the mempolicy in a specific context [cpuset] at a later time.
2841          */
2842         new->w.user_nodemask = nodes;
2843
2844         err = 0;
2845
2846 out:
2847         /* Restore string for error message */
2848         if (nodelist)
2849                 *--nodelist = ':';
2850         if (flags)
2851                 *--flags = '=';
2852         if (!err)
2853                 *mpol = new;
2854         return err;
2855 }
2856 #endif /* CONFIG_TMPFS */
2857
2858 /**
2859  * mpol_to_str - format a mempolicy structure for printing
2860  * @buffer:  to contain formatted mempolicy string
2861  * @maxlen:  length of @buffer
2862  * @pol:  pointer to mempolicy to be formatted
2863  *
2864  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2865  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2866  * longest flag, "relative", and to display at least a few node ids.
2867  */
2868 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2869 {
2870         char *p = buffer;
2871         nodemask_t nodes = NODE_MASK_NONE;
2872         unsigned short mode = MPOL_DEFAULT;
2873         unsigned short flags = 0;
2874
2875         if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2876                 mode = pol->mode;
2877                 flags = pol->flags;
2878         }
2879
2880         switch (mode) {
2881         case MPOL_DEFAULT:
2882                 break;
2883         case MPOL_PREFERRED:
2884                 if (flags & MPOL_F_LOCAL)
2885                         mode = MPOL_LOCAL;
2886                 else
2887                         node_set(pol->v.preferred_node, nodes);
2888                 break;
2889         case MPOL_BIND:
2890         case MPOL_INTERLEAVE:
2891                 nodes = pol->v.nodes;
2892                 break;
2893         default:
2894                 WARN_ON_ONCE(1);
2895                 snprintf(p, maxlen, "unknown");
2896                 return;
2897         }
2898
2899         p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2900
2901         if (flags & MPOL_MODE_FLAGS) {
2902                 p += snprintf(p, buffer + maxlen - p, "=");
2903
2904                 /*
2905                  * Currently, the only defined flags are mutually exclusive
2906                  */
2907                 if (flags & MPOL_F_STATIC_NODES)
2908                         p += snprintf(p, buffer + maxlen - p, "static");
2909                 else if (flags & MPOL_F_RELATIVE_NODES)
2910                         p += snprintf(p, buffer + maxlen - p, "relative");
2911         }
2912
2913         if (!nodes_empty(nodes)) {
2914                 p += snprintf(p, buffer + maxlen - p, ":");
2915                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2916         }
2917 }