
TOMOYO Linux Cross Reference
Linux/mm/mempolicy.c


  1 /*
  2  * Simple NUMA memory policy for the Linux kernel.
  3  *
  4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
  5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
  6  * Subject to the GNU Public License, version 2.
  7  *
  8  * NUMA policy allows the user to give hints in which node(s) memory should
  9  * be allocated.
 10  *
 11  * Support four policies per VMA and per process:
 12  *
 13  * The VMA policy has priority over the process policy for a page fault.
 14  *
 15  * interleave     Allocate memory interleaved over a set of nodes,
 16  *                with normal fallback if it fails.
 17  *                For VMA based allocations this interleaves based on the
 18  *                offset into the backing object or offset into the mapping
 19  *                for anonymous memory. For process policy an process counter
 20  *                for anonymous memory. For process policy a process counter
 21  *
 22  * bind           Only allocate memory on a specific set of nodes,
 23  *                no fallback.
 24  *                FIXME: memory is allocated starting with the first node
 25  *                to the last. It would be better if bind would truly restrict
 26  *                the allocation to memory nodes instead
 27  *
 28  * preferred      Try a specific node first before normal fallback.
 29  *                As a special case NUMA_NO_NODE here means do the allocation
 30  *                on the local CPU. This is normally identical to default,
 31  *                but useful to set in a VMA when you have a non default
 32  *                process policy.
 33  *
 34  * default        Allocate on the local node first, or when on a VMA
 35  *                use the process policy. This is what Linux always did
 36  *                in a NUMA-aware kernel and still does by, ahem, default.
 37  *
 38  * The process policy is applied for most non-interrupt memory allocations
 39  * in that process' context. Interrupts ignore the policies and always
 40  * try to allocate on the local CPU. The VMA policy is only applied for memory
 41  * allocations for a VMA in the VM.
 42  *
 43  * Currently there are a few corner cases in swapping where the policy
 44  * is not applied, but the majority should be handled. When process policy
 45  * is used it is not remembered over swap outs/swap ins.
 46  *
 47  * Only the highest zone in the zone hierarchy gets policied. Allocations
 48  * requesting a lower zone just use the default policy. This implies that
 49  * on systems with highmem, kernel lowmem allocations don't get policied.
 50  * Same with GFP_DMA allocations.
 51  *
 52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 53  * all users and remembered even when nobody has memory mapped.
 54  */
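
/*
 * Editorial aside, not part of mempolicy.c: a minimal userspace sketch of how
 * the policies described above are typically requested through the mbind()
 * and set_mempolicy() syscalls implemented later in this file, using the
 * libnuma <numaif.h> wrappers.  Node numbers, sizes and flags are assumptions
 * chosen for illustration; error handling is omitted.
 */
#if 0   /* illustrative userspace example only, never compiled here */
#include <numaif.h>
#include <sys/mman.h>
#include <stddef.h>

static void mempolicy_usage_sketch(void)
{
        unsigned long node0 = 1UL << 0;         /* nodemask with only node 0 set */
        size_t len = 1UL << 20;
        void *buf;

        /* process policy: prefer node 0, with normal fallback elsewhere */
        set_mempolicy(MPOL_PREFERRED, &node0, sizeof(node0) * 8);

        /* VMA policy: bind one anonymous mapping strictly to node 0 */
        buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        mbind(buf, len, MPOL_BIND, &node0, sizeof(node0) * 8, MPOL_MF_STRICT);

        /* back to the default (local) policy for the whole process */
        set_mempolicy(MPOL_DEFAULT, NULL, 0);
}
#endif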
 55 
 56 /* Notebook:
 57    fix mmap readahead to honour policy and enable policy for any page cache
 58    object
 59    statistics for bigpages
 60    global policy for page cache? currently it uses process policy. Requires
 61    first item above.
 62    handle mremap for shared memory (currently ignored for the policy)
 63    grows down?
  64    make bind policy root-only? It can trigger oom much faster and the
  65    kernel is not always graceful about that.
 66 */
 67 
 68 #include <linux/mempolicy.h>
 69 #include <linux/mm.h>
 70 #include <linux/highmem.h>
 71 #include <linux/hugetlb.h>
 72 #include <linux/kernel.h>
 73 #include <linux/sched.h>
 74 #include <linux/nodemask.h>
 75 #include <linux/cpuset.h>
 76 #include <linux/slab.h>
 77 #include <linux/string.h>
 78 #include <linux/export.h>
 79 #include <linux/nsproxy.h>
 80 #include <linux/interrupt.h>
 81 #include <linux/init.h>
 82 #include <linux/compat.h>
 83 #include <linux/swap.h>
 84 #include <linux/seq_file.h>
 85 #include <linux/proc_fs.h>
 86 #include <linux/migrate.h>
 87 #include <linux/ksm.h>
 88 #include <linux/rmap.h>
 89 #include <linux/security.h>
 90 #include <linux/syscalls.h>
 91 #include <linux/ctype.h>
 92 #include <linux/mm_inline.h>
 93 #include <linux/mmu_notifier.h>
 94 
 95 #include <asm/tlbflush.h>
 96 #include <asm/uaccess.h>
 97 #include <linux/random.h>
 98 
 99 #include "internal.h"
100 
101 /* Internal flags */
 102 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for contiguous vmas */
103 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
104 
105 static struct kmem_cache *policy_cache;
106 static struct kmem_cache *sn_cache;
107 
 108 /* Highest zone. A specific allocation for a zone below that is not
109    policied. */
110 enum zone_type policy_zone = 0;
111 
112 /*
113  * run-time system-wide default policy => local allocation
114  */
115 static struct mempolicy default_policy = {
116         .refcnt = ATOMIC_INIT(1), /* never free it */
117         .mode = MPOL_PREFERRED,
118         .flags = MPOL_F_LOCAL,
119 };
120 
121 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
122 
123 static struct mempolicy *get_task_policy(struct task_struct *p)
124 {
125         struct mempolicy *pol = p->mempolicy;
126 
127         if (!pol) {
128                 int node = numa_node_id();
129 
130                 if (node != NUMA_NO_NODE) {
131                         pol = &preferred_node_policy[node];
132                         /*
133                          * preferred_node_policy is not initialised early in
134                          * boot
135                          */
136                         if (!pol->mode)
137                                 pol = NULL;
138                 }
139         }
140 
141         return pol;
142 }
143 
144 static const struct mempolicy_operations {
145         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
146         /*
 147          * If the read-side task has no lock to protect task->mempolicy, the
 148          * write-side task will rebind task->mempolicy in two steps. The first
 149          * step sets all the newly allowed nodes, and the second step clears
 150          * all the disallowed nodes. In this way, we avoid ending up with no
 151          * node to allocate pages from.
 152          * If we have a lock to protect task->mempolicy on the read side, we
 153          * rebind directly.
154          *
155          * step:
 156          *      MPOL_REBIND_ONCE  - do the rebind work at once
 157          *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
 158          *      MPOL_REBIND_STEP2 - clear all the disallowed nodes
159          */
160         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
161                         enum mpol_rebind_step step);
162 } mpol_ops[MPOL_MAX];
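
/*
 * Editorial worked example (assumed node numbers): rebinding an
 * MPOL_INTERLEAVE policy whose cpuset mems change from {0,1} to {2,3}
 * without a read-side lock first ORs in the remapped nodes, widening
 * v.nodes to {0,1,2,3} (MPOL_REBIND_STEP1), and only then narrows it to
 * {2,3} (MPOL_REBIND_STEP2), so a concurrent allocator always sees at
 * least one allowed node.
 */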
163 
 164 /* Check that the nodemask contains at least one node with memory */
165 static int is_valid_nodemask(const nodemask_t *nodemask)
166 {
167         return nodes_intersects(*nodemask, node_states[N_MEMORY]);
168 }
169 
170 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
171 {
172         return pol->flags & MPOL_MODE_FLAGS;
173 }
174 
175 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
176                                    const nodemask_t *rel)
177 {
178         nodemask_t tmp;
179         nodes_fold(tmp, *orig, nodes_weight(*rel));
180         nodes_onto(*ret, tmp, *rel);
181 }
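
/*
 * Editorial worked example (assumed node numbers): with
 * MPOL_F_RELATIVE_NODES, a user nodemask of {0,2} taken relative to an
 * allowed set of {4,5,6,7} is first folded modulo nodes_weight(rel) = 4
 * (still {0,2}) and then mapped onto the allowed set, yielding {4,6}.
 */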
182 
183 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
184 {
185         if (nodes_empty(*nodes))
186                 return -EINVAL;
187         pol->v.nodes = *nodes;
188         return 0;
189 }
190 
191 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
192 {
193         if (!nodes)
194                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
195         else if (nodes_empty(*nodes))
196                 return -EINVAL;                 /*  no allowed nodes */
197         else
198                 pol->v.preferred_node = first_node(*nodes);
199         return 0;
200 }
201 
202 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
203 {
204         if (!is_valid_nodemask(nodes))
205                 return -EINVAL;
206         pol->v.nodes = *nodes;
207         return 0;
208 }
209 
210 /*
211  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
212  * any, for the new policy.  mpol_new() has already validated the nodes
213  * parameter with respect to the policy mode and flags.  But, we need to
214  * handle an empty nodemask with MPOL_PREFERRED here.
215  *
216  * Must be called holding task's alloc_lock to protect task's mems_allowed
217  * and mempolicy.  May also be called holding the mmap_semaphore for write.
218  */
219 static int mpol_set_nodemask(struct mempolicy *pol,
220                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
221 {
222         int ret;
223 
224         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
225         if (pol == NULL)
226                 return 0;
227         /* Check N_MEMORY */
228         nodes_and(nsc->mask1,
229                   cpuset_current_mems_allowed, node_states[N_MEMORY]);
230 
231         VM_BUG_ON(!nodes);
232         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
233                 nodes = NULL;   /* explicit local allocation */
234         else {
235                 if (pol->flags & MPOL_F_RELATIVE_NODES)
236                         mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
237                 else
238                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
239 
240                 if (mpol_store_user_nodemask(pol))
241                         pol->w.user_nodemask = *nodes;
242                 else
243                         pol->w.cpuset_mems_allowed =
244                                                 cpuset_current_mems_allowed;
245         }
246 
247         if (nodes)
248                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
249         else
250                 ret = mpol_ops[pol->mode].create(pol, NULL);
251         return ret;
252 }
253 
254 /*
 255  * This function just creates a new policy, does some checks and simple
 256  * initialization. You must invoke mpol_set_nodemask() to set the nodes.
257  */
258 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
259                                   nodemask_t *nodes)
260 {
261         struct mempolicy *policy;
262 
263         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
264                  mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
265 
266         if (mode == MPOL_DEFAULT) {
267                 if (nodes && !nodes_empty(*nodes))
268                         return ERR_PTR(-EINVAL);
269                 return NULL;
270         }
271         VM_BUG_ON(!nodes);
272 
273         /*
274          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
275          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
276          * All other modes require a valid pointer to a non-empty nodemask.
277          */
278         if (mode == MPOL_PREFERRED) {
279                 if (nodes_empty(*nodes)) {
280                         if (((flags & MPOL_F_STATIC_NODES) ||
281                              (flags & MPOL_F_RELATIVE_NODES)))
282                                 return ERR_PTR(-EINVAL);
283                 }
284         } else if (mode == MPOL_LOCAL) {
285                 if (!nodes_empty(*nodes))
286                         return ERR_PTR(-EINVAL);
287                 mode = MPOL_PREFERRED;
288         } else if (nodes_empty(*nodes))
289                 return ERR_PTR(-EINVAL);
290         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
291         if (!policy)
292                 return ERR_PTR(-ENOMEM);
293         atomic_set(&policy->refcnt, 1);
294         policy->mode = mode;
295         policy->flags = flags;
296 
297         return policy;
298 }
299 
300 /* Slow path of a mpol destructor. */
301 void __mpol_put(struct mempolicy *p)
302 {
303         if (!atomic_dec_and_test(&p->refcnt))
304                 return;
305         kmem_cache_free(policy_cache, p);
306 }
307 
308 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
309                                 enum mpol_rebind_step step)
310 {
311 }
312 
313 /*
314  * step:
 315  *      MPOL_REBIND_ONCE  - do the rebind work at once
 316  *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
 317  *      MPOL_REBIND_STEP2 - clear all the disallowed nodes
318  */
319 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
320                                  enum mpol_rebind_step step)
321 {
322         nodemask_t tmp;
323 
324         if (pol->flags & MPOL_F_STATIC_NODES)
325                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
326         else if (pol->flags & MPOL_F_RELATIVE_NODES)
327                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
328         else {
329                 /*
330                  * if step == 1, we use ->w.cpuset_mems_allowed to cache the
331                  * result
332                  */
333                 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
334                         nodes_remap(tmp, pol->v.nodes,
335                                         pol->w.cpuset_mems_allowed, *nodes);
336                         pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
337                 } else if (step == MPOL_REBIND_STEP2) {
338                         tmp = pol->w.cpuset_mems_allowed;
339                         pol->w.cpuset_mems_allowed = *nodes;
340                 } else
341                         BUG();
342         }
343 
344         if (nodes_empty(tmp))
345                 tmp = *nodes;
346 
347         if (step == MPOL_REBIND_STEP1)
348                 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
349         else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
350                 pol->v.nodes = tmp;
351         else
352                 BUG();
353 
354         if (!node_isset(current->il_next, tmp)) {
355                 current->il_next = next_node(current->il_next, tmp);
356                 if (current->il_next >= MAX_NUMNODES)
357                         current->il_next = first_node(tmp);
358                 if (current->il_next >= MAX_NUMNODES)
359                         current->il_next = numa_node_id();
360         }
361 }
362 
363 static void mpol_rebind_preferred(struct mempolicy *pol,
364                                   const nodemask_t *nodes,
365                                   enum mpol_rebind_step step)
366 {
367         nodemask_t tmp;
368 
369         if (pol->flags & MPOL_F_STATIC_NODES) {
370                 int node = first_node(pol->w.user_nodemask);
371 
372                 if (node_isset(node, *nodes)) {
373                         pol->v.preferred_node = node;
374                         pol->flags &= ~MPOL_F_LOCAL;
375                 } else
376                         pol->flags |= MPOL_F_LOCAL;
377         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
378                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
379                 pol->v.preferred_node = first_node(tmp);
380         } else if (!(pol->flags & MPOL_F_LOCAL)) {
381                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
382                                                    pol->w.cpuset_mems_allowed,
383                                                    *nodes);
384                 pol->w.cpuset_mems_allowed = *nodes;
385         }
386 }
387 
388 /*
389  * mpol_rebind_policy - Migrate a policy to a different set of nodes
390  *
 391  * If the read-side task has no lock to protect task->mempolicy, the
 392  * write-side task will rebind task->mempolicy in two steps. The first step
 393  * sets all the newly allowed nodes, and the second step clears all the
 394  * disallowed nodes. In this way, we avoid ending up with no node to
 395  * allocate pages from.
 396  * If we have a lock to protect task->mempolicy on the read side, we
 397  * rebind directly.
 398  *
 399  * step:
 400  *      MPOL_REBIND_ONCE  - do the rebind work at once
 401  *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
 402  *      MPOL_REBIND_STEP2 - clear all the disallowed nodes
403  */
404 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
405                                 enum mpol_rebind_step step)
406 {
407         if (!pol)
408                 return;
409         if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
410             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
411                 return;
412 
413         if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
414                 return;
415 
416         if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
417                 BUG();
418 
419         if (step == MPOL_REBIND_STEP1)
420                 pol->flags |= MPOL_F_REBINDING;
421         else if (step == MPOL_REBIND_STEP2)
422                 pol->flags &= ~MPOL_F_REBINDING;
423         else if (step >= MPOL_REBIND_NSTEP)
424                 BUG();
425 
426         mpol_ops[pol->mode].rebind(pol, newmask, step);
427 }
428 
429 /*
430  * Wrapper for mpol_rebind_policy() that just requires task
431  * pointer, and updates task mempolicy.
432  *
433  * Called with task's alloc_lock held.
434  */
435 
436 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
437                         enum mpol_rebind_step step)
438 {
439         mpol_rebind_policy(tsk->mempolicy, new, step);
440 }
441 
442 /*
443  * Rebind each vma in mm to new nodemask.
444  *
445  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
446  */
447 
448 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
449 {
450         struct vm_area_struct *vma;
451 
452         down_write(&mm->mmap_sem);
453         for (vma = mm->mmap; vma; vma = vma->vm_next)
454                 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
455         up_write(&mm->mmap_sem);
456 }
457 
458 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
459         [MPOL_DEFAULT] = {
460                 .rebind = mpol_rebind_default,
461         },
462         [MPOL_INTERLEAVE] = {
463                 .create = mpol_new_interleave,
464                 .rebind = mpol_rebind_nodemask,
465         },
466         [MPOL_PREFERRED] = {
467                 .create = mpol_new_preferred,
468                 .rebind = mpol_rebind_preferred,
469         },
470         [MPOL_BIND] = {
471                 .create = mpol_new_bind,
472                 .rebind = mpol_rebind_nodemask,
473         },
474 };
475 
476 static void migrate_page_add(struct page *page, struct list_head *pagelist,
477                                 unsigned long flags);
478 
479 /*
 480  * Scan through pages, checking whether they meet certain conditions,
481  * and move them to the pagelist if they do.
482  */
483 static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
484                 unsigned long addr, unsigned long end,
485                 const nodemask_t *nodes, unsigned long flags,
486                 void *private)
487 {
488         pte_t *orig_pte;
489         pte_t *pte;
490         spinlock_t *ptl;
491 
492         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
493         do {
494                 struct page *page;
495                 int nid;
496 
497                 if (!pte_present(*pte))
498                         continue;
499                 page = vm_normal_page(vma, addr, *pte);
500                 if (!page)
501                         continue;
502                 /*
503                  * vm_normal_page() filters out zero pages, but there might
504                  * still be PageReserved pages to skip, perhaps in a VDSO.
505                  */
506                 if (PageReserved(page))
507                         continue;
508                 nid = page_to_nid(page);
509                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
510                         continue;
511 
512                 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
513                         migrate_page_add(page, private, flags);
514                 else
515                         break;
516         } while (pte++, addr += PAGE_SIZE, addr != end);
517         pte_unmap_unlock(orig_pte, ptl);
518         return addr != end;
519 }
520 
521 static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
522                 pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
523                                     void *private)
524 {
525 #ifdef CONFIG_HUGETLB_PAGE
526         int nid;
527         struct page *page;
528         spinlock_t *ptl;
529         pte_t entry;
530 
531         ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
532         entry = huge_ptep_get((pte_t *)pmd);
533         if (!pte_present(entry))
534                 goto unlock;
535         page = pte_page(entry);
536         nid = page_to_nid(page);
537         if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
538                 goto unlock;
539         /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
540         if (flags & (MPOL_MF_MOVE_ALL) ||
541             (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
542                 isolate_huge_page(page, private);
543 unlock:
544         spin_unlock(ptl);
545 #else
546         BUG();
547 #endif
548 }
549 
550 static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
551                 unsigned long addr, unsigned long end,
552                 const nodemask_t *nodes, unsigned long flags,
553                 void *private)
554 {
555         pmd_t *pmd;
556         unsigned long next;
557 
558         pmd = pmd_offset(pud, addr);
559         do {
560                 next = pmd_addr_end(addr, end);
561                 if (!pmd_present(*pmd))
562                         continue;
563                 if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
564                         queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
565                                                 flags, private);
566                         continue;
567                 }
568                 split_huge_page_pmd(vma, addr, pmd);
569                 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
570                         continue;
571                 if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
572                                     flags, private))
573                         return -EIO;
574         } while (pmd++, addr = next, addr != end);
575         return 0;
576 }
577 
578 static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
579                 unsigned long addr, unsigned long end,
580                 const nodemask_t *nodes, unsigned long flags,
581                 void *private)
582 {
583         pud_t *pud;
584         unsigned long next;
585 
586         pud = pud_offset(pgd, addr);
587         do {
588                 next = pud_addr_end(addr, end);
589                 if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
590                         continue;
591                 if (pud_none_or_clear_bad(pud))
592                         continue;
593                 if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
594                                     flags, private))
595                         return -EIO;
596         } while (pud++, addr = next, addr != end);
597         return 0;
598 }
599 
600 static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
601                 unsigned long addr, unsigned long end,
602                 const nodemask_t *nodes, unsigned long flags,
603                 void *private)
604 {
605         pgd_t *pgd;
606         unsigned long next;
607 
608         pgd = pgd_offset(vma->vm_mm, addr);
609         do {
610                 next = pgd_addr_end(addr, end);
611                 if (pgd_none_or_clear_bad(pgd))
612                         continue;
613                 if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
614                                     flags, private))
615                         return -EIO;
616         } while (pgd++, addr = next, addr != end);
617         return 0;
618 }
619 
620 #ifdef CONFIG_NUMA_BALANCING
621 /*
 622  * This is used to mark a range of virtual addresses as inaccessible.
 623  * The protections are later cleared by a NUMA hinting fault. Depending on these
624  * faults, pages may be migrated for better NUMA placement.
625  *
626  * This is assuming that NUMA faults are handled using PROT_NONE. If
627  * an architecture makes a different choice, it will need further
628  * changes to the core.
629  */
630 unsigned long change_prot_numa(struct vm_area_struct *vma,
631                         unsigned long addr, unsigned long end)
632 {
633         int nr_updated;
634 
635         nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
636         if (nr_updated)
637                 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
638 
639         return nr_updated;
640 }
641 #else
642 static unsigned long change_prot_numa(struct vm_area_struct *vma,
643                         unsigned long addr, unsigned long end)
644 {
645         return 0;
646 }
647 #endif /* CONFIG_NUMA_BALANCING */
648 
649 /*
650  * Walk through page tables and collect pages to be migrated.
651  *
652  * If pages found in a given range are on a set of nodes (determined by
 653  * @nodes and @flags), they are isolated and queued to the pagelist
 654  * passed via @private.
655  */
656 static int
657 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
658                 const nodemask_t *nodes, unsigned long flags, void *private)
659 {
660         int err = 0;
661         struct vm_area_struct *vma, *prev;
662 
663         vma = find_vma(mm, start);
664         if (!vma)
665                 return -EFAULT;
666         prev = NULL;
667         for (; vma && vma->vm_start < end; vma = vma->vm_next) {
668                 unsigned long endvma = vma->vm_end;
669 
670                 if (endvma > end)
671                         endvma = end;
672                 if (vma->vm_start > start)
673                         start = vma->vm_start;
674 
675                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
676                         if (!vma->vm_next && vma->vm_end < end)
677                                 return -EFAULT;
678                         if (prev && prev->vm_end < vma->vm_start)
679                                 return -EFAULT;
680                 }
681 
682                 if (flags & MPOL_MF_LAZY) {
683                         change_prot_numa(vma, start, endvma);
684                         goto next;
685                 }
686 
687                 if ((flags & MPOL_MF_STRICT) ||
688                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
689                       vma_migratable(vma))) {
690 
691                         err = queue_pages_pgd_range(vma, start, endvma, nodes,
692                                                 flags, private);
693                         if (err)
694                                 break;
695                 }
696 next:
697                 prev = vma;
698         }
699         return err;
700 }
701 
702 /*
 703  * Apply policy to a single VMA.
704  * This must be called with the mmap_sem held for writing.
705  */
706 static int vma_replace_policy(struct vm_area_struct *vma,
707                                                 struct mempolicy *pol)
708 {
709         int err;
710         struct mempolicy *old;
711         struct mempolicy *new;
712 
713         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
714                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
715                  vma->vm_ops, vma->vm_file,
716                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
717 
718         new = mpol_dup(pol);
719         if (IS_ERR(new))
720                 return PTR_ERR(new);
721 
722         if (vma->vm_ops && vma->vm_ops->set_policy) {
723                 err = vma->vm_ops->set_policy(vma, new);
724                 if (err)
725                         goto err_out;
726         }
727 
728         old = vma->vm_policy;
729         vma->vm_policy = new; /* protected by mmap_sem */
730         mpol_put(old);
731 
732         return 0;
733  err_out:
734         mpol_put(new);
735         return err;
736 }
737 
738 /* Step 2: apply policy to a range and do splits. */
739 static int mbind_range(struct mm_struct *mm, unsigned long start,
740                        unsigned long end, struct mempolicy *new_pol)
741 {
742         struct vm_area_struct *next;
743         struct vm_area_struct *prev;
744         struct vm_area_struct *vma;
745         int err = 0;
746         pgoff_t pgoff;
747         unsigned long vmstart;
748         unsigned long vmend;
749 
750         vma = find_vma(mm, start);
751         if (!vma || vma->vm_start > start)
752                 return -EFAULT;
753 
754         prev = vma->vm_prev;
755         if (start > vma->vm_start)
756                 prev = vma;
757 
758         for (; vma && vma->vm_start < end; prev = vma, vma = next) {
759                 next = vma->vm_next;
760                 vmstart = max(start, vma->vm_start);
761                 vmend   = min(end, vma->vm_end);
762 
763                 if (mpol_equal(vma_policy(vma), new_pol))
764                         continue;
765 
766                 pgoff = vma->vm_pgoff +
767                         ((vmstart - vma->vm_start) >> PAGE_SHIFT);
768                 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
769                                   vma->anon_vma, vma->vm_file, pgoff,
770                                   new_pol);
771                 if (prev) {
772                         vma = prev;
773                         next = vma->vm_next;
774                         if (mpol_equal(vma_policy(vma), new_pol))
775                                 continue;
776                         /* vma_merge() joined vma && vma->next, case 8 */
777                         goto replace;
778                 }
779                 if (vma->vm_start != vmstart) {
780                         err = split_vma(vma->vm_mm, vma, vmstart, 1);
781                         if (err)
782                                 goto out;
783                 }
784                 if (vma->vm_end != vmend) {
785                         err = split_vma(vma->vm_mm, vma, vmend, 0);
786                         if (err)
787                                 goto out;
788                 }
789  replace:
790                 err = vma_replace_policy(vma, new_pol);
791                 if (err)
792                         goto out;
793         }
794 
795  out:
796         return err;
797 }
798 
799 /* Set the process memory policy */
800 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
801                              nodemask_t *nodes)
802 {
803         struct mempolicy *new, *old;
804         struct mm_struct *mm = current->mm;
805         NODEMASK_SCRATCH(scratch);
806         int ret;
807 
808         if (!scratch)
809                 return -ENOMEM;
810 
811         new = mpol_new(mode, flags, nodes);
812         if (IS_ERR(new)) {
813                 ret = PTR_ERR(new);
814                 goto out;
815         }
816         /*
817          * prevent changing our mempolicy while show_numa_maps()
818          * is using it.
819          * Note:  do_set_mempolicy() can be called at init time
820          * with no 'mm'.
821          */
822         if (mm)
823                 down_write(&mm->mmap_sem);
824         task_lock(current);
825         ret = mpol_set_nodemask(new, nodes, scratch);
826         if (ret) {
827                 task_unlock(current);
828                 if (mm)
829                         up_write(&mm->mmap_sem);
830                 mpol_put(new);
831                 goto out;
832         }
833         old = current->mempolicy;
834         current->mempolicy = new;
835         if (new && new->mode == MPOL_INTERLEAVE &&
836             nodes_weight(new->v.nodes))
837                 current->il_next = first_node(new->v.nodes);
838         task_unlock(current);
839         if (mm)
840                 up_write(&mm->mmap_sem);
841 
842         mpol_put(old);
843         ret = 0;
844 out:
845         NODEMASK_SCRATCH_FREE(scratch);
846         return ret;
847 }
848 
849 /*
 850  * Return the nodemask for a policy, for a get_mempolicy() query
851  *
852  * Called with task's alloc_lock held
853  */
854 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
855 {
856         nodes_clear(*nodes);
857         if (p == &default_policy)
858                 return;
859 
860         switch (p->mode) {
861         case MPOL_BIND:
862                 /* Fall through */
863         case MPOL_INTERLEAVE:
864                 *nodes = p->v.nodes;
865                 break;
866         case MPOL_PREFERRED:
867                 if (!(p->flags & MPOL_F_LOCAL))
868                         node_set(p->v.preferred_node, *nodes);
869                 /* else return empty node mask for local allocation */
870                 break;
871         default:
872                 BUG();
873         }
874 }
875 
876 static int lookup_node(struct mm_struct *mm, unsigned long addr)
877 {
878         struct page *p;
879         int err;
880 
881         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
882         if (err >= 0) {
883                 err = page_to_nid(p);
884                 put_page(p);
885         }
886         return err;
887 }
888 
889 /* Retrieve NUMA policy */
890 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
891                              unsigned long addr, unsigned long flags)
892 {
893         int err;
894         struct mm_struct *mm = current->mm;
895         struct vm_area_struct *vma = NULL;
896         struct mempolicy *pol = current->mempolicy;
897 
898         if (flags &
899                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
900                 return -EINVAL;
901 
902         if (flags & MPOL_F_MEMS_ALLOWED) {
903                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
904                         return -EINVAL;
905                 *policy = 0;    /* just so it's initialized */
906                 task_lock(current);
907                 *nmask  = cpuset_current_mems_allowed;
908                 task_unlock(current);
909                 return 0;
910         }
911 
912         if (flags & MPOL_F_ADDR) {
913                 /*
914                  * Do NOT fall back to task policy if the
915                  * vma/shared policy at addr is NULL.  We
916                  * want to return MPOL_DEFAULT in this case.
917                  */
918                 down_read(&mm->mmap_sem);
919                 vma = find_vma_intersection(mm, addr, addr+1);
920                 if (!vma) {
921                         up_read(&mm->mmap_sem);
922                         return -EFAULT;
923                 }
924                 if (vma->vm_ops && vma->vm_ops->get_policy)
925                         pol = vma->vm_ops->get_policy(vma, addr);
926                 else
927                         pol = vma->vm_policy;
928         } else if (addr)
929                 return -EINVAL;
930 
931         if (!pol)
932                 pol = &default_policy;  /* indicates default behavior */
933 
934         if (flags & MPOL_F_NODE) {
935                 if (flags & MPOL_F_ADDR) {
936                         err = lookup_node(mm, addr);
937                         if (err < 0)
938                                 goto out;
939                         *policy = err;
940                 } else if (pol == current->mempolicy &&
941                                 pol->mode == MPOL_INTERLEAVE) {
942                         *policy = current->il_next;
943                 } else {
944                         err = -EINVAL;
945                         goto out;
946                 }
947         } else {
948                 *policy = pol == &default_policy ? MPOL_DEFAULT :
949                                                 pol->mode;
950                 /*
951                  * Internal mempolicy flags must be masked off before exposing
952                  * the policy to userspace.
953                  */
954                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
955         }
956 
957         if (vma) {
958                 up_read(&current->mm->mmap_sem);
959                 vma = NULL;
960         }
961 
962         err = 0;
963         if (nmask) {
964                 if (mpol_store_user_nodemask(pol)) {
965                         *nmask = pol->w.user_nodemask;
966                 } else {
967                         task_lock(current);
968                         get_policy_nodemask(pol, nmask);
969                         task_unlock(current);
970                 }
971         }
972 
973  out:
974         mpol_cond_put(pol);
975         if (vma)
976                 up_read(&current->mm->mmap_sem);
977         return err;
978 }
979 
980 #ifdef CONFIG_MIGRATION
981 /*
982  * page migration
983  */
984 static void migrate_page_add(struct page *page, struct list_head *pagelist,
985                                 unsigned long flags)
986 {
987         /*
988          * Avoid migrating a page that is shared with others.
989          */
990         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
991                 if (!isolate_lru_page(page)) {
992                         list_add_tail(&page->lru, pagelist);
993                         inc_zone_page_state(page, NR_ISOLATED_ANON +
994                                             page_is_file_cache(page));
995                 }
996         }
997 }
998 
999 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
1000 {
1001         if (PageHuge(page))
1002                 return alloc_huge_page_node(page_hstate(compound_head(page)),
1003                                         node);
1004         else
1005                 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
1006 }
1007 
1008 /*
1009  * Migrate pages from one node to a target node.
1010  * Returns error or the number of pages not migrated.
1011  */
1012 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1013                            int flags)
1014 {
1015         nodemask_t nmask;
1016         LIST_HEAD(pagelist);
1017         int err = 0;
1018 
1019         nodes_clear(nmask);
1020         node_set(source, nmask);
1021 
1022         /*
1023          * This does not "check" the range but isolates all pages that
1024          * need migration.  Between passing in the full user address
1025          * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1026          */
1027         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1028         queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1029                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1030 
1031         if (!list_empty(&pagelist)) {
1032                 err = migrate_pages(&pagelist, new_node_page, dest,
1033                                         MIGRATE_SYNC, MR_SYSCALL);
1034                 if (err)
1035                         putback_movable_pages(&pagelist);
1036         }
1037 
1038         return err;
1039 }
1040 
1041 /*
1042  * Move pages between the two nodesets so as to preserve the physical
1043  * layout as much as possible.
1044  *
1045  * Returns the number of pages that could not be moved.
1046  */
1047 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1048                      const nodemask_t *to, int flags)
1049 {
1050         int busy = 0;
1051         int err;
1052         nodemask_t tmp;
1053 
1054         err = migrate_prep();
1055         if (err)
1056                 return err;
1057 
1058         down_read(&mm->mmap_sem);
1059 
1060         err = migrate_vmas(mm, from, to, flags);
1061         if (err)
1062                 goto out;
1063 
1064         /*
1065          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1066          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1067          * bit in 'tmp', and return that <source, dest> pair for migration.
1068          * The pair of nodemasks 'to' and 'from' define the map.
1069          *
1070          * If no pair of bits is found that way, fallback to picking some
1071          * pair of 'source' and 'dest' bits that are not the same.  If the
1072          * 'source' and 'dest' bits are the same, this represents a node
1073          * that will be migrating to itself, so no pages need move.
1074          *
1075          * If no bits are left in 'tmp', or if all remaining bits left
1076          * in 'tmp' correspond to the same bit in 'to', return false
1077          * (nothing left to migrate).
1078          *
1079          * This lets us pick a pair of nodes to migrate between, such that
1080          * if possible the dest node is not already occupied by some other
1081          * source node, minimizing the risk of overloading the memory on a
1082          * node that would happen if we migrated incoming memory to a node
1083          * before migrating the outgoing memory sourced from that same node.
1084          *
1085          * A single scan of tmp is sufficient.  As we go, we remember the
1086          * most recent <s, d> pair that moved (s != d).  If we find a pair
1087          * that not only moved, but what's better, moved to an empty slot
1088          * (d is not set in tmp), then we break out then, with that pair.
1089          * Otherwise, when we finish scanning tmp, we at least have the
1090          * most recent <s, d> pair that moved.  If we get all the way through
1091          * the scan of tmp without finding any node that moved, much less
1092          * moved to an empty node, then there is nothing left worth migrating.
1093          */
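
        /*
         * Editorial worked example (assumed node numbers): for from = {0,1}
         * and to = {1,2}, node 0 remaps to 1 (still set in tmp, so keep
         * scanning) while node 1 remaps to 2 (not in tmp), so <1,2> is
         * migrated first; node 1 is emptied before node 0's pages are moved
         * into it by the next pass as <0,1>.
         */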
1094 
1095         tmp = *from;
1096         while (!nodes_empty(tmp)) {
1097                 int s, d;
1098                 int source = NUMA_NO_NODE;
1099                 int dest = 0;
1100 
1101                 for_each_node_mask(s, tmp) {
1102 
1103                         /*
1104                          * do_migrate_pages() tries to maintain the relative
1105                          * node relationship of the pages established between
1106                          * threads and memory areas.
1107                          *
1108                          * However, if the number of source nodes is not equal to
1109                          * the number of destination nodes, we cannot preserve
1110                          * this relative node relationship.  In that case, skip
1111                          * copying memory from a node that is in the destination
1112                          * mask.
1113                          *
1114                          * Example: [2,3,4] -> [3,4,5] moves everything.
1115                          *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1116                          */
1117 
1118                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
1119                                                 (node_isset(s, *to)))
1120                                 continue;
1121 
1122                         d = node_remap(s, *from, *to);
1123                         if (s == d)
1124                                 continue;
1125 
1126                         source = s;     /* Node moved. Memorize */
1127                         dest = d;
1128 
1129                         /* dest not in remaining from nodes? */
1130                         if (!node_isset(dest, tmp))
1131                                 break;
1132                 }
1133                 if (source == NUMA_NO_NODE)
1134                         break;
1135 
1136                 node_clear(source, tmp);
1137                 err = migrate_to_node(mm, source, dest, flags);
1138                 if (err > 0)
1139                         busy += err;
1140                 if (err < 0)
1141                         break;
1142         }
1143 out:
1144         up_read(&mm->mmap_sem);
1145         if (err < 0)
1146                 return err;
1147         return busy;
1148 
1149 }
1150 
1151 /*
1152  * Allocate a new page for page migration based on vma policy.
1153  * Start by assuming the page is mapped by the same vma that contains @start.
1154  * Search forward from there, if not.  N.B., this assumes that the
1155  * list of pages handed to migrate_pages()--which is how we get here--
1156  * is in virtual address order.
1157  */
1158 static struct page *new_page(struct page *page, unsigned long start, int **x)
1159 {
1160         struct vm_area_struct *vma;
1161         unsigned long uninitialized_var(address);
1162 
1163         vma = find_vma(current->mm, start);
1164         while (vma) {
1165                 address = page_address_in_vma(page, vma);
1166                 if (address != -EFAULT)
1167                         break;
1168                 vma = vma->vm_next;
1169         }
1170 
1171         if (PageHuge(page)) {
1172                 BUG_ON(!vma);
1173                 return alloc_huge_page_noerr(vma, address, 1);
1174         }
1175         /*
1176          * if !vma, alloc_page_vma() will use task or system default policy
1177          */
1178         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1179 }
1180 #else
1181 
1182 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1183                                 unsigned long flags)
1184 {
1185 }
1186 
1187 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1188                      const nodemask_t *to, int flags)
1189 {
1190         return -ENOSYS;
1191 }
1192 
1193 static struct page *new_page(struct page *page, unsigned long start, int **x)
1194 {
1195         return NULL;
1196 }
1197 #endif
1198 
1199 static long do_mbind(unsigned long start, unsigned long len,
1200                      unsigned short mode, unsigned short mode_flags,
1201                      nodemask_t *nmask, unsigned long flags)
1202 {
1203         struct mm_struct *mm = current->mm;
1204         struct mempolicy *new;
1205         unsigned long end;
1206         int err;
1207         LIST_HEAD(pagelist);
1208 
1209         if (flags & ~(unsigned long)MPOL_MF_VALID)
1210                 return -EINVAL;
1211         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1212                 return -EPERM;
1213 
1214         if (start & ~PAGE_MASK)
1215                 return -EINVAL;
1216 
1217         if (mode == MPOL_DEFAULT)
1218                 flags &= ~MPOL_MF_STRICT;
1219 
1220         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1221         end = start + len;
1222 
1223         if (end < start)
1224                 return -EINVAL;
1225         if (end == start)
1226                 return 0;
1227 
1228         new = mpol_new(mode, mode_flags, nmask);
1229         if (IS_ERR(new))
1230                 return PTR_ERR(new);
1231 
1232         if (flags & MPOL_MF_LAZY)
1233                 new->flags |= MPOL_F_MOF;
1234 
1235         /*
1236          * If we are using the default policy then operation
1237          * on discontinuous address spaces is okay after all
1238          */
1239         if (!new)
1240                 flags |= MPOL_MF_DISCONTIG_OK;
1241 
1242         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1243                  start, start + len, mode, mode_flags,
1244                  nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1245 
1246         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1247 
1248                 err = migrate_prep();
1249                 if (err)
1250                         goto mpol_out;
1251         }
1252         {
1253                 NODEMASK_SCRATCH(scratch);
1254                 if (scratch) {
1255                         down_write(&mm->mmap_sem);
1256                         task_lock(current);
1257                         err = mpol_set_nodemask(new, nmask, scratch);
1258                         task_unlock(current);
1259                         if (err)
1260                                 up_write(&mm->mmap_sem);
1261                 } else
1262                         err = -ENOMEM;
1263                 NODEMASK_SCRATCH_FREE(scratch);
1264         }
1265         if (err)
1266                 goto mpol_out;
1267 
1268         err = queue_pages_range(mm, start, end, nmask,
1269                           flags | MPOL_MF_INVERT, &pagelist);
1270         if (!err)
1271                 err = mbind_range(mm, start, end, new);
1272 
1273         if (!err) {
1274                 int nr_failed = 0;
1275 
1276                 if (!list_empty(&pagelist)) {
1277                         WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1278                         nr_failed = migrate_pages(&pagelist, new_page,
1279                                 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1280                         if (nr_failed)
1281                                 putback_movable_pages(&pagelist);
1282                 }
1283 
1284                 if (nr_failed && (flags & MPOL_MF_STRICT))
1285                         err = -EIO;
1286         } else
1287                 putback_movable_pages(&pagelist);
1288 
1289         up_write(&mm->mmap_sem);
1290  mpol_out:
1291         mpol_put(new);
1292         return err;
1293 }
1294 
1295 /*
1296  * User space interface with variable-sized bitmaps for nodelists.
1297  */
1298 
1299 /* Copy a node mask from user space. */
1300 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1301                      unsigned long maxnode)
1302 {
1303         unsigned long k;
1304         unsigned long nlongs;
1305         unsigned long endmask;
1306 
1307         --maxnode;
1308         nodes_clear(*nodes);
1309         if (maxnode == 0 || !nmask)
1310                 return 0;
1311         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1312                 return -EINVAL;
1313 
1314         nlongs = BITS_TO_LONGS(maxnode);
1315         if ((maxnode % BITS_PER_LONG) == 0)
1316                 endmask = ~0UL;
1317         else
1318                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1319 
1320         /* When the user specified more nodes than supported, just check
1321            that the unsupported part is all zero. */
1322         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1323                 if (nlongs > PAGE_SIZE/sizeof(long))
1324                         return -EINVAL;
1325                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1326                         unsigned long t;
1327                         if (get_user(t, nmask + k))
1328                                 return -EFAULT;
1329                         if (k == nlongs - 1) {
1330                                 if (t & endmask)
1331                                         return -EINVAL;
1332                         } else if (t)
1333                                 return -EINVAL;
1334                 }
1335                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1336                 endmask = ~0UL;
1337         }
1338 
1339         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1340                 return -EFAULT;
1341         nodes_addr(*nodes)[nlongs-1] &= endmask;
1342         return 0;
1343 }
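
/*
 * Editorial worked example (assuming 64-bit longs): a call with maxnode = 65
 * is reduced to 64 usable bits (nodes 0-63), so nlongs = BITS_TO_LONGS(64) = 1
 * and endmask = ~0UL; with maxnode = 17 only bits 0-15 of the single long are
 * kept, endmask = (1UL << 16) - 1, and any higher bits in the copied word are
 * silently masked off.
 */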
1344 
1345 /* Copy a kernel node mask to user space */
1346 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1347                               nodemask_t *nodes)
1348 {
1349         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1350         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1351 
1352         if (copy > nbytes) {
1353                 if (copy > PAGE_SIZE)
1354                         return -EINVAL;
1355                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1356                         return -EFAULT;
1357                 copy = nbytes;
1358         }
1359         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1360 }
1361 
1362 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1363                 unsigned long, mode, unsigned long __user *, nmask,
1364                 unsigned long, maxnode, unsigned, flags)
1365 {
1366         nodemask_t nodes;
1367         int err;
1368         unsigned short mode_flags;
1369 
1370         mode_flags = mode & MPOL_MODE_FLAGS;
1371         mode &= ~MPOL_MODE_FLAGS;
1372         if (mode >= MPOL_MAX)
1373                 return -EINVAL;
1374         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1375             (mode_flags & MPOL_F_RELATIVE_NODES))
1376                 return -EINVAL;
1377         err = get_nodes(&nodes, nmask, maxnode);
1378         if (err)
1379                 return err;
1380         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1381 }
1382 
1383 /* Set the process memory policy */
1384 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1385                 unsigned long, maxnode)
1386 {
1387         int err;
1388         nodemask_t nodes;
1389         unsigned short flags;
1390 
1391         flags = mode & MPOL_MODE_FLAGS;
1392         mode &= ~MPOL_MODE_FLAGS;
1393         if ((unsigned int)mode >= MPOL_MAX)
1394                 return -EINVAL;
1395         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1396                 return -EINVAL;
1397         err = get_nodes(&nodes, nmask, maxnode);
1398         if (err)
1399                 return err;
1400         return do_set_mempolicy(mode, flags, &nodes);
1401 }
1402 
1403 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1404                 const unsigned long __user *, old_nodes,
1405                 const unsigned long __user *, new_nodes)
1406 {
1407         const struct cred *cred = current_cred(), *tcred;
1408         struct mm_struct *mm = NULL;
1409         struct task_struct *task;
1410         nodemask_t task_nodes;
1411         int err;
1412         nodemask_t *old;
1413         nodemask_t *new;
1414         NODEMASK_SCRATCH(scratch);
1415 
1416         if (!scratch)
1417                 return -ENOMEM;
1418 
1419         old = &scratch->mask1;
1420         new = &scratch->mask2;
1421 
1422         err = get_nodes(old, old_nodes, maxnode);
1423         if (err)
1424                 goto out;
1425 
1426         err = get_nodes(new, new_nodes, maxnode);
1427         if (err)
1428                 goto out;
1429 
1430         /* Find the mm_struct */
1431         rcu_read_lock();
1432         task = pid ? find_task_by_vpid(pid) : current;
1433         if (!task) {
1434                 rcu_read_unlock();
1435                 err = -ESRCH;
1436                 goto out;
1437         }
1438         get_task_struct(task);
1439 
1440         err = -EINVAL;
1441 
1442         /*
1443          * Check if this process has the right to modify the specified
1444          * process. The right exists if the process has administrative
1445          * capabilities, superuser privileges or the same
1446          * userid as the target process.
1447          */
1448         tcred = __task_cred(task);
1449         if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1450             !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1451             !capable(CAP_SYS_NICE)) {
1452                 rcu_read_unlock();
1453                 err = -EPERM;
1454                 goto out_put;
1455         }
1456         rcu_read_unlock();
1457 
1458         task_nodes = cpuset_mems_allowed(task);
1459         /* Is the user allowed to access the target nodes? */
1460         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1461                 err = -EPERM;
1462                 goto out_put;
1463         }
1464 
1465         if (!nodes_subset(*new, node_states[N_MEMORY])) {
1466                 err = -EINVAL;
1467                 goto out_put;
1468         }
1469 
1470         err = security_task_movememory(task);
1471         if (err)
1472                 goto out_put;
1473 
1474         mm = get_task_mm(task);
1475         put_task_struct(task);
1476 
1477         if (!mm) {
1478                 err = -EINVAL;
1479                 goto out;
1480         }
1481 
1482         err = do_migrate_pages(mm, old, new,
1483                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1484 
1485         mmput(mm);
1486 out:
1487         NODEMASK_SCRATCH_FREE(scratch);
1488 
1489         return err;
1490 
1491 out_put:
1492         put_task_struct(task);
1493         goto out;
1494 
1495 }
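
/*
 * Example (illustrative sketch, not part of the original source): moving the
 * pages of another process from node 0 to node 1 via the libnuma <numaif.h>
 * wrapper.  The pid is an assumed target; as the checks above show, this
 * needs CAP_SYS_NICE or credentials matching the target process, and on
 * success the call returns the number of pages that could not be moved.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	unsigned long old_nodes = 1UL << 0;
 *	unsigned long new_nodes = 1UL << 1;
 *
 *	if (migrate_pages(pid, 8 * sizeof(unsigned long),
 *			  &old_nodes, &new_nodes) < 0)
 *		perror("migrate_pages");
 */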
1496 
1497 
1498 /* Retrieve NUMA policy */
1499 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1500                 unsigned long __user *, nmask, unsigned long, maxnode,
1501                 unsigned long, addr, unsigned long, flags)
1502 {
1503         int err;
1504         int uninitialized_var(pval);
1505         nodemask_t nodes;
1506 
1507         if (nmask != NULL && maxnode < MAX_NUMNODES)
1508                 return -EINVAL;
1509 
1510         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1511 
1512         if (err)
1513                 return err;
1514 
1515         if (policy && put_user(pval, policy))
1516                 return -EFAULT;
1517 
1518         if (nmask)
1519                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1520 
1521         return err;
1522 }
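
/*
 * Example (illustrative sketch, not part of the original source): querying
 * the policy that governs a given address via the libnuma <numaif.h>
 * wrapper.  Because of the check above, a non-NULL nodemask must provide at
 * least MAX_NUMNODES bits, so a generously sized buffer is used; addr is an
 * assumed address inside some mapping of interest.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	int mode;
 *	unsigned long nodemask[1024 / (8 * sizeof(unsigned long))];
 *
 *	if (get_mempolicy(&mode, nodemask, 1024, addr, MPOL_F_ADDR))
 *		perror("get_mempolicy");
 */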
1523 
1524 #ifdef CONFIG_COMPAT
1525 
1526 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1527                        compat_ulong_t __user *, nmask,
1528                        compat_ulong_t, maxnode,
1529                        compat_ulong_t, addr, compat_ulong_t, flags)
1530 {
1531         long err;
1532         unsigned long __user *nm = NULL;
1533         unsigned long nr_bits, alloc_size;
1534         DECLARE_BITMAP(bm, MAX_NUMNODES);
1535 
1536         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1537         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1538 
1539         if (nmask)
1540                 nm = compat_alloc_user_space(alloc_size);
1541 
1542         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1543 
1544         if (!err && nmask) {
1545                 unsigned long copy_size;
1546                 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1547                 err = copy_from_user(bm, nm, copy_size);
1548                 /* ensure entire bitmap is zeroed */
1549                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1550                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1551         }
1552 
1553         return err;
1554 }
1555 
1556 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1557                        compat_ulong_t, maxnode)
1558 {
1559         long err = 0;
1560         unsigned long __user *nm = NULL;
1561         unsigned long nr_bits, alloc_size;
1562         DECLARE_BITMAP(bm, MAX_NUMNODES);
1563 
1564         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1565         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1566 
1567         if (nmask) {
1568                 err = compat_get_bitmap(bm, nmask, nr_bits);
1569                 nm = compat_alloc_user_space(alloc_size);
1570                 err |= copy_to_user(nm, bm, alloc_size);
1571         }
1572 
1573         if (err)
1574                 return -EFAULT;
1575 
1576         return sys_set_mempolicy(mode, nm, nr_bits+1);
1577 }
1578 
1579 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1580                        compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1581                        compat_ulong_t, maxnode, compat_ulong_t, flags)
1582 {
1583         long err = 0;
1584         unsigned long __user *nm = NULL;
1585         unsigned long nr_bits, alloc_size;
1586         nodemask_t bm;
1587 
1588         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1589         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1590 
1591         if (nmask) {
1592                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1593                 nm = compat_alloc_user_space(alloc_size);
1594                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1595         }
1596 
1597         if (err)
1598                 return -EFAULT;
1599 
1600         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1601 }
1602 
1603 #endif
1604 
1605 /*
1606  * get_vma_policy(@task, @vma, @addr)
1607  * @task  - task for fallback if vma policy == default
1608  * @vma   - virtual memory area whose policy is sought
1609  * @addr  - address in @vma for shared policy lookup
1610  *
1611  * Returns effective policy for a VMA at specified address.
1612  * Falls back to @task or system default policy, as necessary.
1613  * The current or another task's mempolicy and non-shared vma policies must be
1614  * protected by task_lock(task) by the caller.
1615  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1616  * count--added by the get_policy() vm_op, as appropriate--to protect against
1617  * freeing by another task.  It is the caller's responsibility to free the
1618  * extra reference for shared policies.
1619  */
1620 struct mempolicy *get_vma_policy(struct task_struct *task,
1621                 struct vm_area_struct *vma, unsigned long addr)
1622 {
1623         struct mempolicy *pol = get_task_policy(task);
1624 
1625         if (vma) {
1626                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1627                         struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1628                                                                         addr);
1629                         if (vpol)
1630                                 pol = vpol;
1631                 } else if (vma->vm_policy) {
1632                         pol = vma->vm_policy;
1633 
1634                         /*
1635                          * shmem_alloc_page() passes MPOL_F_SHARED policy with
1636                          * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1637                          * count on these policies which will be dropped by
1638                          * mpol_cond_put() later
1639                          */
1640                         if (mpol_needs_cond_ref(pol))
1641                                 mpol_get(pol);
1642                 }
1643         }
1644         if (!pol)
1645                 pol = &default_policy;
1646         return pol;
1647 }
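
/*
 * Example (illustrative sketch, not part of the original source) of the
 * reference-counting contract described above: a caller that may see a
 * shared [MPOL_F_SHARED] policy pairs the lookup with mpol_cond_put(), as
 * mpol_misplaced() below does.
 *
 *	struct mempolicy *pol;
 *
 *	pol = get_vma_policy(current, vma, addr);
 *	... consult pol->mode, pol->flags, pol->v.nodes ...
 *	mpol_cond_put(pol);	(drops the extra ref only for shared policies)
 */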
1648 
1649 bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
1650 {
1651         struct mempolicy *pol = get_task_policy(task);
1652         if (vma) {
1653                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1654                         bool ret = false;
1655 
1656                         pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1657                         if (pol && (pol->flags & MPOL_F_MOF))
1658                                 ret = true;
1659                         mpol_cond_put(pol);
1660 
1661                         return ret;
1662                 } else if (vma->vm_policy) {
1663                         pol = vma->vm_policy;
1664                 }
1665         }
1666 
1667         if (!pol)
1668                 return default_policy.flags & MPOL_F_MOF;
1669 
1670         return pol->flags & MPOL_F_MOF;
1671 }
1672 
1673 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1674 {
1675         enum zone_type dynamic_policy_zone = policy_zone;
1676 
1677         BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1678 
1679         /*
1680          * If policy->v.nodes has movable memory only,
1681          * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1682          *
1683          * policy->v.nodes is intersected with node_states[N_MEMORY],
1684          * so if the following test fails, it implies
1685          * policy->v.nodes has movable memory only.
1686          */
1687         if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1688                 dynamic_policy_zone = ZONE_MOVABLE;
1689 
1690         return zone >= dynamic_policy_zone;
1691 }
1692 
1693 /*
1694  * Return a nodemask representing a mempolicy for filtering nodes for
1695  * page allocation
1696  */
1697 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1698 {
1699         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1700         if (unlikely(policy->mode == MPOL_BIND) &&
1701                         apply_policy_zone(policy, gfp_zone(gfp)) &&
1702                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1703                 return &policy->v.nodes;
1704 
1705         return NULL;
1706 }
1707 
1708 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1709 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1710         int nd)
1711 {
1712         switch (policy->mode) {
1713         case MPOL_PREFERRED:
1714                 if (!(policy->flags & MPOL_F_LOCAL))
1715                         nd = policy->v.preferred_node;
1716                 break;
1717         case MPOL_BIND:
1718                 /*
1719                  * Normally, MPOL_BIND allocations are node-local within the
1720                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1721                  * current node isn't part of the mask, we use the zonelist for
1722                  * the first node in the mask instead.
1723                  */
1724                 if (unlikely(gfp & __GFP_THISNODE) &&
1725                                 unlikely(!node_isset(nd, policy->v.nodes)))
1726                         nd = first_node(policy->v.nodes);
1727                 break;
1728         default:
1729                 BUG();
1730         }
1731         return node_zonelist(nd, gfp);
1732 }
1733 
1734 /* Do dynamic interleaving for a process */
1735 static unsigned interleave_nodes(struct mempolicy *policy)
1736 {
1737         unsigned nid, next;
1738         struct task_struct *me = current;
1739 
1740         nid = me->il_next;
1741         next = next_node(nid, policy->v.nodes);
1742         if (next >= MAX_NUMNODES)
1743                 next = first_node(policy->v.nodes);
1744         if (next < MAX_NUMNODES)
1745                 me->il_next = next;
1746         return nid;
1747 }
1748 
1749 /*
1750  * Depending on the memory policy, provide a node from which to allocate the
1751  * next slab entry.
1752  */
1753 unsigned int mempolicy_slab_node(void)
1754 {
1755         struct mempolicy *policy;
1756         int node = numa_mem_id();
1757 
1758         if (in_interrupt())
1759                 return node;
1760 
1761         policy = current->mempolicy;
1762         if (!policy || policy->flags & MPOL_F_LOCAL)
1763                 return node;
1764 
1765         switch (policy->mode) {
1766         case MPOL_PREFERRED:
1767                 /*
1768                  * handled MPOL_F_LOCAL above
1769                  */
1770                 return policy->v.preferred_node;
1771 
1772         case MPOL_INTERLEAVE:
1773                 return interleave_nodes(policy);
1774 
1775         case MPOL_BIND: {
1776                 /*
1777                  * Follow bind policy behavior and start allocation at the
1778                  * first node.
1779                  */
1780                 struct zonelist *zonelist;
1781                 struct zone *zone;
1782                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1783                 zonelist = &NODE_DATA(node)->node_zonelists[0];
1784                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1785                                                         &policy->v.nodes,
1786                                                         &zone);
1787                 return zone ? zone->node : node;
1788         }
1789 
1790         default:
1791                 BUG();
1792         }
1793 }
1794 
1795 /* Do static interleaving for a VMA with known offset. */
1796 static unsigned offset_il_node(struct mempolicy *pol,
1797                 struct vm_area_struct *vma, unsigned long off)
1798 {
1799         unsigned nnodes = nodes_weight(pol->v.nodes);
1800         unsigned target;
1801         int c;
1802         int nid = NUMA_NO_NODE;
1803 
1804         if (!nnodes)
1805                 return numa_node_id();
1806         target = (unsigned int)off % nnodes;
1807         c = 0;
1808         do {
1809                 nid = next_node(nid, pol->v.nodes);
1810                 c++;
1811         } while (c <= target);
1812         return nid;
1813 }
1814 
1815 /* Determine a node number for interleave */
1816 static inline unsigned interleave_nid(struct mempolicy *pol,
1817                  struct vm_area_struct *vma, unsigned long addr, int shift)
1818 {
1819         if (vma) {
1820                 unsigned long off;
1821 
1822                 /*
1823                  * for small pages, there is no difference between
1824                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1825                  * for huge pages, since vm_pgoff is in units of small
1826                  * pages, we need to shift off the always 0 bits to get
1827                  * a useful offset.
1828                  */
1829                 BUG_ON(shift < PAGE_SHIFT);
1830                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1831                 off += (addr - vma->vm_start) >> shift;
1832                 return offset_il_node(pol, vma, off);
1833         } else
1834                 return interleave_nodes(pol);
1835 }
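
/*
 * Worked example (illustrative, with assumed values and PAGE_SHIFT == 12):
 * for 2MB huge pages (shift == 21), a VMA with vm_pgoff == 0x2000 and a
 * fault 4MB past vm_start gives
 *
 *	off = (0x2000 >> (21 - 12)) + (0x400000 >> 21) = 16 + 2 = 18
 *
 * With four nodes in the interleave mask, offset_il_node() selects
 * 18 % 4 == 2, i.e. the third set node, so a given object offset always
 * maps to the same node no matter where the object is mapped.
 */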
1836 
1837 /*
1838  * Return the bit number of a random bit set in the nodemask.
1839  * (returns NUMA_NO_NODE if nodemask is empty)
1840  */
1841 int node_random(const nodemask_t *maskp)
1842 {
1843         int w, bit = NUMA_NO_NODE;
1844 
1845         w = nodes_weight(*maskp);
1846         if (w)
1847                 bit = bitmap_ord_to_pos(maskp->bits,
1848                         get_random_int() % w, MAX_NUMNODES);
1849         return bit;
1850 }
1851 
1852 #ifdef CONFIG_HUGETLBFS
1853 /*
1854  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol, @nodemask)
1855  * @vma = virtual memory area whose policy is sought
1856  * @addr = address in @vma for shared policy lookup and interleave policy
1857  * @gfp_flags = for requested zone
1858  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1859  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1860  *
1861  * Returns a zonelist suitable for a huge page allocation and a pointer
1862  * to the struct mempolicy for conditional unref after allocation.
1863  * If the effective policy is MPOL_BIND, returns a pointer to the mempolicy's
1864  * @nodemask for filtering the zonelist.
1865  *
1866  * Must be protected by read_mems_allowed_begin()
1867  */
1868 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1869                                 gfp_t gfp_flags, struct mempolicy **mpol,
1870                                 nodemask_t **nodemask)
1871 {
1872         struct zonelist *zl;
1873 
1874         *mpol = get_vma_policy(current, vma, addr);
1875         *nodemask = NULL;       /* assume !MPOL_BIND */
1876 
1877         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1878                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1879                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1880         } else {
1881                 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1882                 if ((*mpol)->mode == MPOL_BIND)
1883                         *nodemask = &(*mpol)->v.nodes;
1884         }
1885         return zl;
1886 }
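
/*
 * Example (illustrative sketch, not part of the original source) of the
 * calling convention, roughly what the hugetlb allocation path does; vma,
 * addr and gfp_mask are assumed caller-provided values.
 *
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	struct zonelist *zl;
 *
 *	zl = huge_zonelist(vma, addr, gfp_mask, &mpol, &nodemask);
 *	... walk zl, honoring nodemask when it is non-NULL ...
 *	mpol_cond_put(mpol);
 */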
1887 
1888 /*
1889  * init_nodemask_of_mempolicy
1890  *
1891  * If the current task's mempolicy is "default" [NULL], return 'false'
1892  * to indicate default policy.  Otherwise, extract the policy nodemask
1893  * for 'bind' or 'interleave' policy into the argument nodemask, or
1894  * initialize the argument nodemask to contain the single node for
1895  * 'preferred' or 'local' policy and return 'true' to indicate presence
1896  * of non-default mempolicy.
1897  *
1898  * We don't bother with reference counting the mempolicy [mpol_get/put]
1899  * because the current task is examining its own mempolicy and a task's
1900  * mempolicy is only ever changed by the task itself.
1901  *
1902  * N.B., it is the caller's responsibility to free a returned nodemask.
1903  */
1904 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1905 {
1906         struct mempolicy *mempolicy;
1907         int nid;
1908 
1909         if (!(mask && current->mempolicy))
1910                 return false;
1911 
1912         task_lock(current);
1913         mempolicy = current->mempolicy;
1914         switch (mempolicy->mode) {
1915         case MPOL_PREFERRED:
1916                 if (mempolicy->flags & MPOL_F_LOCAL)
1917                         nid = numa_node_id();
1918                 else
1919                         nid = mempolicy->v.preferred_node;
1920                 init_nodemask_of_node(mask, nid);
1921                 break;
1922 
1923         case MPOL_BIND:
1924                 /* Fall through */
1925         case MPOL_INTERLEAVE:
1926                 *mask =  mempolicy->v.nodes;
1927                 break;
1928 
1929         default:
1930                 BUG();
1931         }
1932         task_unlock(current);
1933 
1934         return true;
1935 }
1936 #endif
1937 
1938 /*
1939  * mempolicy_nodemask_intersects
1940  *
1941  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1942  * policy.  Otherwise, check for intersection between mask and the policy
1943  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1944  * policy, always return true since it may allocate elsewhere on fallback.
1945  *
1946  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1947  */
1948 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1949                                         const nodemask_t *mask)
1950 {
1951         struct mempolicy *mempolicy;
1952         bool ret = true;
1953 
1954         if (!mask)
1955                 return ret;
1956         task_lock(tsk);
1957         mempolicy = tsk->mempolicy;
1958         if (!mempolicy)
1959                 goto out;
1960 
1961         switch (mempolicy->mode) {
1962         case MPOL_PREFERRED:
1963                 /*
1964                  * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
1965                  * allocate from; the task may fall back to other nodes when OOM.
1966                  * Thus, it's possible for tsk to have allocated memory from
1967                  * nodes in mask.
1968                  */
1969                 break;
1970         case MPOL_BIND:
1971         case MPOL_INTERLEAVE:
1972                 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1973                 break;
1974         default:
1975                 BUG();
1976         }
1977 out:
1978         task_unlock(tsk);
1979         return ret;
1980 }
1981 
1982 /* Allocate a page in interleaved policy.
1983    Own path because it needs to do special accounting. */
1984 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1985                                         unsigned nid)
1986 {
1987         struct zonelist *zl;
1988         struct page *page;
1989 
1990         zl = node_zonelist(nid, gfp);
1991         page = __alloc_pages(gfp, order, zl);
1992         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1993                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1994         return page;
1995 }
1996 
1997 /**
1998  *      alloc_pages_vma - Allocate a page for a VMA.
1999  *
2000  *      @gfp:
2001  *      %GFP_USER    user allocation,
2002  *      %GFP_KERNEL  kernel allocations,
2003  *      %GFP_HIGHMEM highmem/user allocations,
2004  *      %GFP_FS      allocation should not call back into a file system.
2005  *      %GFP_ATOMIC  don't sleep.
2006  *
2007  *      @order:Order of the GFP allocation.
2008  *      @vma:  Pointer to VMA or NULL if not available.
2009  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
2010  *
2011  *      This function allocates a page from the kernel page pool and applies
2012  *      a NUMA policy associated with the VMA or the current process.
2013  *      When VMA is not NULL, the caller must hold down_read on the mmap_sem of the
2014  *      mm_struct of the VMA to prevent it from going away. Should be used for
2015  *      all allocations for pages that will be mapped into
2016  *      user space. Returns NULL when no page can be allocated.
2017  *
2018  *      Should be called with the mmap_sem of the vma's mm held.
2019  */
2020 struct page *
2021 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2022                 unsigned long addr, int node)
2023 {
2024         struct mempolicy *pol;
2025         struct page *page;
2026         unsigned int cpuset_mems_cookie;
2027 
2028 retry_cpuset:
2029         pol = get_vma_policy(current, vma, addr);
2030         cpuset_mems_cookie = read_mems_allowed_begin();
2031 
2032         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
2033                 unsigned nid;
2034 
2035                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2036                 mpol_cond_put(pol);
2037                 page = alloc_page_interleave(gfp, order, nid);
2038                 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2039                         goto retry_cpuset;
2040 
2041                 return page;
2042         }
2043         page = __alloc_pages_nodemask(gfp, order,
2044                                       policy_zonelist(gfp, pol, node),
2045                                       policy_nodemask(gfp, pol));
2046         if (unlikely(mpol_needs_cond_ref(pol)))
2047                 __mpol_put(pol);
2048         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2049                 goto retry_cpuset;
2050         return page;
2051 }
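
/*
 * Example (illustrative sketch, not part of the original source): with
 * CONFIG_NUMA the alloc_page_vma() helper in <linux/gfp.h> funnels into this
 * function with order 0 and the local node, so a fault handler that already
 * holds mmap_sem for read can simply do:
 *
 *	struct page *page;
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */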
2052 
2053 /**
2054  *      alloc_pages_current - Allocate pages.
2055  *
2056  *      @gfp:
2057  *              %GFP_USER   user allocation,
2058  *              %GFP_KERNEL kernel allocation,
2059  *              %GFP_HIGHMEM highmem allocation,
2060  *              %GFP_FS     don't call back into a file system.
2061  *              %GFP_ATOMIC don't sleep.
2062  *      @order: Power of two of allocation size in pages. 0 is a single page.
2063  *
2064  *      Allocate a page from the kernel page pool.  When not in
2065  *      interrupt context, apply the current process' NUMA policy.
2066  *      Returns NULL when no page can be allocated.
2067  *
2068  *      Don't call cpuset_update_task_memory_state() unless
2069  *      1) it's ok to take cpuset_sem (can WAIT), and
2070  *      2) allocating for current task (not interrupt).
2071  */
2072 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2073 {
2074         struct mempolicy *pol = get_task_policy(current);
2075         struct page *page;
2076         unsigned int cpuset_mems_cookie;
2077 
2078         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2079                 pol = &default_policy;
2080 
2081 retry_cpuset:
2082         cpuset_mems_cookie = read_mems_allowed_begin();
2083 
2084         /*
2085          * No reference counting needed for current->mempolicy
2086          * or the system default_policy
2087          */
2088         if (pol->mode == MPOL_INTERLEAVE)
2089                 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2090         else
2091                 page = __alloc_pages_nodemask(gfp, order,
2092                                 policy_zonelist(gfp, pol, numa_node_id()),
2093                                 policy_nodemask(gfp, pol));
2094 
2095         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2096                 goto retry_cpuset;
2097 
2098         return page;
2099 }
2100 EXPORT_SYMBOL(alloc_pages_current);
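
/*
 * Example (illustrative sketch, not part of the original source): on
 * CONFIG_NUMA kernels the generic alloc_pages()/alloc_page() helpers resolve
 * to alloc_pages_current(), so an ordinary allocation such as
 *
 *	struct page *page = alloc_page(GFP_KERNEL);
 *
 *	if (page) {
 *		... use the page ...
 *		__free_page(page);
 *	}
 *
 * already honors the task policy installed via set_mempolicy(2).
 */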
2101 
2102 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2103 {
2104         struct mempolicy *pol = mpol_dup(vma_policy(src));
2105 
2106         if (IS_ERR(pol))
2107                 return PTR_ERR(pol);
2108         dst->vm_policy = pol;
2109         return 0;
2110 }
2111 
2112 /*
2113  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2114  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2115  * with the mems_allowed returned by cpuset_mems_allowed().  This
2116  * keeps mempolicies cpuset relative after its cpuset moves.  See
2117  * further kernel/cpuset.c update_nodemask().
2118  *
2119  * current's mempolicy may be rebound by another task (the task that changes
2120  * cpuset's mems), so we needn't do rebind work for the current task.
2121  */
2122 
2123 /* Slow path of a mempolicy duplicate */
2124 struct mempolicy *__mpol_dup(struct mempolicy *old)
2125 {
2126         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2127 
2128         if (!new)
2129                 return ERR_PTR(-ENOMEM);
2130 
2131         /* task's mempolicy is protected by alloc_lock */
2132         if (old == current->mempolicy) {
2133                 task_lock(current);
2134                 *new = *old;
2135                 task_unlock(current);
2136         } else
2137                 *new = *old;
2138 
2139         if (current_cpuset_is_being_rebound()) {
2140                 nodemask_t mems = cpuset_mems_allowed(current);
2141                 if (new->flags & MPOL_F_REBINDING)
2142                         mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2143                 else
2144                         mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2145         }
2146         atomic_set(&new->refcnt, 1);
2147         return new;
2148 }
2149 
2150 /* Slow path of a mempolicy comparison */
2151 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2152 {
2153         if (!a || !b)
2154                 return false;
2155         if (a->mode != b->mode)
2156                 return false;
2157         if (a->flags != b->flags)
2158                 return false;
2159         if (mpol_store_user_nodemask(a))
2160                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2161                         return false;
2162 
2163         switch (a->mode) {
2164         case MPOL_BIND:
2165                 /* Fall through */
2166         case MPOL_INTERLEAVE:
2167                 return !!nodes_equal(a->v.nodes, b->v.nodes);
2168         case MPOL_PREFERRED:
2169                 return a->v.preferred_node == b->v.preferred_node;
2170         default:
2171                 BUG();
2172                 return false;
2173         }
2174 }
2175 
2176 /*
2177  * Shared memory backing store policy support.
2178  *
2179  * Remember policies even when nobody has shared memory mapped.
2180  * The policies are kept in Red-Black tree linked from the inode.
2181  * They are protected by the sp->lock spinlock, which should be held
2182  * for any accesses to the tree.
2183  */
2184 
2185 /* lookup first element intersecting start-end */
2186 /* Caller holds sp->lock */
2187 static struct sp_node *
2188 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2189 {
2190         struct rb_node *n = sp->root.rb_node;
2191 
2192         while (n) {
2193                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2194 
2195                 if (start >= p->end)
2196                         n = n->rb_right;
2197                 else if (end <= p->start)
2198                         n = n->rb_left;
2199                 else
2200                         break;
2201         }
2202         if (!n)
2203                 return NULL;
2204         for (;;) {
2205                 struct sp_node *w = NULL;
2206                 struct rb_node *prev = rb_prev(n);
2207                 if (!prev)
2208                         break;
2209                 w = rb_entry(prev, struct sp_node, nd);
2210                 if (w->end <= start)
2211                         break;
2212                 n = prev;
2213         }
2214         return rb_entry(n, struct sp_node, nd);
2215 }
2216 
2217 /* Insert a new shared policy into the list. */
2218 /* Caller holds sp->lock */
2219 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2220 {
2221         struct rb_node **p = &sp->root.rb_node;
2222         struct rb_node *parent = NULL;
2223         struct sp_node *nd;
2224 
2225         while (*p) {
2226                 parent = *p;
2227                 nd = rb_entry(parent, struct sp_node, nd);
2228                 if (new->start < nd->start)
2229                         p = &(*p)->rb_left;
2230                 else if (new->end > nd->end)
2231                         p = &(*p)->rb_right;
2232                 else
2233                         BUG();
2234         }
2235         rb_link_node(&new->nd, parent, p);
2236         rb_insert_color(&new->nd, &sp->root);
2237         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2238                  new->policy ? new->policy->mode : 0);
2239 }
2240 
2241 /* Find shared policy intersecting idx */
2242 struct mempolicy *
2243 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2244 {
2245         struct mempolicy *pol = NULL;
2246         struct sp_node *sn;
2247 
2248         if (!sp->root.rb_node)
2249                 return NULL;
2250         spin_lock(&sp->lock);
2251         sn = sp_lookup(sp, idx, idx+1);
2252         if (sn) {
2253                 mpol_get(sn->policy);
2254                 pol = sn->policy;
2255         }
2256         spin_unlock(&sp->lock);
2257         return pol;
2258 }
2259 
2260 static void sp_free(struct sp_node *n)
2261 {
2262         mpol_put(n->policy);
2263         kmem_cache_free(sn_cache, n);
2264 }
2265 
2266 /**
2267  * mpol_misplaced - check whether current page node is valid in policy
2268  *
2269  * @page   - page to be checked
2270  * @vma    - vm area where page mapped
2271  * @addr   - virtual address where page mapped
2272  *
2273  * Look up the current policy node id for vma, addr and compare it to the
2274  * page's node id.
2275  *
2276  * Returns:
2277  *      -1      - not misplaced, page is in the right node
2278  *      node    - node id where the page should be
2279  *
2280  * Policy determination "mimics" alloc_page_vma().
2281  * Called from fault path where we know the vma and faulting address.
2282  */
2283 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2284 {
2285         struct mempolicy *pol;
2286         struct zone *zone;
2287         int curnid = page_to_nid(page);
2288         unsigned long pgoff;
2289         int thiscpu = raw_smp_processor_id();
2290         int thisnid = cpu_to_node(thiscpu);
2291         int polnid = -1;
2292         int ret = -1;
2293 
2294         BUG_ON(!vma);
2295 
2296         pol = get_vma_policy(current, vma, addr);
2297         if (!(pol->flags & MPOL_F_MOF))
2298                 goto out;
2299 
2300         switch (pol->mode) {
2301         case MPOL_INTERLEAVE:
2302                 BUG_ON(addr >= vma->vm_end);
2303                 BUG_ON(addr < vma->vm_start);
2304 
2305                 pgoff = vma->vm_pgoff;
2306                 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2307                 polnid = offset_il_node(pol, vma, pgoff);
2308                 break;
2309 
2310         case MPOL_PREFERRED:
2311                 if (pol->flags & MPOL_F_LOCAL)
2312                         polnid = numa_node_id();
2313                 else
2314                         polnid = pol->v.preferred_node;
2315                 break;
2316 
2317         case MPOL_BIND:
2318                 /*
2319                  * MPOL_BIND allows binding to multiple nodes.
2320                  * Use the current page's node if it is in the policy nodemask,
2321                  * else select the nearest allowed node, if any.
2322                  * If there are no allowed nodes, use the current node [!misplaced].
2323                  */
2324                 if (node_isset(curnid, pol->v.nodes))
2325                         goto out;
2326                 (void)first_zones_zonelist(
2327                                 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2328                                 gfp_zone(GFP_HIGHUSER),
2329                                 &pol->v.nodes, &zone);
2330                 polnid = zone->node;
2331                 break;
2332 
2333         default:
2334                 BUG();
2335         }
2336 
2337         /* Migrate the page towards the node whose CPU is referencing it */
2338         if (pol->flags & MPOL_F_MORON) {
2339                 polnid = thisnid;
2340 
2341                 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2342                         goto out;
2343         }
2344 
2345         if (curnid != polnid)
2346                 ret = polnid;
2347 out:
2348         mpol_cond_put(pol);
2349 
2350         return ret;
2351 }
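
/*
 * Example (illustrative sketch, not part of the original source) of how a
 * NUMA-balancing fault path consumes the return value described above:
 *
 *	int target_nid;
 *
 *	target_nid = mpol_misplaced(page, vma, addr);
 *	if (target_nid == -1)
 *		return;		(already on an acceptable node)
 *	... attempt to move the page to target_nid,
 *	    e.g. via migrate_misplaced_page() ...
 */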
2352 
2353 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2354 {
2355         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2356         rb_erase(&n->nd, &sp->root);
2357         sp_free(n);
2358 }
2359 
2360 static void sp_node_init(struct sp_node *node, unsigned long start,
2361                         unsigned long end, struct mempolicy *pol)
2362 {
2363         node->start = start;
2364         node->end = end;
2365         node->policy = pol;
2366 }
2367 
2368 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2369                                 struct mempolicy *pol)
2370 {
2371         struct sp_node *n;
2372         struct mempolicy *newpol;
2373 
2374         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2375         if (!n)
2376                 return NULL;
2377 
2378         newpol = mpol_dup(pol);
2379         if (IS_ERR(newpol)) {
2380                 kmem_cache_free(sn_cache, n);
2381                 return NULL;
2382         }
2383         newpol->flags |= MPOL_F_SHARED;
2384         sp_node_init(n, start, end, newpol);
2385 
2386         return n;
2387 }
2388 
2389 /* Replace a policy range. */
2390 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2391                                  unsigned long end, struct sp_node *new)
2392 {
2393         struct sp_node *n;
2394         struct sp_node *n_new = NULL;
2395         struct mempolicy *mpol_new = NULL;
2396         int ret = 0;
2397 
2398 restart:
2399         spin_lock(&sp->lock);
2400         n = sp_lookup(sp, start, end);
2401         /* Take care of old policies in the same range. */
2402         while (n && n->start < end) {
2403                 struct rb_node *next = rb_next(&n->nd);
2404                 if (n->start >= start) {
2405                         if (n->end <= end)
2406                                 sp_delete(sp, n);
2407                         else
2408                                 n->start = end;
2409                 } else {
2410                         /* Old policy spanning whole new range. */
2411                         if (n->end > end) {
2412                                 if (!n_new)
2413                                         goto alloc_new;
2414 
2415                                 *mpol_new = *n->policy;
2416                                 atomic_set(&mpol_new->refcnt, 1);
2417                                 sp_node_init(n_new, end, n->end, mpol_new);
2418                                 n->end = start;
2419                                 sp_insert(sp, n_new);
2420                                 n_new = NULL;
2421                                 mpol_new = NULL;
2422                                 break;
2423                         } else
2424                                 n->end = start;
2425                 }
2426                 if (!next)
2427                         break;
2428                 n = rb_entry(next, struct sp_node, nd);
2429         }
2430         if (new)
2431                 sp_insert(sp, new);
2432         spin_unlock(&sp->lock);
2433         ret = 0;
2434 
2435 err_out:
2436         if (mpol_new)
2437                 mpol_put(mpol_new);
2438         if (n_new)
2439                 kmem_cache_free(sn_cache, n_new);
2440 
2441         return ret;
2442 
2443 alloc_new:
2444         spin_unlock(&sp->lock);
2445         ret = -ENOMEM;
2446         n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2447         if (!n_new)
2448                 goto err_out;
2449         mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2450         if (!mpol_new)
2451                 goto err_out;
2452         goto restart;
2453 }
2454 
2455 /**
2456  * mpol_shared_policy_init - initialize shared policy for inode
2457  * @sp: pointer to inode shared policy
2458  * @mpol:  struct mempolicy to install
2459  *
2460  * Install non-NULL @mpol in inode's shared policy rb-tree.
2461  * On entry, the current task has a reference on a non-NULL @mpol.
2462  * This must be released on exit.
2463  * This is called at get_inode() time, so we can use GFP_KERNEL.
2464  */
2465 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2466 {
2467         int ret;
2468 
2469         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2470         spin_lock_init(&sp->lock);
2471 
2472         if (mpol) {
2473                 struct vm_area_struct pvma;
2474                 struct mempolicy *new;
2475                 NODEMASK_SCRATCH(scratch);
2476 
2477                 if (!scratch)
2478                         goto put_mpol;
2479                 /* contextualize the tmpfs mount point mempolicy */
2480                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2481                 if (IS_ERR(new))
2482                         goto free_scratch; /* no valid nodemask intersection */
2483 
2484                 task_lock(current);
2485                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2486                 task_unlock(current);
2487                 if (ret)
2488                         goto put_new;
2489 
2490                 /* Create pseudo-vma that contains just the policy */
2491                 memset(&pvma, 0, sizeof(struct vm_area_struct));
2492                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2493                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2494 
2495 put_new:
2496                 mpol_put(new);                  /* drop initial ref */
2497 free_scratch:
2498                 NODEMASK_SCRATCH_FREE(scratch);
2499 put_mpol:
2500                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2501         }
2502 }
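
/*
 * Example (illustrative sketch, not part of the original source): tmpfs is
 * the main user; roughly, inode creation in mm/shmem.c hands the
 * superblock's mount-time mempolicy to this function,
 *
 *	mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo));
 *
 * after which per-range mbind() calls refine the tree through
 * mpol_set_shared_policy() below.
 */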
2503 
2504 int mpol_set_shared_policy(struct shared_policy *info,
2505                         struct vm_area_struct *vma, struct mempolicy *npol)
2506 {
2507         int err;
2508         struct sp_node *new = NULL;
2509         unsigned long sz = vma_pages(vma);
2510 
2511         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2512                  vma->vm_pgoff,
2513                  sz, npol ? npol->mode : -1,
2514                  npol ? npol->flags : -1,
2515                  npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2516 
2517         if (npol) {
2518                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2519                 if (!new)
2520                         return -ENOMEM;
2521         }
2522         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2523         if (err && new)
2524                 sp_free(new);
2525         return err;
2526 }
2527 
2528 /* Free a backing policy store on inode delete. */
2529 void mpol_free_shared_policy(struct shared_policy *p)
2530 {
2531         struct sp_node *n;
2532         struct rb_node *next;
2533 
2534         if (!p->root.rb_node)
2535                 return;
2536         spin_lock(&p->lock);
2537         next = rb_first(&p->root);
2538         while (next) {
2539                 n = rb_entry(next, struct sp_node, nd);
2540                 next = rb_next(&n->nd);
2541                 sp_delete(p, n);
2542         }
2543         spin_unlock(&p->lock);
2544 }
2545 
2546 #ifdef CONFIG_NUMA_BALANCING
2547 static int __initdata numabalancing_override;
2548 
2549 static void __init check_numabalancing_enable(void)
2550 {
2551         bool numabalancing_default = false;
2552 
2553         if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2554                 numabalancing_default = true;
2555 
2556         /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2557         if (numabalancing_override)
2558                 set_numabalancing_state(numabalancing_override == 1);
2559 
2560         if (nr_node_ids > 1 && !numabalancing_override) {
2561                 pr_info("%s automatic NUMA balancing. "
2562                         "Configure with numa_balancing= or the "
2563                         "kernel.numa_balancing sysctl",
2564                         numabalancing_default ? "Enabling" : "Disabling");
2565                 set_numabalancing_state(numabalancing_default);
2566         }
2567 }
2568 
2569 static int __init setup_numabalancing(char *str)
2570 {
2571         int ret = 0;
2572         if (!str)
2573                 goto out;
2574 
2575         if (!strcmp(str, "enable")) {
2576                 numabalancing_override = 1;
2577                 ret = 1;
2578         } else if (!strcmp(str, "disable")) {
2579                 numabalancing_override = -1;
2580                 ret = 1;
2581         }
2582 out:
2583         if (!ret)
2584                 pr_warn("Unable to parse numa_balancing=\n");
2585 
2586         return ret;
2587 }
2588 __setup("numa_balancing=", setup_numabalancing);
2589 #else
2590 static inline void __init check_numabalancing_enable(void)
2591 {
2592 }
2593 #endif /* CONFIG_NUMA_BALANCING */
2594 
2595 /* assumes fs == KERNEL_DS */
2596 void __init numa_policy_init(void)
2597 {
2598         nodemask_t interleave_nodes;
2599         unsigned long largest = 0;
2600         int nid, prefer = 0;
2601 
2602         policy_cache = kmem_cache_create("numa_policy",
2603                                          sizeof(struct mempolicy),
2604                                          0, SLAB_PANIC, NULL);
2605 
2606         sn_cache = kmem_cache_create("shared_policy_node",
2607                                      sizeof(struct sp_node),
2608                                      0, SLAB_PANIC, NULL);
2609 
2610         for_each_node(nid) {
2611                 preferred_node_policy[nid] = (struct mempolicy) {
2612                         .refcnt = ATOMIC_INIT(1),
2613                         .mode = MPOL_PREFERRED,
2614                         .flags = MPOL_F_MOF | MPOL_F_MORON,
2615                         .v = { .preferred_node = nid, },
2616                 };
2617         }
2618 
2619         /*
2620          * Set interleaving policy for system init. Interleaving is only
2621          * enabled across suitably sized nodes (default is >= 16MB), or
2622          * fall back to the largest node if they're all smaller.
2623          */
2624         nodes_clear(interleave_nodes);
2625         for_each_node_state(nid, N_MEMORY) {
2626                 unsigned long total_pages = node_present_pages(nid);
2627 
2628                 /* Preserve the largest node */
2629                 if (largest < total_pages) {
2630                         largest = total_pages;
2631                         prefer = nid;
2632                 }
2633 
2634                 /* Interleave this node? */
2635                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2636                         node_set(nid, interleave_nodes);
2637         }
2638 
2639         /* All too small, use the largest */
2640         if (unlikely(nodes_empty(interleave_nodes)))
2641                 node_set(prefer, interleave_nodes);
2642 
2643         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2644                 pr_err("numa_policy_init: interleaving failed\n");
2645 
2646         check_numabalancing_enable();
2647 }
2648 
2649 /* Reset policy of current process to default */
2650 void numa_default_policy(void)
2651 {
2652         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2653 }
2654 
2655 /*
2656  * Parse and format mempolicy from/to strings
2657  */
2658 
2659 /*
2660  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2661  */
2662 static const char * const policy_modes[] =
2663 {
2664         [MPOL_DEFAULT]    = "default",
2665         [MPOL_PREFERRED]  = "prefer",
2666         [MPOL_BIND]       = "bind",
2667         [MPOL_INTERLEAVE] = "interleave",
2668         [MPOL_LOCAL]      = "local",
2669 };
2670 
2671 
2672 #ifdef CONFIG_TMPFS
2673 /**
2674  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2675  * @str:  string containing mempolicy to parse
2676  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2677  *
2678  * Format of input:
2679  *      <mode>[=<flags>][:<nodelist>]
2680  *
2681  * On success, returns 0, else 1
2682  */
2683 int mpol_parse_str(char *str, struct mempolicy **mpol)
2684 {
2685         struct mempolicy *new = NULL;
2686         unsigned short mode;
2687         unsigned short mode_flags;
2688         nodemask_t nodes;
2689         char *nodelist = strchr(str, ':');
2690         char *flags = strchr(str, '=');
2691         int err = 1;
2692 
2693         if (nodelist) {
2694                 /* NUL-terminate mode or flags string */
2695                 *nodelist++ = '\0';
2696                 if (nodelist_parse(nodelist, nodes))
2697                         goto out;
2698                 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2699                         goto out;
2700         } else
2701                 nodes_clear(nodes);
2702 
2703         if (flags)
2704                 *flags++ = '\0';        /* terminate mode string */
2705 
2706         for (mode = 0; mode < MPOL_MAX; mode++) {
2707                 if (!strcmp(str, policy_modes[mode])) {
2708                         break;
2709                 }
2710         }
2711         if (mode >= MPOL_MAX)
2712                 goto out;
2713 
2714         switch (mode) {
2715         case MPOL_PREFERRED:
2716                 /*
2717                  * Insist on a nodelist of one node only
2718                  */
2719                 if (nodelist) {
2720                         char *rest = nodelist;
2721                         while (isdigit(*rest))
2722                                 rest++;
2723                         if (*rest)
2724                                 goto out;
2725                 }
2726                 break;
2727         case MPOL_INTERLEAVE:
2728                 /*
2729                  * Default to online nodes with memory if no nodelist
2730                  */
2731                 if (!nodelist)
2732                         nodes = node_states[N_MEMORY];
2733                 break;
2734         case MPOL_LOCAL:
2735                 /*
2736                  * Don't allow a nodelist;  mpol_new() checks flags
2737                  */
2738                 if (nodelist)
2739                         goto out;
2740                 mode = MPOL_PREFERRED;
2741                 break;
2742         case MPOL_DEFAULT:
2743                 /*
2744                  * Insist on an empty nodelist
2745                  */
2746                 if (!nodelist)
2747                         err = 0;
2748                 goto out;
2749         case MPOL_BIND:
2750                 /*
2751                  * Insist on a nodelist
2752                  */
2753                 if (!nodelist)
2754                         goto out;
2755         }
2756 
2757         mode_flags = 0;
2758         if (flags) {
2759                 /*
2760                  * Currently, we only support two mutually exclusive
2761                  * mode flags.
2762                  */
2763                 if (!strcmp(flags, "static"))
2764                         mode_flags |= MPOL_F_STATIC_NODES;
2765                 else if (!strcmp(flags, "relative"))
2766                         mode_flags |= MPOL_F_RELATIVE_NODES;
2767                 else
2768                         goto out;
2769         }
2770 
2771         new = mpol_new(mode, mode_flags, &nodes);
2772         if (IS_ERR(new))
2773                 goto out;
2774 
2775         /*
2776          * Save nodes for mpol_to_str() to show the tmpfs mount options
2777          * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2778          */
2779         if (mode != MPOL_PREFERRED)
2780                 new->v.nodes = nodes;
2781         else if (nodelist)
2782                 new->v.preferred_node = first_node(nodes);
2783         else
2784                 new->flags |= MPOL_F_LOCAL;
2785 
2786         /*
2787          * Save nodes for contextualization: this will be used to "clone"
2788          * the mempolicy in a specific context [cpuset] at a later time.
2789          */
2790         new->w.user_nodemask = nodes;
2791 
2792         err = 0;
2793 
2794 out:
2795         /* Restore string for error message */
2796         if (nodelist)
2797                 *--nodelist = ':';
2798         if (flags)
2799                 *--flags = '=';
2800         if (!err)
2801                 *mpol = new;
2802         return err;
2803 }
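
/*
 * Example (illustrative sketch, not part of the original source) of the
 * accepted format: a writable string such as "interleave=static:0-3" parses
 * to MPOL_INTERLEAVE with MPOL_F_STATIC_NODES over nodes 0-3, while "local"
 * becomes MPOL_PREFERRED with MPOL_F_LOCAL set, exactly as the MPOL_LOCAL
 * case above rewrites it.  This is the string tmpfs passes in from its
 * "mpol=" mount option.
 */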
2804 #endif /* CONFIG_TMPFS */
2805 
2806 /**
2807  * mpol_to_str - format a mempolicy structure for printing
2808  * @buffer:  to contain formatted mempolicy string
2809  * @maxlen:  length of @buffer
2810  * @pol:  pointer to mempolicy to be formatted
2811  *
2812  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2813  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2814  * longest flag, "relative", and to display at least a few node ids.
2815  */
2816 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2817 {
2818         char *p = buffer;
2819         nodemask_t nodes = NODE_MASK_NONE;
2820         unsigned short mode = MPOL_DEFAULT;
2821         unsigned short flags = 0;
2822 
2823         if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2824                 mode = pol->mode;
2825                 flags = pol->flags;
2826         }
2827 
2828         switch (mode) {
2829         case MPOL_DEFAULT:
2830                 break;
2831         case MPOL_PREFERRED:
2832                 if (flags & MPOL_F_LOCAL)
2833                         mode = MPOL_LOCAL;
2834                 else
2835                         node_set(pol->v.preferred_node, nodes);
2836                 break;
2837         case MPOL_BIND:
2838         case MPOL_INTERLEAVE:
2839                 nodes = pol->v.nodes;
2840                 break;
2841         default:
2842                 WARN_ON_ONCE(1);
2843                 snprintf(p, maxlen, "unknown");
2844                 return;
2845         }
2846 
2847         p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2848 
2849         if (flags & MPOL_MODE_FLAGS) {
2850                 p += snprintf(p, buffer + maxlen - p, "=");
2851 
2852                 /*
2853                  * Currently, the only defined flags are mutually exclusive
2854                  */
2855                 if (flags & MPOL_F_STATIC_NODES)
2856                         p += snprintf(p, buffer + maxlen - p, "static");
2857                 else if (flags & MPOL_F_RELATIVE_NODES)
2858                         p += snprintf(p, buffer + maxlen - p, "relative");
2859         }
2860 
2861         if (!nodes_empty(nodes)) {
2862                 p += snprintf(p, buffer + maxlen - p, ":");
2863                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2864         }
2865 }
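
/*
 * Example (illustrative sketch, not part of the original source):
 *
 *	char buf[64];
 *
 *	mpol_to_str(buf, sizeof(buf), pol);
 *
 * produces strings such as "default", "local", "prefer:1" or
 * "interleave=relative:0-3", i.e. the same format mpol_parse_str() accepts,
 * which is how tmpfs mount options round-trip through /proc/mounts.
 */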
2866 
