Linux/mm/mempolicy.c

  1 /*
  2  * Simple NUMA memory policy for the Linux kernel.
  3  *
  4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
  5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
  6  * Subject to the GNU Public License, version 2.
  7  *
  8  * NUMA policy allows the user to give hints in which node(s) memory should
  9  * be allocated.
 10  *
 11  * Support four policies per VMA and per process:
 12  *
 13  * The VMA policy has priority over the process policy for a page fault.
 14  *
 15  * interleave     Allocate memory interleaved over a set of nodes,
 16  *                with normal fallback if it fails.
 17  *                For VMA based allocations this interleaves based on the
 18  *                offset into the backing object or offset into the mapping
 19  *                for anonymous memory. For process policy a per-process counter
 20  *                is used.
 21  *
 22  * bind           Only allocate memory on a specific set of nodes,
 23  *                no fallback.
 24  *                FIXME: memory is allocated starting with the first node
 25  *                to the last. It would be better if bind would truly restrict
 26  *                the allocation to memory nodes instead
 27  *
 28  * preferred      Try a specific node first before normal fallback.
 29  *                As a special case NUMA_NO_NODE here means do the allocation
 30  *                on the local CPU. This is normally identical to default,
 31  *                but useful to set in a VMA when you have a non default
 32  *                process policy.
 33  *
 34  * default        Allocate on the local node first, or when on a VMA
 35  *                use the process policy. This is what Linux always did
 36  *                in a NUMA aware kernel and still does by, ahem, default.
 37  *
 38  * The process policy is applied for most non-interrupt memory allocations
 39  * in that process' context. Interrupts ignore the policies and always
 40  * try to allocate on the local CPU. The VMA policy is only applied for memory
 41  * allocations for a VMA in the VM.
 42  *
 43  * Currently there are a few corner cases in swapping where the policy
 44  * is not applied, but the majority should be handled. When process policy
 45  * is used it is not remembered over swap outs/swap ins.
 46  *
 47  * Only the highest zone in the zone hierarchy gets policied. Allocations
 48  * requesting a lower zone just use default policy. This implies that
 49  * on systems with highmem, kernel lowmem allocations don't get policied.
 50  * Same with GFP_DMA allocations.
 51  *
 52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 53  * all users and remembered even when nobody has memory mapped.
 54  */
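
/*
 * A minimal userspace sketch of how the policies above are typically
 * requested, assuming the <numaif.h> syscall wrappers from libnuma
 * (link with -lnuma).  Kept under "#if 0" since it is not kernel code.
 */
#if 0
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        /* Interleave this task's future allocations across nodes 0 and 1. */
        unsigned long nodes = (1UL << 0) | (1UL << 1);
        /* maxnode counts bits; get_nodes() below uses maxnode - 1 of them. */
        unsigned long maxnode = 8 * sizeof(nodes) + 1;
        size_t len = 4UL << 20;
        void *p;

        if (set_mempolicy(MPOL_INTERLEAVE, &nodes, maxnode))
                perror("set_mempolicy");

        /* Give one mapping a stricter per-VMA policy: bind it to node 0. */
        p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p != MAP_FAILED) {
                unsigned long node0 = 1UL << 0;

                if (mbind(p, len, MPOL_BIND, &node0, maxnode, 0))
                        perror("mbind");
                memset(p, 0, len);      /* faults allocate per the VMA policy */
        }
        return 0;
}
#endif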
 55 
 56 /* Notebook:
 57    fix mmap readahead to honour policy and enable policy for any page cache
 58    object
 59    statistics for bigpages
 60    global policy for page cache? currently it uses process policy. Requires
 61    first item above.
 62    handle mremap for shared memory (currently ignored for the policy)
 63    grows down?
 64    make bind policy root only? It can trigger oom much faster and the
 65    kernel is not always graceful about that.
 66 */
 67 
 68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 69 
 70 #include <linux/mempolicy.h>
 71 #include <linux/mm.h>
 72 #include <linux/highmem.h>
 73 #include <linux/hugetlb.h>
 74 #include <linux/kernel.h>
 75 #include <linux/sched.h>
 76 #include <linux/nodemask.h>
 77 #include <linux/cpuset.h>
 78 #include <linux/slab.h>
 79 #include <linux/string.h>
 80 #include <linux/export.h>
 81 #include <linux/nsproxy.h>
 82 #include <linux/interrupt.h>
 83 #include <linux/init.h>
 84 #include <linux/compat.h>
 85 #include <linux/swap.h>
 86 #include <linux/seq_file.h>
 87 #include <linux/proc_fs.h>
 88 #include <linux/migrate.h>
 89 #include <linux/ksm.h>
 90 #include <linux/rmap.h>
 91 #include <linux/security.h>
 92 #include <linux/syscalls.h>
 93 #include <linux/ctype.h>
 94 #include <linux/mm_inline.h>
 95 #include <linux/mmu_notifier.h>
 96 #include <linux/printk.h>
 97 
 98 #include <asm/tlbflush.h>
 99 #include <asm/uaccess.h>
100 #include <linux/random.h>
101 
102 #include "internal.h"
103 
104 /* Internal flags */
105 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
106 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
107 
108 static struct kmem_cache *policy_cache;
109 static struct kmem_cache *sn_cache;
110 
111 /* Highest zone. A specific allocation for a zone below that is not
112    policied. */
113 enum zone_type policy_zone = 0;
114 
115 /*
116  * run-time system-wide default policy => local allocation
117  */
118 static struct mempolicy default_policy = {
119         .refcnt = ATOMIC_INIT(1), /* never free it */
120         .mode = MPOL_PREFERRED,
121         .flags = MPOL_F_LOCAL,
122 };
123 
124 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
125 
126 static struct mempolicy *get_task_policy(struct task_struct *p)
127 {
128         struct mempolicy *pol = p->mempolicy;
129 
130         if (!pol) {
131                 int node = numa_node_id();
132 
133                 if (node != NUMA_NO_NODE) {
134                         pol = &preferred_node_policy[node];
135                         /*
136                          * preferred_node_policy is not initialised early in
137                          * boot
138                          */
139                         if (!pol->mode)
140                                 pol = NULL;
141                 }
142         }
143 
144         return pol;
145 }
146 
147 static const struct mempolicy_operations {
148         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
149         /*
150          * If the read-side task has no lock to protect task->mempolicy, the
151          * write-side task will rebind task->mempolicy in two steps. The first
152          * step sets all the newly allowed nodes, and the second step clears
153          * all the disallowed nodes. This way a reader can never find itself
154          * with an empty nodemask and no node to allocate from.
155          * If we have a lock to protect task->mempolicy on the read side, we
156          * can rebind directly in a single step.
157          *
158          * step:
159          *      MPOL_REBIND_ONCE  - do the rebind work in one pass
160          *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
161          *      MPOL_REBIND_STEP2 - clear all the disallowed nodes
162          */
163         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
164                         enum mpol_rebind_step step);
165 } mpol_ops[MPOL_MAX];
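
/*
 * A sketch of how the two rebind steps are expected to be driven (the
 * actual caller lives on the cpuset side, outside this file):
 *
 *      mpol_rebind_task(tsk, &newmems, MPOL_REBIND_STEP1);
 *      ... publish the new mems_allowed to the task ...
 *      mpol_rebind_task(tsk, &newmems, MPOL_REBIND_STEP2);
 *
 * Between the two steps the policy holds the union of the old and new
 * nodes, so a lockless reader never observes an empty nodemask.
 */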
166 
167 /* Check that the nodemask contains at least one populated zone */
168 static int is_valid_nodemask(const nodemask_t *nodemask)
169 {
170         return nodes_intersects(*nodemask, node_states[N_MEMORY]);
171 }
172 
173 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
174 {
175         return pol->flags & MPOL_MODE_FLAGS;
176 }
177 
178 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
179                                    const nodemask_t *rel)
180 {
181         nodemask_t tmp;
182         nodes_fold(tmp, *orig, nodes_weight(*rel));
183         nodes_onto(*ret, tmp, *rel);
184 }
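
/*
 * Worked example: with *orig = {0,2} and *rel = {4,5,6}, nodes_fold()
 * first folds *orig modulo nodes_weight(*rel) == 3, leaving {0,2} (a bit
 * such as 5 would fold to 5 % 3 == 2), and nodes_onto() then maps those
 * relative positions onto the set bits of *rel: position 0 -> node 4,
 * position 2 -> node 6, so *ret = {4,6}.
 */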
185 
186 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
187 {
188         if (nodes_empty(*nodes))
189                 return -EINVAL;
190         pol->v.nodes = *nodes;
191         return 0;
192 }
193 
194 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
195 {
196         if (!nodes)
197                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
198         else if (nodes_empty(*nodes))
199                 return -EINVAL;                 /*  no allowed nodes */
200         else
201                 pol->v.preferred_node = first_node(*nodes);
202         return 0;
203 }
204 
205 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
206 {
207         if (!is_valid_nodemask(nodes))
208                 return -EINVAL;
209         pol->v.nodes = *nodes;
210         return 0;
211 }
212 
213 /*
214  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
215  * any, for the new policy.  mpol_new() has already validated the nodes
216  * parameter with respect to the policy mode and flags.  But, we need to
217  * handle an empty nodemask with MPOL_PREFERRED here.
218  *
219  * Must be called holding task's alloc_lock to protect task's mems_allowed
220  * and mempolicy.  May also be called holding the mmap_semaphore for write.
221  */
222 static int mpol_set_nodemask(struct mempolicy *pol,
223                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
224 {
225         int ret;
226 
227         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
228         if (pol == NULL)
229                 return 0;
230         /* Check N_MEMORY */
231         nodes_and(nsc->mask1,
232                   cpuset_current_mems_allowed, node_states[N_MEMORY]);
233 
234         VM_BUG_ON(!nodes);
235         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
236                 nodes = NULL;   /* explicit local allocation */
237         else {
238                 if (pol->flags & MPOL_F_RELATIVE_NODES)
239                         mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
240                 else
241                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
242 
243                 if (mpol_store_user_nodemask(pol))
244                         pol->w.user_nodemask = *nodes;
245                 else
246                         pol->w.cpuset_mems_allowed =
247                                                 cpuset_current_mems_allowed;
248         }
249 
250         if (nodes)
251                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
252         else
253                 ret = mpol_ops[pol->mode].create(pol, NULL);
254         return ret;
255 }
256 
257 /*
258  * This function just creates a new policy, does some checks and simple
259  * initialization. You must invoke mpol_set_nodemask() to set nodes.
260  */
261 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
262                                   nodemask_t *nodes)
263 {
264         struct mempolicy *policy;
265 
266         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
267                  mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
268 
269         if (mode == MPOL_DEFAULT) {
270                 if (nodes && !nodes_empty(*nodes))
271                         return ERR_PTR(-EINVAL);
272                 return NULL;
273         }
274         VM_BUG_ON(!nodes);
275 
276         /*
277          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
278          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
279          * All other modes require a valid pointer to a non-empty nodemask.
280          */
281         if (mode == MPOL_PREFERRED) {
282                 if (nodes_empty(*nodes)) {
283                         if (((flags & MPOL_F_STATIC_NODES) ||
284                              (flags & MPOL_F_RELATIVE_NODES)))
285                                 return ERR_PTR(-EINVAL);
286                 }
287         } else if (mode == MPOL_LOCAL) {
288                 if (!nodes_empty(*nodes))
289                         return ERR_PTR(-EINVAL);
290                 mode = MPOL_PREFERRED;
291         } else if (nodes_empty(*nodes))
292                 return ERR_PTR(-EINVAL);
293         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
294         if (!policy)
295                 return ERR_PTR(-ENOMEM);
296         atomic_set(&policy->refcnt, 1);
297         policy->mode = mode;
298         policy->flags = flags;
299 
300         return policy;
301 }
302 
303 /* Slow path of a mpol destructor. */
304 void __mpol_put(struct mempolicy *p)
305 {
306         if (!atomic_dec_and_test(&p->refcnt))
307                 return;
308         kmem_cache_free(policy_cache, p);
309 }
310 
311 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
312                                 enum mpol_rebind_step step)
313 {
314 }
315 
316 /*
317  * step:
318  *      MPOL_REBIND_ONCE  - do the rebind work in one pass
319  *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
320  *      MPOL_REBIND_STEP2 - clear all the disallowed nodes
321  */
322 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
323                                  enum mpol_rebind_step step)
324 {
325         nodemask_t tmp;
326 
327         if (pol->flags & MPOL_F_STATIC_NODES)
328                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
329         else if (pol->flags & MPOL_F_RELATIVE_NODES)
330                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
331         else {
332                 /*
333                  * if step == 1, we use ->w.cpuset_mems_allowed to cache the
334                  * result
335                  */
336                 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
337                         nodes_remap(tmp, pol->v.nodes,
338                                         pol->w.cpuset_mems_allowed, *nodes);
339                         pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
340                 } else if (step == MPOL_REBIND_STEP2) {
341                         tmp = pol->w.cpuset_mems_allowed;
342                         pol->w.cpuset_mems_allowed = *nodes;
343                 } else
344                         BUG();
345         }
346 
347         if (nodes_empty(tmp))
348                 tmp = *nodes;
349 
350         if (step == MPOL_REBIND_STEP1)
351                 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
352         else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
353                 pol->v.nodes = tmp;
354         else
355                 BUG();
356 
357         if (!node_isset(current->il_next, tmp)) {
358                 current->il_next = next_node(current->il_next, tmp);
359                 if (current->il_next >= MAX_NUMNODES)
360                         current->il_next = first_node(tmp);
361                 if (current->il_next >= MAX_NUMNODES)
362                         current->il_next = numa_node_id();
363         }
364 }
365 
366 static void mpol_rebind_preferred(struct mempolicy *pol,
367                                   const nodemask_t *nodes,
368                                   enum mpol_rebind_step step)
369 {
370         nodemask_t tmp;
371 
372         if (pol->flags & MPOL_F_STATIC_NODES) {
373                 int node = first_node(pol->w.user_nodemask);
374 
375                 if (node_isset(node, *nodes)) {
376                         pol->v.preferred_node = node;
377                         pol->flags &= ~MPOL_F_LOCAL;
378                 } else
379                         pol->flags |= MPOL_F_LOCAL;
380         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
381                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
382                 pol->v.preferred_node = first_node(tmp);
383         } else if (!(pol->flags & MPOL_F_LOCAL)) {
384                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
385                                                    pol->w.cpuset_mems_allowed,
386                                                    *nodes);
387                 pol->w.cpuset_mems_allowed = *nodes;
388         }
389 }
390 
391 /*
392  * mpol_rebind_policy - Migrate a policy to a different set of nodes
393  *
394  * If the read-side task has no lock to protect task->mempolicy, the
395  * write-side task will rebind task->mempolicy in two steps. The first
396  * step sets all the newly allowed nodes, and the second step clears
397  * all the disallowed nodes. This way a reader can never find itself
398  * with an empty nodemask and no node to allocate from.
399  * If we have a lock to protect task->mempolicy on the read side, we
400  * can rebind directly in a single step.
401  *
402  * step:
403  *      MPOL_REBIND_ONCE  - do the rebind work in one pass
404  *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
405  *      MPOL_REBIND_STEP2 - clear all the disallowed nodes
406  */
407 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
408                                 enum mpol_rebind_step step)
409 {
410         if (!pol)
411                 return;
412         if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
413             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
414                 return;
415 
416         if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
417                 return;
418 
419         if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
420                 BUG();
421 
422         if (step == MPOL_REBIND_STEP1)
423                 pol->flags |= MPOL_F_REBINDING;
424         else if (step == MPOL_REBIND_STEP2)
425                 pol->flags &= ~MPOL_F_REBINDING;
426         else if (step >= MPOL_REBIND_NSTEP)
427                 BUG();
428 
429         mpol_ops[pol->mode].rebind(pol, newmask, step);
430 }
431 
432 /*
433  * Wrapper for mpol_rebind_policy() that just requires the task
434  * pointer, and updates the task's mempolicy.
435  *
436  * Called with task's alloc_lock held.
437  */
438 
439 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
440                         enum mpol_rebind_step step)
441 {
442         mpol_rebind_policy(tsk->mempolicy, new, step);
443 }
444 
445 /*
446  * Rebind each vma in mm to new nodemask.
447  *
448  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
449  */
450 
451 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
452 {
453         struct vm_area_struct *vma;
454 
455         down_write(&mm->mmap_sem);
456         for (vma = mm->mmap; vma; vma = vma->vm_next)
457                 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
458         up_write(&mm->mmap_sem);
459 }
460 
461 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
462         [MPOL_DEFAULT] = {
463                 .rebind = mpol_rebind_default,
464         },
465         [MPOL_INTERLEAVE] = {
466                 .create = mpol_new_interleave,
467                 .rebind = mpol_rebind_nodemask,
468         },
469         [MPOL_PREFERRED] = {
470                 .create = mpol_new_preferred,
471                 .rebind = mpol_rebind_preferred,
472         },
473         [MPOL_BIND] = {
474                 .create = mpol_new_bind,
475                 .rebind = mpol_rebind_nodemask,
476         },
477 };
478 
479 static void migrate_page_add(struct page *page, struct list_head *pagelist,
480                                 unsigned long flags);
481 
482 /*
483  * Scan through the pages, checking whether they meet certain conditions,
484  * and move them to the pagelist if they do.
485  */
486 static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
487                 unsigned long addr, unsigned long end,
488                 const nodemask_t *nodes, unsigned long flags,
489                 void *private)
490 {
491         pte_t *orig_pte;
492         pte_t *pte;
493         spinlock_t *ptl;
494 
495         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
496         do {
497                 struct page *page;
498                 int nid;
499 
500                 if (!pte_present(*pte))
501                         continue;
502                 page = vm_normal_page(vma, addr, *pte);
503                 if (!page)
504                         continue;
505                 /*
506                  * vm_normal_page() filters out zero pages, but there might
507                  * still be PageReserved pages to skip, perhaps in a VDSO.
508                  */
509                 if (PageReserved(page))
510                         continue;
511                 nid = page_to_nid(page);
512                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
513                         continue;
514 
515                 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
516                         migrate_page_add(page, private, flags);
517                 else
518                         break;
519         } while (pte++, addr += PAGE_SIZE, addr != end);
520         pte_unmap_unlock(orig_pte, ptl);
521         return addr != end;
522 }
523 
524 static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
525                 pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
526                                     void *private)
527 {
528 #ifdef CONFIG_HUGETLB_PAGE
529         int nid;
530         struct page *page;
531         spinlock_t *ptl;
532         pte_t entry;
533 
534         ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
535         entry = huge_ptep_get((pte_t *)pmd);
536         if (!pte_present(entry))
537                 goto unlock;
538         page = pte_page(entry);
539         nid = page_to_nid(page);
540         if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
541                 goto unlock;
542         /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
543         if (flags & (MPOL_MF_MOVE_ALL) ||
544             (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
545                 isolate_huge_page(page, private);
546 unlock:
547         spin_unlock(ptl);
548 #else
549         BUG();
550 #endif
551 }
552 
553 static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
554                 unsigned long addr, unsigned long end,
555                 const nodemask_t *nodes, unsigned long flags,
556                 void *private)
557 {
558         pmd_t *pmd;
559         unsigned long next;
560 
561         pmd = pmd_offset(pud, addr);
562         do {
563                 next = pmd_addr_end(addr, end);
564                 if (!pmd_present(*pmd))
565                         continue;
566                 if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
567                         queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
568                                                 flags, private);
569                         continue;
570                 }
571                 split_huge_page_pmd(vma, addr, pmd);
572                 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
573                         continue;
574                 if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
575                                     flags, private))
576                         return -EIO;
577         } while (pmd++, addr = next, addr != end);
578         return 0;
579 }
580 
581 static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
582                 unsigned long addr, unsigned long end,
583                 const nodemask_t *nodes, unsigned long flags,
584                 void *private)
585 {
586         pud_t *pud;
587         unsigned long next;
588 
589         pud = pud_offset(pgd, addr);
590         do {
591                 next = pud_addr_end(addr, end);
592                 if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
593                         continue;
594                 if (pud_none_or_clear_bad(pud))
595                         continue;
596                 if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
597                                     flags, private))
598                         return -EIO;
599         } while (pud++, addr = next, addr != end);
600         return 0;
601 }
602 
603 static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
604                 unsigned long addr, unsigned long end,
605                 const nodemask_t *nodes, unsigned long flags,
606                 void *private)
607 {
608         pgd_t *pgd;
609         unsigned long next;
610 
611         pgd = pgd_offset(vma->vm_mm, addr);
612         do {
613                 next = pgd_addr_end(addr, end);
614                 if (pgd_none_or_clear_bad(pgd))
615                         continue;
616                 if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
617                                     flags, private))
618                         return -EIO;
619         } while (pgd++, addr = next, addr != end);
620         return 0;
621 }
622 
623 #ifdef CONFIG_NUMA_BALANCING
624 /*
625  * This is used to mark a range of virtual addresses as inaccessible.
626  * The protections are later cleared by NUMA hinting faults. Depending on
627  * these faults, pages may be migrated for better NUMA placement.
628  *
629  * This assumes that NUMA faults are handled using PROT_NONE. If
630  * an architecture makes a different choice, it will need further
631  * changes to the core.
632  */
633 unsigned long change_prot_numa(struct vm_area_struct *vma,
634                         unsigned long addr, unsigned long end)
635 {
636         int nr_updated;
637 
638         nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
639         if (nr_updated)
640                 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
641 
642         return nr_updated;
643 }
644 #else
645 static unsigned long change_prot_numa(struct vm_area_struct *vma,
646                         unsigned long addr, unsigned long end)
647 {
648         return 0;
649 }
650 #endif /* CONFIG_NUMA_BALANCING */
651 
652 /*
653  * Walk through page tables and collect pages to be migrated.
654  *
655  * If pages found in a given range are on a set of nodes (determined by
656  * @nodes and @flags), they are isolated and queued to the pagelist
657  * passed via @private.
658  */
659 static int
660 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
661                 const nodemask_t *nodes, unsigned long flags, void *private)
662 {
663         int err = 0;
664         struct vm_area_struct *vma, *prev;
665 
666         vma = find_vma(mm, start);
667         if (!vma)
668                 return -EFAULT;
669         prev = NULL;
670         for (; vma && vma->vm_start < end; vma = vma->vm_next) {
671                 unsigned long endvma = vma->vm_end;
672 
673                 if (endvma > end)
674                         endvma = end;
675                 if (vma->vm_start > start)
676                         start = vma->vm_start;
677 
678                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
679                         if (!vma->vm_next && vma->vm_end < end)
680                                 return -EFAULT;
681                         if (prev && prev->vm_end < vma->vm_start)
682                                 return -EFAULT;
683                 }
684 
685                 if (flags & MPOL_MF_LAZY) {
686                         change_prot_numa(vma, start, endvma);
687                         goto next;
688                 }
689 
690                 if ((flags & MPOL_MF_STRICT) ||
691                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
692                       vma_migratable(vma))) {
693 
694                         err = queue_pages_pgd_range(vma, start, endvma, nodes,
695                                                 flags, private);
696                         if (err)
697                                 break;
698                 }
699 next:
700                 prev = vma;
701         }
702         return err;
703 }
704 
705 /*
706  * Apply policy to a single VMA
707  * This must be called with the mmap_sem held for writing.
708  */
709 static int vma_replace_policy(struct vm_area_struct *vma,
710                                                 struct mempolicy *pol)
711 {
712         int err;
713         struct mempolicy *old;
714         struct mempolicy *new;
715 
716         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
717                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
718                  vma->vm_ops, vma->vm_file,
719                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
720 
721         new = mpol_dup(pol);
722         if (IS_ERR(new))
723                 return PTR_ERR(new);
724 
725         if (vma->vm_ops && vma->vm_ops->set_policy) {
726                 err = vma->vm_ops->set_policy(vma, new);
727                 if (err)
728                         goto err_out;
729         }
730 
731         old = vma->vm_policy;
732         vma->vm_policy = new; /* protected by mmap_sem */
733         mpol_put(old);
734 
735         return 0;
736  err_out:
737         mpol_put(new);
738         return err;
739 }
740 
741 /* Step 2: apply policy to a range and do splits. */
742 static int mbind_range(struct mm_struct *mm, unsigned long start,
743                        unsigned long end, struct mempolicy *new_pol)
744 {
745         struct vm_area_struct *next;
746         struct vm_area_struct *prev;
747         struct vm_area_struct *vma;
748         int err = 0;
749         pgoff_t pgoff;
750         unsigned long vmstart;
751         unsigned long vmend;
752 
753         vma = find_vma(mm, start);
754         if (!vma || vma->vm_start > start)
755                 return -EFAULT;
756 
757         prev = vma->vm_prev;
758         if (start > vma->vm_start)
759                 prev = vma;
760 
761         for (; vma && vma->vm_start < end; prev = vma, vma = next) {
762                 next = vma->vm_next;
763                 vmstart = max(start, vma->vm_start);
764                 vmend   = min(end, vma->vm_end);
765 
766                 if (mpol_equal(vma_policy(vma), new_pol))
767                         continue;
768 
769                 pgoff = vma->vm_pgoff +
770                         ((vmstart - vma->vm_start) >> PAGE_SHIFT);
771                 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
772                                   vma->anon_vma, vma->vm_file, pgoff,
773                                   new_pol);
774                 if (prev) {
775                         vma = prev;
776                         next = vma->vm_next;
777                         if (mpol_equal(vma_policy(vma), new_pol))
778                                 continue;
779                         /* vma_merge() joined vma && vma->next, case 8 */
780                         goto replace;
781                 }
782                 if (vma->vm_start != vmstart) {
783                         err = split_vma(vma->vm_mm, vma, vmstart, 1);
784                         if (err)
785                                 goto out;
786                 }
787                 if (vma->vm_end != vmend) {
788                         err = split_vma(vma->vm_mm, vma, vmend, 0);
789                         if (err)
790                                 goto out;
791                 }
792  replace:
793                 err = vma_replace_policy(vma, new_pol);
794                 if (err)
795                         goto out;
796         }
797 
798  out:
799         return err;
800 }
801 
802 /* Set the process memory policy */
803 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
804                              nodemask_t *nodes)
805 {
806         struct mempolicy *new, *old;
807         struct mm_struct *mm = current->mm;
808         NODEMASK_SCRATCH(scratch);
809         int ret;
810 
811         if (!scratch)
812                 return -ENOMEM;
813 
814         new = mpol_new(mode, flags, nodes);
815         if (IS_ERR(new)) {
816                 ret = PTR_ERR(new);
817                 goto out;
818         }
819         /*
820          * prevent changing our mempolicy while show_numa_maps()
821          * is using it.
822          * Note:  do_set_mempolicy() can be called at init time
823          * with no 'mm'.
824          */
825         if (mm)
826                 down_write(&mm->mmap_sem);
827         task_lock(current);
828         ret = mpol_set_nodemask(new, nodes, scratch);
829         if (ret) {
830                 task_unlock(current);
831                 if (mm)
832                         up_write(&mm->mmap_sem);
833                 mpol_put(new);
834                 goto out;
835         }
836         old = current->mempolicy;
837         current->mempolicy = new;
838         if (new && new->mode == MPOL_INTERLEAVE &&
839             nodes_weight(new->v.nodes))
840                 current->il_next = first_node(new->v.nodes);
841         task_unlock(current);
842         if (mm)
843                 up_write(&mm->mmap_sem);
844 
845         mpol_put(old);
846         ret = 0;
847 out:
848         NODEMASK_SCRATCH_FREE(scratch);
849         return ret;
850 }
851 
852 /*
853  * Return nodemask for policy for get_mempolicy() query
854  *
855  * Called with task's alloc_lock held
856  */
857 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
858 {
859         nodes_clear(*nodes);
860         if (p == &default_policy)
861                 return;
862 
863         switch (p->mode) {
864         case MPOL_BIND:
865                 /* Fall through */
866         case MPOL_INTERLEAVE:
867                 *nodes = p->v.nodes;
868                 break;
869         case MPOL_PREFERRED:
870                 if (!(p->flags & MPOL_F_LOCAL))
871                         node_set(p->v.preferred_node, *nodes);
872                 /* else return empty node mask for local allocation */
873                 break;
874         default:
875                 BUG();
876         }
877 }
878 
879 static int lookup_node(struct mm_struct *mm, unsigned long addr)
880 {
881         struct page *p;
882         int err;
883 
884         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
885         if (err >= 0) {
886                 err = page_to_nid(p);
887                 put_page(p);
888         }
889         return err;
890 }
891 
892 /* Retrieve NUMA policy */
893 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
894                              unsigned long addr, unsigned long flags)
895 {
896         int err;
897         struct mm_struct *mm = current->mm;
898         struct vm_area_struct *vma = NULL;
899         struct mempolicy *pol = current->mempolicy;
900 
901         if (flags &
902                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
903                 return -EINVAL;
904 
905         if (flags & MPOL_F_MEMS_ALLOWED) {
906                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
907                         return -EINVAL;
908                 *policy = 0;    /* just so it's initialized */
909                 task_lock(current);
910                 *nmask  = cpuset_current_mems_allowed;
911                 task_unlock(current);
912                 return 0;
913         }
914 
915         if (flags & MPOL_F_ADDR) {
916                 /*
917                  * Do NOT fall back to task policy if the
918                  * vma/shared policy at addr is NULL.  We
919                  * want to return MPOL_DEFAULT in this case.
920                  */
921                 down_read(&mm->mmap_sem);
922                 vma = find_vma_intersection(mm, addr, addr+1);
923                 if (!vma) {
924                         up_read(&mm->mmap_sem);
925                         return -EFAULT;
926                 }
927                 if (vma->vm_ops && vma->vm_ops->get_policy)
928                         pol = vma->vm_ops->get_policy(vma, addr);
929                 else
930                         pol = vma->vm_policy;
931         } else if (addr)
932                 return -EINVAL;
933 
934         if (!pol)
935                 pol = &default_policy;  /* indicates default behavior */
936 
937         if (flags & MPOL_F_NODE) {
938                 if (flags & MPOL_F_ADDR) {
939                         err = lookup_node(mm, addr);
940                         if (err < 0)
941                                 goto out;
942                         *policy = err;
943                 } else if (pol == current->mempolicy &&
944                                 pol->mode == MPOL_INTERLEAVE) {
945                         *policy = current->il_next;
946                 } else {
947                         err = -EINVAL;
948                         goto out;
949                 }
950         } else {
951                 *policy = pol == &default_policy ? MPOL_DEFAULT :
952                                                 pol->mode;
953                 /*
954                  * Internal mempolicy flags must be masked off before exposing
955                  * the policy to userspace.
956                  */
957                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
958         }
959 
960         err = 0;
961         if (nmask) {
962                 if (mpol_store_user_nodemask(pol)) {
963                         *nmask = pol->w.user_nodemask;
964                 } else {
965                         task_lock(current);
966                         get_policy_nodemask(pol, nmask);
967                         task_unlock(current);
968                 }
969         }
970 
971  out:
972         mpol_cond_put(pol);
973         if (vma)
974                 up_read(&current->mm->mmap_sem);
975         return err;
976 }
977 
978 #ifdef CONFIG_MIGRATION
979 /*
980  * page migration
981  */
982 static void migrate_page_add(struct page *page, struct list_head *pagelist,
983                                 unsigned long flags)
984 {
985         /*
986          * Avoid migrating a page that is shared with others.
987          */
988         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
989                 if (!isolate_lru_page(page)) {
990                         list_add_tail(&page->lru, pagelist);
991                         inc_zone_page_state(page, NR_ISOLATED_ANON +
992                                             page_is_file_cache(page));
993                 }
994         }
995 }
996 
997 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
998 {
999         if (PageHuge(page))
1000                 return alloc_huge_page_node(page_hstate(compound_head(page)),
1001                                         node);
1002         else
1003                 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
1004 }
1005 
1006 /*
1007  * Migrate pages from one node to a target node.
1008  * Returns error or the number of pages not migrated.
1009  */
1010 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1011                            int flags)
1012 {
1013         nodemask_t nmask;
1014         LIST_HEAD(pagelist);
1015         int err = 0;
1016 
1017         nodes_clear(nmask);
1018         node_set(source, nmask);
1019 
1020         /*
1021          * This does not "check" the range but isolates all pages that
1022          * need migration.  Between passing in the full user address
1023          * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1024          */
1025         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1026         queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1027                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1028 
1029         if (!list_empty(&pagelist)) {
1030                 err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1031                                         MIGRATE_SYNC, MR_SYSCALL);
1032                 if (err)
1033                         putback_movable_pages(&pagelist);
1034         }
1035 
1036         return err;
1037 }
1038 
1039 /*
1040  * Move pages between the two nodesets so as to preserve the physical
1041  * layout as much as possible.
1042  *
1043  * Returns the number of pages that could not be moved.
1044  */
1045 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1046                      const nodemask_t *to, int flags)
1047 {
1048         int busy = 0;
1049         int err;
1050         nodemask_t tmp;
1051 
1052         err = migrate_prep();
1053         if (err)
1054                 return err;
1055 
1056         down_read(&mm->mmap_sem);
1057 
1058         err = migrate_vmas(mm, from, to, flags);
1059         if (err)
1060                 goto out;
1061 
1062         /*
1063          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1064          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1065          * bit in 'tmp', and return that <source, dest> pair for migration.
1066          * The pair of nodemasks 'to' and 'from' define the map.
1067          *
1068          * If no pair of bits is found that way, fall back to picking some
1069          * pair of 'source' and 'dest' bits that are not the same.  If the
1070          * 'source' and 'dest' bits are the same, this represents a node
1071          * that will be migrating to itself, so no pages need move.
1072          *
1073          * If no bits are left in 'tmp', or if all remaining bits left
1074          * in 'tmp' correspond to the same bit in 'to', return false
1075          * (nothing left to migrate).
1076          *
1077          * This lets us pick a pair of nodes to migrate between, such that
1078          * if possible the dest node is not already occupied by some other
1079          * source node, minimizing the risk of overloading the memory on a
1080          * node, which would happen if we migrated incoming memory to a node
1081          * before migrating outgoing memory off that same node.
1082          *
1083          * A single scan of tmp is sufficient.  As we go, we remember the
1084          * most recent <s, d> pair that moved (s != d).  If we find a pair
1085          * that not only moved, but what's better, moved to an empty slot
1086          * (d is not set in tmp), then we break out then, with that pair.
1087          * Otherwise, when we finish scanning tmp, we at least have the
1088          * most recent <s, d> pair that moved.  If we get all the way through
1089          * the scan of tmp without finding any node that moved, much less
1090          * moved to an empty node, then there is nothing left worth migrating.
1091          */
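
        /*
         * Worked example: with *from = {0,1} and *to = {1,2}, tmp starts
         * as {0,1}.  The first scan remembers <0,1> but keeps going since
         * dest 1 is still set in tmp, then finds <1,2> whose dest is not
         * in tmp and breaks.  Node 1 is therefore drained to node 2 before
         * node 0's pages are moved onto node 1; the second pass then
         * migrates node 0 to the (now emptier) node 1.
         */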
1092 
1093         tmp = *from;
1094         while (!nodes_empty(tmp)) {
1095                 int s, d;
1096                 int source = NUMA_NO_NODE;
1097                 int dest = 0;
1098 
1099                 for_each_node_mask(s, tmp) {
1100 
1101                         /*
1102                          * do_migrate_pages() tries to maintain the relative
1103                          * node relationship of the pages established between
1104                          * threads and memory areas.
1105                          *
1106                          * However, if the number of source nodes is not equal
1107                          * to the number of destination nodes, we cannot preserve
1108                          * this relative node relationship.  In that case, skip
1109                          * copying memory from a node that is in the destination
1110                          * mask.
1111                          *
1112                          * Example: [2,3,4] -> [3,4,5] moves everything.
1113                          *          [0-7]   -> [3,4,5] moves only 0,1,2,6,7.
1114                          */
1115 
1116                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
1117                                                 (node_isset(s, *to)))
1118                                 continue;
1119 
1120                         d = node_remap(s, *from, *to);
1121                         if (s == d)
1122                                 continue;
1123 
1124                         source = s;     /* Node moved. Memorize */
1125                         dest = d;
1126 
1127                         /* dest not in remaining from nodes? */
1128                         if (!node_isset(dest, tmp))
1129                                 break;
1130                 }
1131                 if (source == NUMA_NO_NODE)
1132                         break;
1133 
1134                 node_clear(source, tmp);
1135                 err = migrate_to_node(mm, source, dest, flags);
1136                 if (err > 0)
1137                         busy += err;
1138                 if (err < 0)
1139                         break;
1140         }
1141 out:
1142         up_read(&mm->mmap_sem);
1143         if (err < 0)
1144                 return err;
1145         return busy;
1146 
1147 }
1148 
1149 /*
1150  * Allocate a new page for page migration based on vma policy.
1151  * Start by assuming the page is mapped by the same vma as contains @start.
1152  * Search forward from there, if not.  N.B., this assumes that the
1153  * list of pages handed to migrate_pages()--which is how we get here--
1154  * is in virtual address order.
1155  */
1156 static struct page *new_page(struct page *page, unsigned long start, int **x)
1157 {
1158         struct vm_area_struct *vma;
1159         unsigned long uninitialized_var(address);
1160 
1161         vma = find_vma(current->mm, start);
1162         while (vma) {
1163                 address = page_address_in_vma(page, vma);
1164                 if (address != -EFAULT)
1165                         break;
1166                 vma = vma->vm_next;
1167         }
1168 
1169         if (PageHuge(page)) {
1170                 BUG_ON(!vma);
1171                 return alloc_huge_page_noerr(vma, address, 1);
1172         }
1173         /*
1174          * if !vma, alloc_page_vma() will use task or system default policy
1175          */
1176         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1177 }
1178 #else
1179 
1180 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1181                                 unsigned long flags)
1182 {
1183 }
1184 
1185 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1186                      const nodemask_t *to, int flags)
1187 {
1188         return -ENOSYS;
1189 }
1190 
1191 static struct page *new_page(struct page *page, unsigned long start, int **x)
1192 {
1193         return NULL;
1194 }
1195 #endif
1196 
1197 static long do_mbind(unsigned long start, unsigned long len,
1198                      unsigned short mode, unsigned short mode_flags,
1199                      nodemask_t *nmask, unsigned long flags)
1200 {
1201         struct mm_struct *mm = current->mm;
1202         struct mempolicy *new;
1203         unsigned long end;
1204         int err;
1205         LIST_HEAD(pagelist);
1206 
1207         if (flags & ~(unsigned long)MPOL_MF_VALID)
1208                 return -EINVAL;
1209         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1210                 return -EPERM;
1211 
1212         if (start & ~PAGE_MASK)
1213                 return -EINVAL;
1214 
1215         if (mode == MPOL_DEFAULT)
1216                 flags &= ~MPOL_MF_STRICT;
1217 
1218         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1219         end = start + len;
1220 
1221         if (end < start)
1222                 return -EINVAL;
1223         if (end == start)
1224                 return 0;
1225 
1226         new = mpol_new(mode, mode_flags, nmask);
1227         if (IS_ERR(new))
1228                 return PTR_ERR(new);
1229 
1230         if (flags & MPOL_MF_LAZY)
1231                 new->flags |= MPOL_F_MOF;
1232 
1233         /*
1234          * If we are using the default policy then operations
1235          * on discontinuous address spaces are okay after all
1236          */
1237         if (!new)
1238                 flags |= MPOL_MF_DISCONTIG_OK;
1239 
1240         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1241                  start, start + len, mode, mode_flags,
1242                  nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1243 
1244         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1245 
1246                 err = migrate_prep();
1247                 if (err)
1248                         goto mpol_out;
1249         }
1250         {
1251                 NODEMASK_SCRATCH(scratch);
1252                 if (scratch) {
1253                         down_write(&mm->mmap_sem);
1254                         task_lock(current);
1255                         err = mpol_set_nodemask(new, nmask, scratch);
1256                         task_unlock(current);
1257                         if (err)
1258                                 up_write(&mm->mmap_sem);
1259                 } else
1260                         err = -ENOMEM;
1261                 NODEMASK_SCRATCH_FREE(scratch);
1262         }
1263         if (err)
1264                 goto mpol_out;
1265 
1266         err = queue_pages_range(mm, start, end, nmask,
1267                           flags | MPOL_MF_INVERT, &pagelist);
1268         if (!err)
1269                 err = mbind_range(mm, start, end, new);
1270 
1271         if (!err) {
1272                 int nr_failed = 0;
1273 
1274                 if (!list_empty(&pagelist)) {
1275                         WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1276                         nr_failed = migrate_pages(&pagelist, new_page, NULL,
1277                                 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1278                         if (nr_failed)
1279                                 putback_movable_pages(&pagelist);
1280                 }
1281 
1282                 if (nr_failed && (flags & MPOL_MF_STRICT))
1283                         err = -EIO;
1284         } else
1285                 putback_movable_pages(&pagelist);
1286 
1287         up_write(&mm->mmap_sem);
1288  mpol_out:
1289         mpol_put(new);
1290         return err;
1291 }
1292 
1293 /*
1294  * User space interface with variable sized bitmaps for nodelists.
1295  */
1296 
1297 /* Copy a node mask from user space. */
1298 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1299                      unsigned long maxnode)
1300 {
1301         unsigned long k;
1302         unsigned long nlongs;
1303         unsigned long endmask;
1304 
1305         --maxnode;
1306         nodes_clear(*nodes);
1307         if (maxnode == 0 || !nmask)
1308                 return 0;
1309         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1310                 return -EINVAL;
1311 
1312         nlongs = BITS_TO_LONGS(maxnode);
1313         if ((maxnode % BITS_PER_LONG) == 0)
1314                 endmask = ~0UL;
1315         else
1316                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1317 
1318         /* When the user specified more nodes than supported, just check
1319            that the unsupported part is all zero. */
1320         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1321                 if (nlongs > PAGE_SIZE/sizeof(long))
1322                         return -EINVAL;
1323                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1324                         unsigned long t;
1325                         if (get_user(t, nmask + k))
1326                                 return -EFAULT;
1327                         if (k == nlongs - 1) {
1328                                 if (t & endmask)
1329                                         return -EINVAL;
1330                         } else if (t)
1331                                 return -EINVAL;
1332                 }
1333                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1334                 endmask = ~0UL;
1335         }
1336 
1337         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1338                 return -EFAULT;
1339         nodes_addr(*nodes)[nlongs-1] &= endmask;
1340         return 0;
1341 }
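
/*
 * Worked example (64-bit longs): a caller passing maxnode == 17 ends up
 * with maxnode == 16 after the decrement above, so nlongs == 1 and
 * endmask == 0xffff: one long is copied and only its low 16 bits
 * (nodes 0-15) are kept.  Passing maxnode == 65 keeps all 64 bits of the
 * first long (endmask == ~0UL).
 */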
1342 
1343 /* Copy a kernel node mask to user space */
1344 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1345                               nodemask_t *nodes)
1346 {
1347         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1348         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1349 
1350         if (copy > nbytes) {
1351                 if (copy > PAGE_SIZE)
1352                         return -EINVAL;
1353                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1354                         return -EFAULT;
1355                 copy = nbytes;
1356         }
1357         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1358 }
1359 
1360 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1361                 unsigned long, mode, const unsigned long __user *, nmask,
1362                 unsigned long, maxnode, unsigned, flags)
1363 {
1364         nodemask_t nodes;
1365         int err;
1366         unsigned short mode_flags;
1367 
1368         mode_flags = mode & MPOL_MODE_FLAGS;
1369         mode &= ~MPOL_MODE_FLAGS;
1370         if (mode >= MPOL_MAX)
1371                 return -EINVAL;
1372         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1373             (mode_flags & MPOL_F_RELATIVE_NODES))
1374                 return -EINVAL;
1375         err = get_nodes(&nodes, nmask, maxnode);
1376         if (err)
1377                 return err;
1378         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1379 }
1380 
1381 /* Set the process memory policy */
1382 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1383                 unsigned long, maxnode)
1384 {
1385         int err;
1386         nodemask_t nodes;
1387         unsigned short flags;
1388 
1389         flags = mode & MPOL_MODE_FLAGS;
1390         mode &= ~MPOL_MODE_FLAGS;
1391         if ((unsigned int)mode >= MPOL_MAX)
1392                 return -EINVAL;
1393         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1394                 return -EINVAL;
1395         err = get_nodes(&nodes, nmask, maxnode);
1396         if (err)
1397                 return err;
1398         return do_set_mempolicy(mode, flags, &nodes);
1399 }
1400 
1401 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1402                 const unsigned long __user *, old_nodes,
1403                 const unsigned long __user *, new_nodes)
1404 {
1405         const struct cred *cred = current_cred(), *tcred;
1406         struct mm_struct *mm = NULL;
1407         struct task_struct *task;
1408         nodemask_t task_nodes;
1409         int err;
1410         nodemask_t *old;
1411         nodemask_t *new;
1412         NODEMASK_SCRATCH(scratch);
1413 
1414         if (!scratch)
1415                 return -ENOMEM;
1416 
1417         old = &scratch->mask1;
1418         new = &scratch->mask2;
1419 
1420         err = get_nodes(old, old_nodes, maxnode);
1421         if (err)
1422                 goto out;
1423 
1424         err = get_nodes(new, new_nodes, maxnode);
1425         if (err)
1426                 goto out;
1427 
1428         /* Find the mm_struct */
1429         rcu_read_lock();
1430         task = pid ? find_task_by_vpid(pid) : current;
1431         if (!task) {
1432                 rcu_read_unlock();
1433                 err = -ESRCH;
1434                 goto out;
1435         }
1436         get_task_struct(task);
1437 
1438         err = -EINVAL;
1439 
1440         /*
1441          * Check if this process has the right to modify the specified
1442          * process. The right exists if the process has administrative
1443          * capabilities, superuser privileges or the same
1444          * userid as the target process.
1445          */
1446         tcred = __task_cred(task);
1447         if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1448             !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1449             !capable(CAP_SYS_NICE)) {
1450                 rcu_read_unlock();
1451                 err = -EPERM;
1452                 goto out_put;
1453         }
1454         rcu_read_unlock();
1455 
1456         task_nodes = cpuset_mems_allowed(task);
1457         /* Is the user allowed to access the target nodes? */
1458         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1459                 err = -EPERM;
1460                 goto out_put;
1461         }
1462 
1463         if (!nodes_subset(*new, node_states[N_MEMORY])) {
1464                 err = -EINVAL;
1465                 goto out_put;
1466         }
1467 
1468         err = security_task_movememory(task);
1469         if (err)
1470                 goto out_put;
1471 
1472         mm = get_task_mm(task);
1473         put_task_struct(task);
1474 
1475         if (!mm) {
1476                 err = -EINVAL;
1477                 goto out;
1478         }
1479 
1480         err = do_migrate_pages(mm, old, new,
1481                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1482 
1483         mmput(mm);
1484 out:
1485         NODEMASK_SCRATCH_FREE(scratch);
1486 
1487         return err;
1488 
1489 out_put:
1490         put_task_struct(task);
1491         goto out;
1492 
1493 }
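/*
 * Editorial note: an illustrative userspace sketch (not part of this file)
 * of the migrate_pages(2) wrapper from <numaif.h>, which lands in the
 * syscall above.  The pid and node numbers are example values; as checked
 * above, the caller needs a matching uid or CAP_SYS_NICE.
 *
 *	#include <numaif.h>		// migrate_pages()
 *
 *	long move_task_from_node0_to_node1(int pid)
 *	{
 *		unsigned long old_nodes = 1UL << 0;	// node 0
 *		unsigned long new_nodes = 1UL << 1;	// node 1
 *
 *		return migrate_pages(pid, sizeof(unsigned long) * 8,
 *				     &old_nodes, &new_nodes);
 *	}
 */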
1494 
1495 
1496 /* Retrieve NUMA policy */
1497 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1498                 unsigned long __user *, nmask, unsigned long, maxnode,
1499                 unsigned long, addr, unsigned long, flags)
1500 {
1501         int err;
1502         int uninitialized_var(pval);
1503         nodemask_t nodes;
1504 
1505         if (nmask != NULL && maxnode < MAX_NUMNODES)
1506                 return -EINVAL;
1507 
1508         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1509 
1510         if (err)
1511                 return err;
1512 
1513         if (policy && put_user(pval, policy))
1514                 return -EFAULT;
1515 
1516         if (nmask)
1517                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1518 
1519         return err;
1520 }
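/*
 * Editorial note: an illustrative userspace sketch (not part of this file)
 * of the get_mempolicy(2) wrapper from <numaif.h>, querying the policy that
 * governs a given address (MPOL_F_ADDR).  Note the check above: when a
 * nodemask is supplied, maxnode must cover the kernel's MAX_NUMNODES, so
 * the buffer below is sized generously (4096 bits).
 *
 *	#include <numaif.h>		// get_mempolicy(), MPOL_F_ADDR
 *	#include <stdio.h>
 *
 *	void show_policy_of(void *addr)
 *	{
 *		int mode;
 *		unsigned long mask[4096 / (8 * sizeof(unsigned long))] = { 0 };
 *
 *		if (get_mempolicy(&mode, mask, 4096, addr, MPOL_F_ADDR) == 0)
 *			printf("mode %d, first mask word %#lx\n", mode, mask[0]);
 *	}
 */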
1521 
1522 #ifdef CONFIG_COMPAT
1523 
1524 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1525                        compat_ulong_t __user *, nmask,
1526                        compat_ulong_t, maxnode,
1527                        compat_ulong_t, addr, compat_ulong_t, flags)
1528 {
1529         long err;
1530         unsigned long __user *nm = NULL;
1531         unsigned long nr_bits, alloc_size;
1532         DECLARE_BITMAP(bm, MAX_NUMNODES);
1533 
1534         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1535         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1536 
1537         if (nmask)
1538                 nm = compat_alloc_user_space(alloc_size);
1539 
1540         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1541 
1542         if (!err && nmask) {
1543                 unsigned long copy_size;
1544                 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1545                 err = copy_from_user(bm, nm, copy_size);
1546                 /* ensure entire bitmap is zeroed */
1547                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1548                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1549         }
1550 
1551         return err;
1552 }
1553 
1554 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1555                        compat_ulong_t, maxnode)
1556 {
1557         unsigned long __user *nm = NULL;
1558         unsigned long nr_bits, alloc_size;
1559         DECLARE_BITMAP(bm, MAX_NUMNODES);
1560 
1561         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1562         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1563 
1564         if (nmask) {
1565                 if (compat_get_bitmap(bm, nmask, nr_bits))
1566                         return -EFAULT;
1567                 nm = compat_alloc_user_space(alloc_size);
1568                 if (copy_to_user(nm, bm, alloc_size))
1569                         return -EFAULT;
1570         }
1571 
1572         return sys_set_mempolicy(mode, nm, nr_bits+1);
1573 }
1574 
1575 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1576                        compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1577                        compat_ulong_t, maxnode, compat_ulong_t, flags)
1578 {
1579         unsigned long __user *nm = NULL;
1580         unsigned long nr_bits, alloc_size;
1581         nodemask_t bm;
1582 
1583         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1584         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1585 
1586         if (nmask) {
1587                 if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1588                         return -EFAULT;
1589                 nm = compat_alloc_user_space(alloc_size);
1590                 if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1591                         return -EFAULT;
1592         }
1593 
1594         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1595 }
1596 
1597 #endif
1598 
1599 /*
1600  * get_vma_policy(@task, @vma, @addr)
1601  * @task: task for fallback if vma policy == default
1602  * @vma: virtual memory area whose policy is sought
1603  * @addr: address in @vma for shared policy lookup
1604  *
1605  * Returns effective policy for a VMA at specified address.
1606  * Falls back to @task or system default policy, as necessary.
1607  * Current or other task's task mempolicy and non-shared vma policies must be
1608  * protected by task_lock(task) by the caller.
1609  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1610  * count--added by the get_policy() vm_op, as appropriate--to protect against
1611  * freeing by another task.  It is the caller's responsibility to free the
1612  * extra reference for shared policies.
1613  */
1614 struct mempolicy *get_vma_policy(struct task_struct *task,
1615                 struct vm_area_struct *vma, unsigned long addr)
1616 {
1617         struct mempolicy *pol = get_task_policy(task);
1618 
1619         if (vma) {
1620                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1621                         struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1622                                                                         addr);
1623                         if (vpol)
1624                                 pol = vpol;
1625                 } else if (vma->vm_policy) {
1626                         pol = vma->vm_policy;
1627 
1628                         /*
1629                          * shmem_alloc_page() passes MPOL_F_SHARED policy with
1630                          * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1631                          * count on these policies which will be dropped by
1632                          * mpol_cond_put() later
1633                          */
1634                         if (mpol_needs_cond_ref(pol))
1635                                 mpol_get(pol);
1636                 }
1637         }
1638         if (!pol)
1639                 pol = &default_policy;
1640         return pol;
1641 }
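/*
 * Editorial note: a condensed sketch of the reference protocol described
 * above, mirroring the callers later in this file (alloc_pages_vma(),
 * mpol_misplaced()):
 *
 *	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 *
 *	// ... consult pol->mode / pol->v.* to pick a node or zonelist ...
 *
 *	mpol_cond_put(pol);	// drops the ref only for MPOL_F_SHARED policies
 */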
1642 
1643 bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
1644 {
1645         struct mempolicy *pol = get_task_policy(task);
1646         if (vma) {
1647                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1648                         bool ret = false;
1649 
1650                         pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1651                         if (pol && (pol->flags & MPOL_F_MOF))
1652                                 ret = true;
1653                         mpol_cond_put(pol);
1654 
1655                         return ret;
1656                 } else if (vma->vm_policy) {
1657                         pol = vma->vm_policy;
1658                 }
1659         }
1660 
1661         if (!pol)
1662                 return default_policy.flags & MPOL_F_MOF;
1663 
1664         return pol->flags & MPOL_F_MOF;
1665 }
1666 
1667 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1668 {
1669         enum zone_type dynamic_policy_zone = policy_zone;
1670 
1671         BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1672 
1673         /*
1674          * if policy->v.nodes has movable memory only,
1675          * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1676          *
1677          * policy->v.nodes is intersected with node_states[N_MEMORY],
1678          * so if the following test fails, it implies
1679          * policy->v.nodes has movable memory only.
1680          */
1681         if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1682                 dynamic_policy_zone = ZONE_MOVABLE;
1683 
1684         return zone >= dynamic_policy_zone;
1685 }
1686 
1687 /*
1688  * Return a nodemask representing a mempolicy for filtering nodes for
1689  * page allocation
1690  */
1691 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1692 {
1693         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1694         if (unlikely(policy->mode == MPOL_BIND) &&
1695                         apply_policy_zone(policy, gfp_zone(gfp)) &&
1696                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1697                 return &policy->v.nodes;
1698 
1699         return NULL;
1700 }
1701 
1702 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1703 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1704         int nd)
1705 {
1706         switch (policy->mode) {
1707         case MPOL_PREFERRED:
1708                 if (!(policy->flags & MPOL_F_LOCAL))
1709                         nd = policy->v.preferred_node;
1710                 break;
1711         case MPOL_BIND:
1712                 /*
1713                  * Normally, MPOL_BIND allocations are node-local within the
1714                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1715                  * current node isn't part of the mask, we use the zonelist for
1716                  * the first node in the mask instead.
1717                  */
1718                 if (unlikely(gfp & __GFP_THISNODE) &&
1719                                 unlikely(!node_isset(nd, policy->v.nodes)))
1720                         nd = first_node(policy->v.nodes);
1721                 break;
1722         default:
1723                 BUG();
1724         }
1725         return node_zonelist(nd, gfp);
1726 }
1727 
1728 /* Do dynamic interleaving for a process */
1729 static unsigned interleave_nodes(struct mempolicy *policy)
1730 {
1731         unsigned nid, next;
1732         struct task_struct *me = current;
1733 
1734         nid = me->il_next;
1735         next = next_node(nid, policy->v.nodes);
1736         if (next >= MAX_NUMNODES)
1737                 next = first_node(policy->v.nodes);
1738         if (next < MAX_NUMNODES)
1739                 me->il_next = next;
1740         return nid;
1741 }
1742 
1743 /*
1744  * Depending on the memory policy provide a node from which to allocate the
1745  * next slab entry.
1746  */
1747 unsigned int mempolicy_slab_node(void)
1748 {
1749         struct mempolicy *policy;
1750         int node = numa_mem_id();
1751 
1752         if (in_interrupt())
1753                 return node;
1754 
1755         policy = current->mempolicy;
1756         if (!policy || policy->flags & MPOL_F_LOCAL)
1757                 return node;
1758 
1759         switch (policy->mode) {
1760         case MPOL_PREFERRED:
1761                 /*
1762                  * handled MPOL_F_LOCAL above
1763                  */
1764                 return policy->v.preferred_node;
1765 
1766         case MPOL_INTERLEAVE:
1767                 return interleave_nodes(policy);
1768 
1769         case MPOL_BIND: {
1770                 /*
1771                  * Follow bind policy behavior and start allocation at the
1772                  * first node.
1773                  */
1774                 struct zonelist *zonelist;
1775                 struct zone *zone;
1776                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1777                 zonelist = &NODE_DATA(node)->node_zonelists[0];
1778                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1779                                                         &policy->v.nodes,
1780                                                         &zone);
1781                 return zone ? zone->node : node;
1782         }
1783 
1784         default:
1785                 BUG();
1786         }
1787 }
1788 
1789 /* Do static interleaving for a VMA with known offset. */
1790 static unsigned offset_il_node(struct mempolicy *pol,
1791                 struct vm_area_struct *vma, unsigned long off)
1792 {
1793         unsigned nnodes = nodes_weight(pol->v.nodes);
1794         unsigned target;
1795         int c;
1796         int nid = NUMA_NO_NODE;
1797 
1798         if (!nnodes)
1799                 return numa_node_id();
1800         target = (unsigned int)off % nnodes;
1801         c = 0;
1802         do {
1803                 nid = next_node(nid, pol->v.nodes);
1804                 c++;
1805         } while (c <= target);
1806         return nid;
1807 }
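/*
 * Editorial note: a worked example of the walk above.  With an interleave
 * nodemask of {0,2,5} (nnodes == 3) and off == 7, target = 7 % 3 = 1; the
 * do/while visits node 0 (c == 1), then node 2 (c == 2) and stops, returning
 * node 2, i.e. the set bit whose zero-based ordinal equals target.
 */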
1808 
1809 /* Determine a node number for interleave */
1810 static inline unsigned interleave_nid(struct mempolicy *pol,
1811                  struct vm_area_struct *vma, unsigned long addr, int shift)
1812 {
1813         if (vma) {
1814                 unsigned long off;
1815 
1816                 /*
1817                  * for small pages, there is no difference between
1818                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1819                  * for huge pages, since vm_pgoff is in units of small
1820                  * pages, we need to shift off the always 0 bits to get
1821                  * a useful offset.
1822                  */
1823                 BUG_ON(shift < PAGE_SHIFT);
1824                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1825                 off += (addr - vma->vm_start) >> shift;
1826                 return offset_il_node(pol, vma, off);
1827         } else
1828                 return interleave_nodes(pol);
1829 }
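/*
 * Editorial note: a worked example of the offset arithmetic above for a
 * 2MB huge page (shift == 21, PAGE_SHIFT == 12):
 *
 *	off  = vma->vm_pgoff >> (21 - 12);	// huge pages into the file
 *	off += (addr - vma->vm_start) >> 21;	// huge pages into the vma
 *
 * so faulting addresses one huge page apart get consecutive interleave
 * offsets, which offset_il_node() then spreads round-robin over the nodemask.
 */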
1830 
1831 /*
1832  * Return the bit number of a random bit set in the nodemask.
1833  * (returns NUMA_NO_NODE if nodemask is empty)
1834  */
1835 int node_random(const nodemask_t *maskp)
1836 {
1837         int w, bit = NUMA_NO_NODE;
1838 
1839         w = nodes_weight(*maskp);
1840         if (w)
1841                 bit = bitmap_ord_to_pos(maskp->bits,
1842                         get_random_int() % w, MAX_NUMNODES);
1843         return bit;
1844 }
1845 
1846 #ifdef CONFIG_HUGETLBFS
1847 /*
1848  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1849  * @vma: virtual memory area whose policy is sought
1850  * @addr: address in @vma for shared policy lookup and interleave policy
1851  * @gfp_flags: for requested zone
1852  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1853  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1854  *
1855  * Returns a zonelist suitable for a huge page allocation and a pointer
1856  * to the struct mempolicy for conditional unref after allocation.
1857  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1858  * @nodemask for filtering the zonelist.
1859  *
1860  * Must be protected by read_mems_allowed_begin()
1861  */
1862 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1863                                 gfp_t gfp_flags, struct mempolicy **mpol,
1864                                 nodemask_t **nodemask)
1865 {
1866         struct zonelist *zl;
1867 
1868         *mpol = get_vma_policy(current, vma, addr);
1869         *nodemask = NULL;       /* assume !MPOL_BIND */
1870 
1871         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1872                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1873                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1874         } else {
1875                 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1876                 if ((*mpol)->mode == MPOL_BIND)
1877                         *nodemask = &(*mpol)->v.nodes;
1878         }
1879         return zl;
1880 }
1881 
1882 /*
1883  * init_nodemask_of_mempolicy
1884  *
1885  * If the current task's mempolicy is "default" [NULL], return 'false'
1886  * to indicate default policy.  Otherwise, extract the policy nodemask
1887  * for 'bind' or 'interleave' policy into the argument nodemask, or
1888  * initialize the argument nodemask to contain the single node for
1889  * 'preferred' or 'local' policy and return 'true' to indicate presence
1890  * of non-default mempolicy.
1891  *
1892  * We don't bother with reference counting the mempolicy [mpol_get/put]
1893  * because the current task is examining its own mempolicy and a task's
1894  * mempolicy is only ever changed by the task itself.
1895  *
1896  * N.B., it is the caller's responsibility to free a returned nodemask.
1897  */
1898 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1899 {
1900         struct mempolicy *mempolicy;
1901         int nid;
1902 
1903         if (!(mask && current->mempolicy))
1904                 return false;
1905 
1906         task_lock(current);
1907         mempolicy = current->mempolicy;
1908         switch (mempolicy->mode) {
1909         case MPOL_PREFERRED:
1910                 if (mempolicy->flags & MPOL_F_LOCAL)
1911                         nid = numa_node_id();
1912                 else
1913                         nid = mempolicy->v.preferred_node;
1914                 init_nodemask_of_node(mask, nid);
1915                 break;
1916 
1917         case MPOL_BIND:
1918                 /* Fall through */
1919         case MPOL_INTERLEAVE:
1920                 *mask =  mempolicy->v.nodes;
1921                 break;
1922 
1923         default:
1924                 BUG();
1925         }
1926         task_unlock(current);
1927 
1928         return true;
1929 }
1930 #endif
1931 
1932 /*
1933  * mempolicy_nodemask_intersects
1934  *
1935  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1936  * policy.  Otherwise, check for intersection between mask and the policy
1937  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1938  * policy, always return true since it may allocate elsewhere on fallback.
1939  *
1940  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1941  */
1942 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1943                                         const nodemask_t *mask)
1944 {
1945         struct mempolicy *mempolicy;
1946         bool ret = true;
1947 
1948         if (!mask)
1949                 return ret;
1950         task_lock(tsk);
1951         mempolicy = tsk->mempolicy;
1952         if (!mempolicy)
1953                 goto out;
1954 
1955         switch (mempolicy->mode) {
1956         case MPOL_PREFERRED:
1957                 /*
1958                  * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
1959                  * allocate from; they may fall back to other nodes when OOM.
1960                  * Thus, it's possible for tsk to have allocated memory from
1961                  * nodes in mask.
1962                  */
1963                 break;
1964         case MPOL_BIND:
1965         case MPOL_INTERLEAVE:
1966                 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1967                 break;
1968         default:
1969                 BUG();
1970         }
1971 out:
1972         task_unlock(tsk);
1973         return ret;
1974 }
1975 
1976 /* Allocate a page under the interleave policy.
1977    Uses its own path because it needs to do special accounting. */
1978 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1979                                         unsigned nid)
1980 {
1981         struct zonelist *zl;
1982         struct page *page;
1983 
1984         zl = node_zonelist(nid, gfp);
1985         page = __alloc_pages(gfp, order, zl);
1986         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1987                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1988         return page;
1989 }
1990 
1991 /**
1992  *      alloc_pages_vma - Allocate a page for a VMA.
1993  *
1994  *      @gfp:
1995  *      %GFP_USER    user allocation.
1996  *      %GFP_KERNEL  kernel allocations,
1997  *      %GFP_HIGHMEM highmem/user allocations,
1998  *      %GFP_FS      allocation should not call back into a file system.
1999  *      %GFP_ATOMIC  don't sleep.
2000  *
2001  *      @order:Order of the GFP allocation.
2002  *      @vma:  Pointer to VMA or NULL if not available.
2003  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
2004  *
2005  *      This function allocates a page from the kernel page pool and applies
2006  *      a NUMA policy associated with the VMA or the current process.
2007  *      When VMA is not NULL, the caller must hold down_read on the mmap_sem of the
2008  *      mm_struct of the VMA to prevent it from going away. Should be used for
2009  *      all allocations for pages that will be mapped into
2010  *      user space. Returns NULL when no page can be allocated.
2011  *
2012  *      Should be called with the mmap_sem of the vma's mm_struct held.
2013  */
2014 struct page *
2015 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2016                 unsigned long addr, int node)
2017 {
2018         struct mempolicy *pol;
2019         struct page *page;
2020         unsigned int cpuset_mems_cookie;
2021 
2022 retry_cpuset:
2023         pol = get_vma_policy(current, vma, addr);
2024         cpuset_mems_cookie = read_mems_allowed_begin();
2025 
2026         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
2027                 unsigned nid;
2028 
2029                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2030                 mpol_cond_put(pol);
2031                 page = alloc_page_interleave(gfp, order, nid);
2032                 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2033                         goto retry_cpuset;
2034 
2035                 return page;
2036         }
2037         page = __alloc_pages_nodemask(gfp, order,
2038                                       policy_zonelist(gfp, pol, node),
2039                                       policy_nodemask(gfp, pol));
2040         if (unlikely(mpol_needs_cond_ref(pol)))
2041                 __mpol_put(pol);
2042         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2043                 goto retry_cpuset;
2044         return page;
2045 }
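/*
 * Editorial note: a sketch of the typical call site.  Fault handlers
 * normally use the order-0 alloc_page_vma() helper from <linux/gfp.h>,
 * which passes the local node, e.g.:
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 *
 * with the vma's mmap_sem held for read, as required above.
 */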
2046 
2047 /**
2048  *      alloc_pages_current - Allocate pages.
2049  *
2050  *      @gfp:
2051  *              %GFP_USER   user allocation,
2052  *              %GFP_KERNEL kernel allocation,
2053  *              %GFP_HIGHMEM highmem allocation,
2054  *              %GFP_FS     don't call back into a file system.
2055  *              %GFP_ATOMIC don't sleep.
2056  *      @order: Power of two of allocation size in pages. 0 is a single page.
2057  *
2058  *      Allocate a page from the kernel page pool.  When not in
2059  *      interrupt context, apply the current process' NUMA policy.
2060  *      Returns NULL when no page can be allocated.
2061  *
2062  *      Don't call cpuset_update_task_memory_state() unless
2063  *      1) it's ok to take cpuset_sem (can WAIT), and
2064  *      2) allocating for current task (not interrupt).
2065  */
2066 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2067 {
2068         struct mempolicy *pol = get_task_policy(current);
2069         struct page *page;
2070         unsigned int cpuset_mems_cookie;
2071 
2072         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2073                 pol = &default_policy;
2074 
2075 retry_cpuset:
2076         cpuset_mems_cookie = read_mems_allowed_begin();
2077 
2078         /*
2079          * No reference counting needed for current->mempolicy
2080          * nor system default_policy
2081          */
2082         if (pol->mode == MPOL_INTERLEAVE)
2083                 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2084         else
2085                 page = __alloc_pages_nodemask(gfp, order,
2086                                 policy_zonelist(gfp, pol, numa_node_id()),
2087                                 policy_nodemask(gfp, pol));
2088 
2089         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2090                 goto retry_cpuset;
2091 
2092         return page;
2093 }
2094 EXPORT_SYMBOL(alloc_pages_current);
2095 
2096 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2097 {
2098         struct mempolicy *pol = mpol_dup(vma_policy(src));
2099 
2100         if (IS_ERR(pol))
2101                 return PTR_ERR(pol);
2102         dst->vm_policy = pol;
2103         return 0;
2104 }
2105 
2106 /*
2107  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2108  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2109  * with the mems_allowed returned by cpuset_mems_allowed().  This
2110  * keeps mempolicies cpuset relative after its cpuset moves.  See
2111  * further kernel/cpuset.c update_nodemask().
2112  *
2113  * current's mempolicy may be rebound by another task (the task that changes
2114  * the cpuset's mems), so we need not do the rebind work for the current task.
2115  */
2116 
2117 /* Slow path of a mempolicy duplicate */
2118 struct mempolicy *__mpol_dup(struct mempolicy *old)
2119 {
2120         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2121 
2122         if (!new)
2123                 return ERR_PTR(-ENOMEM);
2124 
2125         /* task's mempolicy is protected by alloc_lock */
2126         if (old == current->mempolicy) {
2127                 task_lock(current);
2128                 *new = *old;
2129                 task_unlock(current);
2130         } else
2131                 *new = *old;
2132 
2133         if (current_cpuset_is_being_rebound()) {
2134                 nodemask_t mems = cpuset_mems_allowed(current);
2135                 if (new->flags & MPOL_F_REBINDING)
2136                         mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2137                 else
2138                         mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2139         }
2140         atomic_set(&new->refcnt, 1);
2141         return new;
2142 }
2143 
2144 /* Slow path of a mempolicy comparison */
2145 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2146 {
2147         if (!a || !b)
2148                 return false;
2149         if (a->mode != b->mode)
2150                 return false;
2151         if (a->flags != b->flags)
2152                 return false;
2153         if (mpol_store_user_nodemask(a))
2154                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2155                         return false;
2156 
2157         switch (a->mode) {
2158         case MPOL_BIND:
2159                 /* Fall through */
2160         case MPOL_INTERLEAVE:
2161                 return !!nodes_equal(a->v.nodes, b->v.nodes);
2162         case MPOL_PREFERRED:
2163                 /* a's ->flags is the same as b's */
2164                 if (a->flags & MPOL_F_LOCAL)
2165                         return true;
2166                 return a->v.preferred_node == b->v.preferred_node;
2167         default:
2168                 BUG();
2169                 return false;
2170         }
2171 }
2172 
2173 /*
2174  * Shared memory backing store policy support.
2175  *
2176  * Remember policies even when nobody has shared memory mapped.
2177  * The policies are kept in Red-Black tree linked from the inode.
2178  * They are protected by the sp->lock spinlock, which should be held
2179  * for any accesses to the tree.
2180  */
2181 
2182 /* lookup first element intersecting start-end */
2183 /* Caller holds sp->lock */
2184 static struct sp_node *
2185 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2186 {
2187         struct rb_node *n = sp->root.rb_node;
2188 
2189         while (n) {
2190                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2191 
2192                 if (start >= p->end)
2193                         n = n->rb_right;
2194                 else if (end <= p->start)
2195                         n = n->rb_left;
2196                 else
2197                         break;
2198         }
2199         if (!n)
2200                 return NULL;
2201         for (;;) {
2202                 struct sp_node *w = NULL;
2203                 struct rb_node *prev = rb_prev(n);
2204                 if (!prev)
2205                         break;
2206                 w = rb_entry(prev, struct sp_node, nd);
2207                 if (w->end <= start)
2208                         break;
2209                 n = prev;
2210         }
2211         return rb_entry(n, struct sp_node, nd);
2212 }
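/*
 * Editorial note: a worked example of the range lookup above.  If the tree
 * holds policies for the page-offset ranges [2,5) and [8,12), then
 * sp_lookup(sp, 4, 9) returns the [2,5) node: whichever intersecting node
 * the descent stops at, the rb_prev() walk rewinds to the lowest one that
 * still overlaps [start,end).  sp_lookup(sp, 5, 8) overlaps neither range
 * and returns NULL.
 */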
2213 
2214 /* Insert a new shared policy into the list. */
2215 /* Caller holds sp->lock */
2216 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2217 {
2218         struct rb_node **p = &sp->root.rb_node;
2219         struct rb_node *parent = NULL;
2220         struct sp_node *nd;
2221 
2222         while (*p) {
2223                 parent = *p;
2224                 nd = rb_entry(parent, struct sp_node, nd);
2225                 if (new->start < nd->start)
2226                         p = &(*p)->rb_left;
2227                 else if (new->end > nd->end)
2228                         p = &(*p)->rb_right;
2229                 else
2230                         BUG();
2231         }
2232         rb_link_node(&new->nd, parent, p);
2233         rb_insert_color(&new->nd, &sp->root);
2234         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2235                  new->policy ? new->policy->mode : 0);
2236 }
2237 
2238 /* Find shared policy intersecting idx */
2239 struct mempolicy *
2240 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2241 {
2242         struct mempolicy *pol = NULL;
2243         struct sp_node *sn;
2244 
2245         if (!sp->root.rb_node)
2246                 return NULL;
2247         spin_lock(&sp->lock);
2248         sn = sp_lookup(sp, idx, idx+1);
2249         if (sn) {
2250                 mpol_get(sn->policy);
2251                 pol = sn->policy;
2252         }
2253         spin_unlock(&sp->lock);
2254         return pol;
2255 }
2256 
2257 static void sp_free(struct sp_node *n)
2258 {
2259         mpol_put(n->policy);
2260         kmem_cache_free(sn_cache, n);
2261 }
2262 
2263 /**
2264  * mpol_misplaced - check whether current page node is valid in policy
2265  *
2266  * @page: page to be checked
2267  * @vma: vm area where page mapped
2268  * @addr: virtual address where page mapped
2269  *
2270  * Lookup current policy node id for vma,addr and "compare to" page's
2271  * node id.
2272  *
2273  * Returns:
2274  *      -1      - not misplaced, page is in the right node
2275  *      node    - node id where the page should be
2276  *
2277  * Policy determination "mimics" alloc_page_vma().
2278  * Called from fault path where we know the vma and faulting address.
2279  */
2280 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2281 {
2282         struct mempolicy *pol;
2283         struct zone *zone;
2284         int curnid = page_to_nid(page);
2285         unsigned long pgoff;
2286         int thiscpu = raw_smp_processor_id();
2287         int thisnid = cpu_to_node(thiscpu);
2288         int polnid = -1;
2289         int ret = -1;
2290 
2291         BUG_ON(!vma);
2292 
2293         pol = get_vma_policy(current, vma, addr);
2294         if (!(pol->flags & MPOL_F_MOF))
2295                 goto out;
2296 
2297         switch (pol->mode) {
2298         case MPOL_INTERLEAVE:
2299                 BUG_ON(addr >= vma->vm_end);
2300                 BUG_ON(addr < vma->vm_start);
2301 
2302                 pgoff = vma->vm_pgoff;
2303                 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2304                 polnid = offset_il_node(pol, vma, pgoff);
2305                 break;
2306 
2307         case MPOL_PREFERRED:
2308                 if (pol->flags & MPOL_F_LOCAL)
2309                         polnid = numa_node_id();
2310                 else
2311                         polnid = pol->v.preferred_node;
2312                 break;
2313 
2314         case MPOL_BIND:
2315                 /*
2316                  * MPOL_BIND allows binding to multiple nodes.
2317                  * Use the current page's node if it is in the policy nodemask,
2318                  * else select the nearest allowed node, if any.
2319                  * If there are no allowed nodes, use the current node [!misplaced].
2320                  */
2321                 if (node_isset(curnid, pol->v.nodes))
2322                         goto out;
2323                 (void)first_zones_zonelist(
2324                                 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2325                                 gfp_zone(GFP_HIGHUSER),
2326                                 &pol->v.nodes, &zone);
2327                 polnid = zone->node;
2328                 break;
2329 
2330         default:
2331                 BUG();
2332         }
2333 
2334         /* Migrate the page towards the node whose CPU is referencing it */
2335         if (pol->flags & MPOL_F_MORON) {
2336                 polnid = thisnid;
2337 
2338                 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2339                         goto out;
2340         }
2341 
2342         if (curnid != polnid)
2343                 ret = polnid;
2344 out:
2345         mpol_cond_put(pol);
2346 
2347         return ret;
2348 }
2349 
2350 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2351 {
2352         pr_debug("deleting %lx-%lx\n", n->start, n->end);
2353         rb_erase(&n->nd, &sp->root);
2354         sp_free(n);
2355 }
2356 
2357 static void sp_node_init(struct sp_node *node, unsigned long start,
2358                         unsigned long end, struct mempolicy *pol)
2359 {
2360         node->start = start;
2361         node->end = end;
2362         node->policy = pol;
2363 }
2364 
2365 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2366                                 struct mempolicy *pol)
2367 {
2368         struct sp_node *n;
2369         struct mempolicy *newpol;
2370 
2371         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2372         if (!n)
2373                 return NULL;
2374 
2375         newpol = mpol_dup(pol);
2376         if (IS_ERR(newpol)) {
2377                 kmem_cache_free(sn_cache, n);
2378                 return NULL;
2379         }
2380         newpol->flags |= MPOL_F_SHARED;
2381         sp_node_init(n, start, end, newpol);
2382 
2383         return n;
2384 }
2385 
2386 /* Replace a policy range. */
2387 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2388                                  unsigned long end, struct sp_node *new)
2389 {
2390         struct sp_node *n;
2391         struct sp_node *n_new = NULL;
2392         struct mempolicy *mpol_new = NULL;
2393         int ret = 0;
2394 
2395 restart:
2396         spin_lock(&sp->lock);
2397         n = sp_lookup(sp, start, end);
2398         /* Take care of old policies in the same range. */
2399         while (n && n->start < end) {
2400                 struct rb_node *next = rb_next(&n->nd);
2401                 if (n->start >= start) {
2402                         if (n->end <= end)
2403                                 sp_delete(sp, n);
2404                         else
2405                                 n->start = end;
2406                 } else {
2407                         /* Old policy spanning whole new range. */
2408                         if (n->end > end) {
2409                                 if (!n_new)
2410                                         goto alloc_new;
2411 
2412                                 *mpol_new = *n->policy;
2413                                 atomic_set(&mpol_new->refcnt, 1);
2414                                 sp_node_init(n_new, end, n->end, mpol_new);
2415                                 n->end = start;
2416                                 sp_insert(sp, n_new);
2417                                 n_new = NULL;
2418                                 mpol_new = NULL;
2419                                 break;
2420                         } else
2421                                 n->end = start;
2422                 }
2423                 if (!next)
2424                         break;
2425                 n = rb_entry(next, struct sp_node, nd);
2426         }
2427         if (new)
2428                 sp_insert(sp, new);
2429         spin_unlock(&sp->lock);
2430         ret = 0;
2431 
2432 err_out:
2433         if (mpol_new)
2434                 mpol_put(mpol_new);
2435         if (n_new)
2436                 kmem_cache_free(sn_cache, n_new);
2437 
2438         return ret;
2439 
2440 alloc_new:
2441         spin_unlock(&sp->lock);
2442         ret = -ENOMEM;
2443         n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2444         if (!n_new)
2445                 goto err_out;
2446         mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2447         if (!mpol_new)
2448                 goto err_out;
2449         goto restart;
2450 }
2451 
2452 /**
2453  * mpol_shared_policy_init - initialize shared policy for inode
2454  * @sp: pointer to inode shared policy
2455  * @mpol:  struct mempolicy to install
2456  *
2457  * Install non-NULL @mpol in inode's shared policy rb-tree.
2458  * On entry, the current task has a reference on a non-NULL @mpol.
2459  * This must be released on exit.
2460  * This is called from get_inode() calls, so we can use GFP_KERNEL.
2461  */
2462 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2463 {
2464         int ret;
2465 
2466         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2467         spin_lock_init(&sp->lock);
2468 
2469         if (mpol) {
2470                 struct vm_area_struct pvma;
2471                 struct mempolicy *new;
2472                 NODEMASK_SCRATCH(scratch);
2473 
2474                 if (!scratch)
2475                         goto put_mpol;
2476                 /* contextualize the tmpfs mount point mempolicy */
2477                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2478                 if (IS_ERR(new))
2479                         goto free_scratch; /* no valid nodemask intersection */
2480 
2481                 task_lock(current);
2482                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2483                 task_unlock(current);
2484                 if (ret)
2485                         goto put_new;
2486 
2487                 /* Create pseudo-vma that contains just the policy */
2488                 memset(&pvma, 0, sizeof(struct vm_area_struct));
2489                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2490                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2491 
2492 put_new:
2493                 mpol_put(new);                  /* drop initial ref */
2494 free_scratch:
2495                 NODEMASK_SCRATCH_FREE(scratch);
2496 put_mpol:
2497                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2498         }
2499 }
2500 
2501 int mpol_set_shared_policy(struct shared_policy *info,
2502                         struct vm_area_struct *vma, struct mempolicy *npol)
2503 {
2504         int err;
2505         struct sp_node *new = NULL;
2506         unsigned long sz = vma_pages(vma);
2507 
2508         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2509                  vma->vm_pgoff,
2510                  sz, npol ? npol->mode : -1,
2511                  npol ? npol->flags : -1,
2512                  npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2513 
2514         if (npol) {
2515                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2516                 if (!new)
2517                         return -ENOMEM;
2518         }
2519         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2520         if (err && new)
2521                 sp_free(new);
2522         return err;
2523 }
2524 
2525 /* Free a backing policy store on inode delete. */
2526 void mpol_free_shared_policy(struct shared_policy *p)
2527 {
2528         struct sp_node *n;
2529         struct rb_node *next;
2530 
2531         if (!p->root.rb_node)
2532                 return;
2533         spin_lock(&p->lock);
2534         next = rb_first(&p->root);
2535         while (next) {
2536                 n = rb_entry(next, struct sp_node, nd);
2537                 next = rb_next(&n->nd);
2538                 sp_delete(p, n);
2539         }
2540         spin_unlock(&p->lock);
2541 }
2542 
2543 #ifdef CONFIG_NUMA_BALANCING
2544 static int __initdata numabalancing_override;
2545 
2546 static void __init check_numabalancing_enable(void)
2547 {
2548         bool numabalancing_default = false;
2549 
2550         if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2551                 numabalancing_default = true;
2552 
2553         /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2554         if (numabalancing_override)
2555                 set_numabalancing_state(numabalancing_override == 1);
2556 
2557         if (num_online_nodes() > 1 && !numabalancing_override) {
2558                 pr_info("%s automatic NUMA balancing. "
2559                         "Configure with numa_balancing= or the "
2560                         "kernel.numa_balancing sysctl\n",
2561                         numabalancing_default ? "Enabling" : "Disabling");
2562                 set_numabalancing_state(numabalancing_default);
2563         }
2564 }
2565 
2566 static int __init setup_numabalancing(char *str)
2567 {
2568         int ret = 0;
2569         if (!str)
2570                 goto out;
2571 
2572         if (!strcmp(str, "enable")) {
2573                 numabalancing_override = 1;
2574                 ret = 1;
2575         } else if (!strcmp(str, "disable")) {
2576                 numabalancing_override = -1;
2577                 ret = 1;
2578         }
2579 out:
2580         if (!ret)
2581                 pr_warn("Unable to parse numa_balancing=\n");
2582 
2583         return ret;
2584 }
2585 __setup("numa_balancing=", setup_numabalancing);
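/*
 * Editorial note: the two knobs referenced by the message above, with
 * illustrative values.  On the kernel command line:
 *
 *	numa_balancing=disable
 *
 * or at run time through the sysctl:
 *
 *	sysctl kernel.numa_balancing=1
 */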
2586 #else
2587 static inline void __init check_numabalancing_enable(void)
2588 {
2589 }
2590 #endif /* CONFIG_NUMA_BALANCING */
2591 
2592 /* assumes fs == KERNEL_DS */
2593 void __init numa_policy_init(void)
2594 {
2595         nodemask_t interleave_nodes;
2596         unsigned long largest = 0;
2597         int nid, prefer = 0;
2598 
2599         policy_cache = kmem_cache_create("numa_policy",
2600                                          sizeof(struct mempolicy),
2601                                          0, SLAB_PANIC, NULL);
2602 
2603         sn_cache = kmem_cache_create("shared_policy_node",
2604                                      sizeof(struct sp_node),
2605                                      0, SLAB_PANIC, NULL);
2606 
2607         for_each_node(nid) {
2608                 preferred_node_policy[nid] = (struct mempolicy) {
2609                         .refcnt = ATOMIC_INIT(1),
2610                         .mode = MPOL_PREFERRED,
2611                         .flags = MPOL_F_MOF | MPOL_F_MORON,
2612                         .v = { .preferred_node = nid, },
2613                 };
2614         }
2615 
2616         /*
2617          * Set interleaving policy for system init. Interleaving is only
2618          * enabled across suitably sized nodes (>= 16MB of present memory),
2619          * falling back to the largest node if they're all smaller.
2620          */
2621         nodes_clear(interleave_nodes);
2622         for_each_node_state(nid, N_MEMORY) {
2623                 unsigned long total_pages = node_present_pages(nid);
2624 
2625                 /* Preserve the largest node */
2626                 if (largest < total_pages) {
2627                         largest = total_pages;
2628                         prefer = nid;
2629                 }
2630 
2631                 /* Interleave this node? */
2632                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2633                         node_set(nid, interleave_nodes);
2634         }
2635 
2636         /* All too small, use the largest */
2637         if (unlikely(nodes_empty(interleave_nodes)))
2638                 node_set(prefer, interleave_nodes);
2639 
2640         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2641                 pr_err("%s: interleaving failed\n", __func__);
2642 
2643         check_numabalancing_enable();
2644 }
2645 
2646 /* Reset policy of current process to default */
2647 void numa_default_policy(void)
2648 {
2649         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2650 }
2651 
2652 /*
2653  * Parse and format mempolicy from/to strings
2654  */
2655 
2656 /*
2657  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2658  */
2659 static const char * const policy_modes[] =
2660 {
2661         [MPOL_DEFAULT]    = "default",
2662         [MPOL_PREFERRED]  = "prefer",
2663         [MPOL_BIND]       = "bind",
2664         [MPOL_INTERLEAVE] = "interleave",
2665         [MPOL_LOCAL]      = "local",
2666 };
2667 
2668 
2669 #ifdef CONFIG_TMPFS
2670 /**
2671  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2672  * @str:  string containing mempolicy to parse
2673  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2674  *
2675  * Format of input:
2676  *      <mode>[=<flags>][:<nodelist>]
2677  *
2678  * On success, returns 0, else 1
2679  */
2680 int mpol_parse_str(char *str, struct mempolicy **mpol)
2681 {
2682         struct mempolicy *new = NULL;
2683         unsigned short mode;
2684         unsigned short mode_flags;
2685         nodemask_t nodes;
2686         char *nodelist = strchr(str, ':');
2687         char *flags = strchr(str, '=');
2688         int err = 1;
2689 
2690         if (flags)
2691                 *flags++ = '\0';        /* terminate mode string */
2692 
2693         if (nodelist) {
2694                 /* NUL-terminate mode or flags string */
2695                 *nodelist++ = '\0';
2696                 if (nodelist_parse(nodelist, nodes))
2697                         goto out;
2698                 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2699                         goto out;
2700         } else
2701                 nodes_clear(nodes);
2702 
2703         for (mode = 0; mode < MPOL_MAX; mode++) {
2704                 if (!strcmp(str, policy_modes[mode])) {
2705                         break;
2706                 }
2707         }
2708         if (mode >= MPOL_MAX)
2709                 goto out;
2710 
2711         switch (mode) {
2712         case MPOL_PREFERRED:
2713                 /*
2714                  * Insist on a nodelist of one node only, although later
2715                  * we use first_node(nodes) to grab a single node, so here
2716                  * nodelist (or nodes) cannot be empty.
2717                  */
2718                 if (nodelist) {
2719                         char *rest = nodelist;
2720                         while (isdigit(*rest))
2721                                 rest++;
2722                         if (*rest)
2723                                 goto out;
2724                         if (nodes_empty(nodes))
2725                                 goto out;
2726                 }
2727                 break;
2728         case MPOL_INTERLEAVE:
2729                 /*
2730                  * Default to online nodes with memory if no nodelist
2731                  */
2732                 if (!nodelist)
2733                         nodes = node_states[N_MEMORY];
2734                 break;
2735         case MPOL_LOCAL:
2736                 /*
2737                  * Don't allow a nodelist;  mpol_new() checks flags
2738                  */
2739                 if (nodelist)
2740                         goto out;
2741                 mode = MPOL_PREFERRED;
2742                 break;
2743         case MPOL_DEFAULT:
2744                 /*
2745                  * Insist on an empty nodelist
2746                  */
2747                 if (!nodelist)
2748                         err = 0;
2749                 goto out;
2750         case MPOL_BIND:
2751                 /*
2752                  * Insist on a nodelist
2753                  */
2754                 if (!nodelist)
2755                         goto out;
2756         }
2757 
2758         mode_flags = 0;
2759         if (flags) {
2760                 /*
2761                  * Currently, we only support two mutually exclusive
2762                  * mode flags.
2763                  */
2764                 if (!strcmp(flags, "static"))
2765                         mode_flags |= MPOL_F_STATIC_NODES;
2766                 else if (!strcmp(flags, "relative"))
2767                         mode_flags |= MPOL_F_RELATIVE_NODES;
2768                 else
2769                         goto out;
2770         }
2771 
2772         new = mpol_new(mode, mode_flags, &nodes);
2773         if (IS_ERR(new))
2774                 goto out;
2775 
2776         /*
2777          * Save nodes for mpol_to_str() to show the tmpfs mount options
2778          * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2779          */
2780         if (mode != MPOL_PREFERRED)
2781                 new->v.nodes = nodes;
2782         else if (nodelist)
2783                 new->v.preferred_node = first_node(nodes);
2784         else
2785                 new->flags |= MPOL_F_LOCAL;
2786 
2787         /*
2788          * Save nodes for contextualization: this will be used to "clone"
2789          * the mempolicy in a specific context [cpuset] at a later time.
2790          */
2791         new->w.user_nodemask = nodes;
2792 
2793         err = 0;
2794 
2795 out:
2796         /* Restore string for error message */
2797         if (nodelist)
2798                 *--nodelist = ':';
2799         if (flags)
2800                 *--flags = '=';
2801         if (!err)
2802                 *mpol = new;
2803         return err;
2804 }
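/*
 * Editorial note: example strings accepted by the parser above, as passed
 * through tmpfs's "mpol=" mount option (node numbers are illustrative):
 *
 *	default
 *	local
 *	prefer:1
 *	bind=static:0,2
 *	interleave:0-3
 *
 * i.e. <mode>[=<flags>][:<nodelist>], where <flags> is "static" or
 * "relative" and <nodelist> is a cpuset-style list such as "0-3" or "0,2".
 */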
2805 #endif /* CONFIG_TMPFS */
2806 
2807 /**
2808  * mpol_to_str - format a mempolicy structure for printing
2809  * @buffer:  to contain formatted mempolicy string
2810  * @maxlen:  length of @buffer
2811  * @pol:  pointer to mempolicy to be formatted
2812  *
2813  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2814  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2815  * longest flag, "relative", and to display at least a few node ids.
2816  */
2817 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2818 {
2819         char *p = buffer;
2820         nodemask_t nodes = NODE_MASK_NONE;
2821         unsigned short mode = MPOL_DEFAULT;
2822         unsigned short flags = 0;
2823 
2824         if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2825                 mode = pol->mode;
2826                 flags = pol->flags;
2827         }
2828 
2829         switch (mode) {
2830         case MPOL_DEFAULT:
2831                 break;
2832         case MPOL_PREFERRED:
2833                 if (flags & MPOL_F_LOCAL)
2834                         mode = MPOL_LOCAL;
2835                 else
2836                         node_set(pol->v.preferred_node, nodes);
2837                 break;
2838         case MPOL_BIND:
2839         case MPOL_INTERLEAVE:
2840                 nodes = pol->v.nodes;
2841                 break;
2842         default:
2843                 WARN_ON_ONCE(1);
2844                 snprintf(p, maxlen, "unknown");
2845                 return;
2846         }
2847 
2848         p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2849 
2850         if (flags & MPOL_MODE_FLAGS) {
2851                 p += snprintf(p, buffer + maxlen - p, "=");
2852 
2853                 /*
2854                  * Currently, the only defined flags are mutually exclusive
2855                  */
2856                 if (flags & MPOL_F_STATIC_NODES)
2857                         p += snprintf(p, buffer + maxlen - p, "static");
2858                 else if (flags & MPOL_F_RELATIVE_NODES)
2859                         p += snprintf(p, buffer + maxlen - p, "relative");
2860         }
2861 
2862         if (!nodes_empty(nodes)) {
2863                 p += snprintf(p, buffer + maxlen - p, ":");
2864                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2865         }
2866 }
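/*
 * Editorial note: example outputs of the formatter above, matching the
 * input format of mpol_parse_str(): "default", "local", "prefer:1",
 * "bind:0,2", "interleave=relative:0-3".  A policy carrying MPOL_F_MORON
 * (automatic NUMA balancing) is deliberately reported as "default" by the
 * check at the top of the function.
 */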
2867 
