TOMOYO Linux Cross Reference
Linux/mm/page_alloc.c

  1 // SPDX-License-Identifier: GPL-2.0-only
  2 /*
  3  *  linux/mm/page_alloc.c
  4  *
  5  *  Manages the free list; the system allocates free pages here.
  6  *  Note that kmalloc() lives in slab.c
  7  *
  8  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
  9  *  Swap reorganised 29.12.95, Stephen Tweedie
 10  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 11  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 12  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 13  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 14  *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 15  *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 16  */
 17 
 18 #include <linux/stddef.h>
 19 #include <linux/mm.h>
 20 #include <linux/highmem.h>
 21 #include <linux/swap.h>
 22 #include <linux/interrupt.h>
 23 #include <linux/pagemap.h>
 24 #include <linux/jiffies.h>
 25 #include <linux/memblock.h>
 26 #include <linux/compiler.h>
 27 #include <linux/kernel.h>
 28 #include <linux/kasan.h>
 29 #include <linux/module.h>
 30 #include <linux/suspend.h>
 31 #include <linux/pagevec.h>
 32 #include <linux/blkdev.h>
 33 #include <linux/slab.h>
 34 #include <linux/ratelimit.h>
 35 #include <linux/oom.h>
 36 #include <linux/topology.h>
 37 #include <linux/sysctl.h>
 38 #include <linux/cpu.h>
 39 #include <linux/cpuset.h>
 40 #include <linux/memory_hotplug.h>
 41 #include <linux/nodemask.h>
 42 #include <linux/vmalloc.h>
 43 #include <linux/vmstat.h>
 44 #include <linux/mempolicy.h>
 45 #include <linux/memremap.h>
 46 #include <linux/stop_machine.h>
 47 #include <linux/random.h>
 48 #include <linux/sort.h>
 49 #include <linux/pfn.h>
 50 #include <linux/backing-dev.h>
 51 #include <linux/fault-inject.h>
 52 #include <linux/page-isolation.h>
 53 #include <linux/page_ext.h>
 54 #include <linux/debugobjects.h>
 55 #include <linux/kmemleak.h>
 56 #include <linux/compaction.h>
 57 #include <trace/events/kmem.h>
 58 #include <trace/events/oom.h>
 59 #include <linux/prefetch.h>
 60 #include <linux/mm_inline.h>
 61 #include <linux/migrate.h>
 62 #include <linux/hugetlb.h>
 63 #include <linux/sched/rt.h>
 64 #include <linux/sched/mm.h>
 65 #include <linux/page_owner.h>
 66 #include <linux/kthread.h>
 67 #include <linux/memcontrol.h>
 68 #include <linux/ftrace.h>
 69 #include <linux/lockdep.h>
 70 #include <linux/nmi.h>
 71 #include <linux/psi.h>
 72 
 73 #include <asm/sections.h>
 74 #include <asm/tlbflush.h>
 75 #include <asm/div64.h>
 76 #include "internal.h"
 77 #include "shuffle.h"
 78 
 79 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 80 static DEFINE_MUTEX(pcp_batch_high_lock);
 81 #define MIN_PERCPU_PAGELIST_FRACTION    (8)
 82 
 83 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 84 DEFINE_PER_CPU(int, numa_node);
 85 EXPORT_PER_CPU_SYMBOL(numa_node);
 86 #endif
 87 
 88 DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
 89 
 90 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
 91 /*
 92  * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 93  * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 94  * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 95  * defined in <linux/topology.h>.
 96  */
 97 DEFINE_PER_CPU(int, _numa_mem_);                /* Kernel "local memory" node */
 98 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
 99 int _node_numa_mem_[MAX_NUMNODES];
100 #endif
101 
102 /* work_structs for global per-cpu drains */
103 struct pcpu_drain {
104         struct zone *zone;
105         struct work_struct work;
106 };
107 DEFINE_MUTEX(pcpu_drain_mutex);
108 DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
109 
110 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
111 volatile unsigned long latent_entropy __latent_entropy;
112 EXPORT_SYMBOL(latent_entropy);
113 #endif
114 
115 /*
116  * Array of node states.
117  */
118 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
119         [N_POSSIBLE] = NODE_MASK_ALL,
120         [N_ONLINE] = { { [0] = 1UL } },
121 #ifndef CONFIG_NUMA
122         [N_NORMAL_MEMORY] = { { [0] = 1UL } },
123 #ifdef CONFIG_HIGHMEM
124         [N_HIGH_MEMORY] = { { [0] = 1UL } },
125 #endif
126         [N_MEMORY] = { { [0] = 1UL } },
127         [N_CPU] = { { [0] = 1UL } },
128 #endif  /* NUMA */
129 };
130 EXPORT_SYMBOL(node_states);
131 
132 atomic_long_t _totalram_pages __read_mostly;
133 EXPORT_SYMBOL(_totalram_pages);
134 unsigned long totalreserve_pages __read_mostly;
135 unsigned long totalcma_pages __read_mostly;
136 
137 int percpu_pagelist_fraction;
138 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
139 
140 /*
141  * A cached value of the page's pageblock's migratetype, used when the page is
142  * put on a pcplist. Used to avoid the pageblock migratetype lookup when
143  * freeing from pcplists in most cases, at the cost of possibly becoming stale.
144  * Also the migratetype set in the page does not necessarily match the pcplist
145  * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
146  * other index - this ensures that it will be put on the correct CMA freelist.
147  */
148 static inline int get_pcppage_migratetype(struct page *page)
149 {
150         return page->index;
151 }
152 
153 static inline void set_pcppage_migratetype(struct page *page, int migratetype)
154 {
155         page->index = migratetype;
156 }
157 
158 #ifdef CONFIG_PM_SLEEP
159 /*
160  * The following functions are used by the suspend/hibernate code to temporarily
161  * change gfp_allowed_mask in order to avoid using I/O during memory allocations
162  * while devices are suspended.  To avoid races with the suspend/hibernate code,
163  * they should always be called with system_transition_mutex held
164  * (gfp_allowed_mask also should only be modified with system_transition_mutex
165  * held, unless the suspend/hibernate code is guaranteed not to run in parallel
166  * with that modification).
167  */
168 
169 static gfp_t saved_gfp_mask;
170 
171 void pm_restore_gfp_mask(void)
172 {
173         WARN_ON(!mutex_is_locked(&system_transition_mutex));
174         if (saved_gfp_mask) {
175                 gfp_allowed_mask = saved_gfp_mask;
176                 saved_gfp_mask = 0;
177         }
178 }
179 
180 void pm_restrict_gfp_mask(void)
181 {
182         WARN_ON(!mutex_is_locked(&system_transition_mutex));
183         WARN_ON(saved_gfp_mask);
184         saved_gfp_mask = gfp_allowed_mask;
185         gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
186 }
187 
188 bool pm_suspended_storage(void)
189 {
190         if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
191                 return false;
192         return true;
193 }
194 #endif /* CONFIG_PM_SLEEP */
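    /*
     * Illustrative sketch of the expected caller (the suspend/hibernate core,
     * not this file; helpers outside this file are assumptions):
     *
     *         lock_system_sleep();            // takes system_transition_mutex
     *         pm_restrict_gfp_mask();         // drop __GFP_IO | __GFP_FS
     *         ...                             // allocate while devices are suspended
     *         pm_restore_gfp_mask();
     *         unlock_system_sleep();
     *
     * so that allocations done while storage is unavailable cannot start I/O.
     */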
195 
196 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
197 unsigned int pageblock_order __read_mostly;
198 #endif
199 
200 static void __free_pages_ok(struct page *page, unsigned int order);
201 
202 /*
203  * results with 256, 32 in the lowmem_reserve sysctl:
204  *      1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
205  *      1G machine -> (16M dma, 784M normal, 224M high)
206  *      NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
207  *      HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
208  *      HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
209  *
210  * TBD: should special case ZONE_DMA32 machines here - in those we normally
211  * don't need any ZONE_NORMAL reservation
212  */
213 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
214 #ifdef CONFIG_ZONE_DMA
215         [ZONE_DMA] = 256,
216 #endif
217 #ifdef CONFIG_ZONE_DMA32
218         [ZONE_DMA32] = 256,
219 #endif
220         [ZONE_NORMAL] = 32,
221 #ifdef CONFIG_HIGHMEM
222         [ZONE_HIGHMEM] = 0,
223 #endif
224         [ZONE_MOVABLE] = 0,
225 };
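    /*
     * Worked example for the ratios above (rough sketch, using the 1G split
     * from the comment: 16M DMA, 784M normal, 224M highmem): with
     * [ZONE_DMA] = 256, a HIGHMEM allocation leaves about
     * (784M + 224M) / 256 ~= 4M of ZONE_DMA protected from it, while a NORMAL
     * allocation leaves 784M / 256 ~= 3M.  The ratios can be inspected and
     * tuned at runtime through /proc/sys/vm/lowmem_reserve_ratio (one value
     * per zone, in the order of the array above).
     */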
226 
227 EXPORT_SYMBOL(totalram_pages);
228 
229 static char * const zone_names[MAX_NR_ZONES] = {
230 #ifdef CONFIG_ZONE_DMA
231          "DMA",
232 #endif
233 #ifdef CONFIG_ZONE_DMA32
234          "DMA32",
235 #endif
236          "Normal",
237 #ifdef CONFIG_HIGHMEM
238          "HighMem",
239 #endif
240          "Movable",
241 #ifdef CONFIG_ZONE_DEVICE
242          "Device",
243 #endif
244 };
245 
246 const char * const migratetype_names[MIGRATE_TYPES] = {
247         "Unmovable",
248         "Movable",
249         "Reclaimable",
250         "HighAtomic",
251 #ifdef CONFIG_CMA
252         "CMA",
253 #endif
254 #ifdef CONFIG_MEMORY_ISOLATION
255         "Isolate",
256 #endif
257 };
258 
259 compound_page_dtor * const compound_page_dtors[] = {
260         NULL,
261         free_compound_page,
262 #ifdef CONFIG_HUGETLB_PAGE
263         free_huge_page,
264 #endif
265 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
266         free_transhuge_page,
267 #endif
268 };
269 
270 int min_free_kbytes = 1024;
271 int user_min_free_kbytes = -1;
272 #ifdef CONFIG_DISCONTIGMEM
273 /*
274  * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
275  * are not on separate NUMA nodes. Functionally this works but with
276  * watermark_boost_factor, it can reclaim prematurely as the ranges can be
277  * quite small. By default, do not boost watermarks on discontigmem as in
278  * many cases very high-order allocations like THP are likely to be
279  * unsupported and the premature reclaim offsets the advantage of long-term
280  * fragmentation avoidance.
281  */
282 int watermark_boost_factor __read_mostly;
283 #else
284 int watermark_boost_factor __read_mostly = 15000;
285 #endif
286 int watermark_scale_factor = 10;
287 
288 static unsigned long nr_kernel_pages __initdata;
289 static unsigned long nr_all_pages __initdata;
290 static unsigned long dma_reserve __initdata;
291 
292 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
293 static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
294 static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
295 static unsigned long required_kernelcore __initdata;
296 static unsigned long required_kernelcore_percent __initdata;
297 static unsigned long required_movablecore __initdata;
298 static unsigned long required_movablecore_percent __initdata;
299 static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
300 static bool mirrored_kernelcore __meminitdata;
301 
302 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
303 int movable_zone;
304 EXPORT_SYMBOL(movable_zone);
305 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
306 
307 #if MAX_NUMNODES > 1
308 unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
309 unsigned int nr_online_nodes __read_mostly = 1;
310 EXPORT_SYMBOL(nr_node_ids);
311 EXPORT_SYMBOL(nr_online_nodes);
312 #endif
313 
314 int page_group_by_mobility_disabled __read_mostly;
315 
316 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
317 /*
318  * During boot we initialize deferred pages on-demand, as needed, but once
319  * page_alloc_init_late() has finished, the deferred pages are all initialized,
320  * and we can permanently disable that path.
321  */
322 static DEFINE_STATIC_KEY_TRUE(deferred_pages);
323 
324 /*
325  * Call kasan_free_pages() only after deferred memory initialization
326  * has completed. Poisoning pages during deferred memory init will greatly
327  * lengthen the process and cause problems in large memory systems, as the
328  * deferred page initialization is done with interrupts disabled.
329  *
330  * Assuming that there will be no reference to those newly initialized
331  * pages before they are ever allocated, this should have no effect on
332  * KASAN memory tracking as the poison will be properly inserted at page
333  * allocation time. The only corner case is when pages are allocated by
334  * on-demand allocation and then freed again before the deferred pages
335  * initialization is done, but this is not likely to happen.
336  */
337 static inline void kasan_free_nondeferred_pages(struct page *page, int order)
338 {
339         if (!static_branch_unlikely(&deferred_pages))
340                 kasan_free_pages(page, order);
341 }
342 
343 /* Returns true if the struct page for the pfn is uninitialised */
344 static inline bool __meminit early_page_uninitialised(unsigned long pfn)
345 {
346         int nid = early_pfn_to_nid(pfn);
347 
348         if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
349                 return true;
350 
351         return false;
352 }
353 
354 /*
355  * Returns true when the remaining initialisation should be deferred until
356  * later in the boot cycle when it can be parallelised.
357  */
358 static bool __meminit
359 defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
360 {
361         static unsigned long prev_end_pfn, nr_initialised;
362 
363         /*
364          * prev_end_pfn is a static that contains the end of the previous zone.
365          * No need to protect because called very early in boot before smp_init.
366          */
367         if (prev_end_pfn != end_pfn) {
368                 prev_end_pfn = end_pfn;
369                 nr_initialised = 0;
370         }
371 
372         /* Always populate low zones for address-constrained allocations */
373         if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
374                 return false;
375 
376         /*
377          * We start with only one section of pages; more pages are added as
378          * needed until the rest of the deferred pages are initialized.
379          */
380         nr_initialised++;
381         if ((nr_initialised > PAGES_PER_SECTION) &&
382             (pfn & (PAGES_PER_SECTION - 1)) == 0) {
383                 NODE_DATA(nid)->first_deferred_pfn = pfn;
384                 return true;
385         }
386         return false;
387 }
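    /*
     * Rough sketch of the effect (assuming x86_64 defaults, where
     * PAGES_PER_SECTION is 32768, i.e. 128M of 4K pages): low zones are always
     * initialised in full, but in the zone that reaches the end of the node
     * only about one section's worth of struct pages is initialised here; the
     * next section-aligned pfn is recorded in first_deferred_pfn and
     * everything above it is left for deferred_init_memmap() to handle in
     * parallel later in boot.
     */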
388 #else
389 #define kasan_free_nondeferred_pages(p, o)      kasan_free_pages(p, o)
390 
391 static inline bool early_page_uninitialised(unsigned long pfn)
392 {
393         return false;
394 }
395 
396 static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
397 {
398         return false;
399 }
400 #endif
401 
402 /* Return a pointer to the bitmap storing bits affecting a block of pages */
403 static inline unsigned long *get_pageblock_bitmap(struct page *page,
404                                                         unsigned long pfn)
405 {
406 #ifdef CONFIG_SPARSEMEM
407         return __pfn_to_section(pfn)->pageblock_flags;
408 #else
409         return page_zone(page)->pageblock_flags;
410 #endif /* CONFIG_SPARSEMEM */
411 }
412 
413 static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
414 {
415 #ifdef CONFIG_SPARSEMEM
416         pfn &= (PAGES_PER_SECTION-1);
417         return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
418 #else
419         pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
420         return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
421 #endif /* CONFIG_SPARSEMEM */
422 }
423 
424 /**
425  * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
426  * @page: The page within the block of interest
427  * @pfn: The target page frame number
428  * @end_bitidx: The last bit of interest to retrieve
429  * @mask: mask of bits that the caller is interested in
430  *
431  * Return: pageblock_bits flags
432  */
433 static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
434                                         unsigned long pfn,
435                                         unsigned long end_bitidx,
436                                         unsigned long mask)
437 {
438         unsigned long *bitmap;
439         unsigned long bitidx, word_bitidx;
440         unsigned long word;
441 
442         bitmap = get_pageblock_bitmap(page, pfn);
443         bitidx = pfn_to_bitidx(page, pfn);
444         word_bitidx = bitidx / BITS_PER_LONG;
445         bitidx &= (BITS_PER_LONG-1);
446 
447         word = bitmap[word_bitidx];
448         bitidx += end_bitidx;
449         return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
450 }
451 
452 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
453                                         unsigned long end_bitidx,
454                                         unsigned long mask)
455 {
456         return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
457 }
458 
459 static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
460 {
461         return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
462 }
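    /*
     * Worked example for the helpers above (a sketch; exact values depend on
     * the configuration): each pageblock owns NR_PAGEBLOCK_BITS (4) bits in
     * the bitmap - three migratetype bits plus the compaction skip bit - so
     * with 64-bit longs 16 pageblocks share one bitmap word.  For a pfn whose
     * pageblock index within its section is 19, pfn_to_bitidx() returns
     * 19 * 4 = 76, hence word_bitidx = 76 / 64 = 1 and the block's bits sit at
     * bit index 76 % 64 = 12 of bitmap[1], from where
     * __get_pfnblock_flags_mask() shifts and masks them out.
     */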
463 
464 /**
465  * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
466  * @page: The page within the block of interest
467  * @flags: The flags to set
468  * @pfn: The target page frame number
469  * @end_bitidx: The last bit of interest
470  * @mask: mask of bits that the caller is interested in
471  */
472 void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
473                                         unsigned long pfn,
474                                         unsigned long end_bitidx,
475                                         unsigned long mask)
476 {
477         unsigned long *bitmap;
478         unsigned long bitidx, word_bitidx;
479         unsigned long old_word, word;
480 
481         BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
482         BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
483 
484         bitmap = get_pageblock_bitmap(page, pfn);
485         bitidx = pfn_to_bitidx(page, pfn);
486         word_bitidx = bitidx / BITS_PER_LONG;
487         bitidx &= (BITS_PER_LONG-1);
488 
489         VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
490 
491         bitidx += end_bitidx;
492         mask <<= (BITS_PER_LONG - bitidx - 1);
493         flags <<= (BITS_PER_LONG - bitidx - 1);
494 
495         word = READ_ONCE(bitmap[word_bitidx]);
496         for (;;) {
497                 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
498                 if (word == old_word)
499                         break;
500                 word = old_word;
501         }
502 }
503 
504 void set_pageblock_migratetype(struct page *page, int migratetype)
505 {
506         if (unlikely(page_group_by_mobility_disabled &&
507                      migratetype < MIGRATE_PCPTYPES))
508                 migratetype = MIGRATE_UNMOVABLE;
509 
510         set_pageblock_flags_group(page, (unsigned long)migratetype,
511                                         PB_migrate, PB_migrate_end);
512 }
513 
514 #ifdef CONFIG_DEBUG_VM
515 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
516 {
517         int ret = 0;
518         unsigned seq;
519         unsigned long pfn = page_to_pfn(page);
520         unsigned long sp, start_pfn;
521 
522         do {
523                 seq = zone_span_seqbegin(zone);
524                 start_pfn = zone->zone_start_pfn;
525                 sp = zone->spanned_pages;
526                 if (!zone_spans_pfn(zone, pfn))
527                         ret = 1;
528         } while (zone_span_seqretry(zone, seq));
529 
530         if (ret)
531                 pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
532                         pfn, zone_to_nid(zone), zone->name,
533                         start_pfn, start_pfn + sp);
534 
535         return ret;
536 }
537 
538 static int page_is_consistent(struct zone *zone, struct page *page)
539 {
540         if (!pfn_valid_within(page_to_pfn(page)))
541                 return 0;
542         if (zone != page_zone(page))
543                 return 0;
544 
545         return 1;
546 }
547 /*
548  * Temporary debugging check for pages not lying within a given zone.
549  */
550 static int __maybe_unused bad_range(struct zone *zone, struct page *page)
551 {
552         if (page_outside_zone_boundaries(zone, page))
553                 return 1;
554         if (!page_is_consistent(zone, page))
555                 return 1;
556 
557         return 0;
558 }
559 #else
560 static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
561 {
562         return 0;
563 }
564 #endif
565 
566 static void bad_page(struct page *page, const char *reason,
567                 unsigned long bad_flags)
568 {
569         static unsigned long resume;
570         static unsigned long nr_shown;
571         static unsigned long nr_unshown;
572 
573         /*
574          * Allow a burst of 60 reports, then keep quiet for that minute;
575          * or allow a steady drip of one report per second.
576          */
577         if (nr_shown == 60) {
578                 if (time_before(jiffies, resume)) {
579                         nr_unshown++;
580                         goto out;
581                 }
582                 if (nr_unshown) {
583                         pr_alert(
584                               "BUG: Bad page state: %lu messages suppressed\n",
585                                 nr_unshown);
586                         nr_unshown = 0;
587                 }
588                 nr_shown = 0;
589         }
590         if (nr_shown++ == 0)
591                 resume = jiffies + 60 * HZ;
592 
593         pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
594                 current->comm, page_to_pfn(page));
595         __dump_page(page, reason);
596         bad_flags &= page->flags;
597         if (bad_flags)
598                 pr_alert("bad because of flags: %#lx(%pGp)\n",
599                                                 bad_flags, &bad_flags);
600         dump_page_owner(page);
601 
602         print_modules();
603         dump_stack();
604 out:
605         /* Leave bad fields for debug, except PageBuddy could make trouble */
606         page_mapcount_reset(page); /* remove PageBuddy */
607         add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
608 }
609 
610 /*
611  * Higher-order pages are called "compound pages".  They are structured thusly:
612  *
613  *  The first PAGE_SIZE page is called the "head page" and has PG_head set.
614  *
615  * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
616  * in bit 0 of page->compound_head. The rest of the bits form a pointer to the head page.
617  *
618  * The first tail page's ->compound_dtor holds the offset into the array of compound
619  * page destructors. See compound_page_dtors.
620  *
621  * The first tail page's ->compound_order holds the order of allocation.
622  * This usage means that zero-order pages may not be compound.
623  */
624 
625 void free_compound_page(struct page *page)
626 {
627         __free_pages_ok(page, compound_order(page));
628 }
629 
630 void prep_compound_page(struct page *page, unsigned int order)
631 {
632         int i;
633         int nr_pages = 1 << order;
634 
635         set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
636         set_compound_order(page, order);
637         __SetPageHead(page);
638         for (i = 1; i < nr_pages; i++) {
639                 struct page *p = page + i;
640                 set_page_count(p, 0);
641                 p->mapping = TAIL_MAPPING;
642                 set_compound_head(p, page);
643         }
644         atomic_set(compound_mapcount_ptr(page), -1);
645 }
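    /*
     * Illustrative sketch of the tail-page encoding set up above: for a tail
     * page p of head page h, set_compound_head(p, h) stores
     * (unsigned long)h + 1 in p->compound_head, so
     *
     *         PageTail(p)       <=>  p->compound_head & 1
     *         compound_head(p)  ==   (struct page *)(p->compound_head - 1) == h
     *
     * while the head page keeps bit 0 clear and compound_head(h) == h.
     */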
646 
647 #ifdef CONFIG_DEBUG_PAGEALLOC
648 unsigned int _debug_guardpage_minorder;
649 bool _debug_pagealloc_enabled __read_mostly
650                         = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
651 EXPORT_SYMBOL(_debug_pagealloc_enabled);
652 bool _debug_guardpage_enabled __read_mostly;
653 
654 static int __init early_debug_pagealloc(char *buf)
655 {
656         if (!buf)
657                 return -EINVAL;
658         return kstrtobool(buf, &_debug_pagealloc_enabled);
659 }
660 early_param("debug_pagealloc", early_debug_pagealloc);
661 
662 static bool need_debug_guardpage(void)
663 {
664         /* If we don't use debug_pagealloc, we don't need guard page */
665         if (!debug_pagealloc_enabled())
666                 return false;
667 
668         if (!debug_guardpage_minorder())
669                 return false;
670 
671         return true;
672 }
673 
674 static void init_debug_guardpage(void)
675 {
676         if (!debug_pagealloc_enabled())
677                 return;
678 
679         if (!debug_guardpage_minorder())
680                 return;
681 
682         _debug_guardpage_enabled = true;
683 }
684 
685 struct page_ext_operations debug_guardpage_ops = {
686         .need = need_debug_guardpage,
687         .init = init_debug_guardpage,
688 };
689 
690 static int __init debug_guardpage_minorder_setup(char *buf)
691 {
692         unsigned long res;
693 
694         if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
695                 pr_err("Bad debug_guardpage_minorder value\n");
696                 return 0;
697         }
698         _debug_guardpage_minorder = res;
699         pr_info("Setting debug_guardpage_minorder to %lu\n", res);
700         return 0;
701 }
702 early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
703 
704 static inline bool set_page_guard(struct zone *zone, struct page *page,
705                                 unsigned int order, int migratetype)
706 {
707         struct page_ext *page_ext;
708 
709         if (!debug_guardpage_enabled())
710                 return false;
711 
712         if (order >= debug_guardpage_minorder())
713                 return false;
714 
715         page_ext = lookup_page_ext(page);
716         if (unlikely(!page_ext))
717                 return false;
718 
719         __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
720 
721         INIT_LIST_HEAD(&page->lru);
722         set_page_private(page, order);
723         /* Guard pages are not available for any usage */
724         __mod_zone_freepage_state(zone, -(1 << order), migratetype);
725 
726         return true;
727 }
728 
729 static inline void clear_page_guard(struct zone *zone, struct page *page,
730                                 unsigned int order, int migratetype)
731 {
732         struct page_ext *page_ext;
733 
734         if (!debug_guardpage_enabled())
735                 return;
736 
737         page_ext = lookup_page_ext(page);
738         if (unlikely(!page_ext))
739                 return;
740 
741         __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
742 
743         set_page_private(page, 0);
744         if (!is_migrate_isolate(migratetype))
745                 __mod_zone_freepage_state(zone, (1 << order), migratetype);
746 }
747 #else
748 struct page_ext_operations debug_guardpage_ops;
749 static inline bool set_page_guard(struct zone *zone, struct page *page,
750                         unsigned int order, int migratetype) { return false; }
751 static inline void clear_page_guard(struct zone *zone, struct page *page,
752                                 unsigned int order, int migratetype) {}
753 #endif
754 
755 static inline void set_page_order(struct page *page, unsigned int order)
756 {
757         set_page_private(page, order);
758         __SetPageBuddy(page);
759 }
760 
761 /*
762  * This function checks whether a page is free && is the buddy.
763  * We can coalesce a page and its buddy if
764  * (a) the buddy is not in a hole (check before calling!) &&
765  * (b) the buddy is in the buddy system &&
766  * (c) a page and its buddy have the same order &&
767  * (d) a page and its buddy are in the same zone.
768  *
769  * For recording whether a page is in the buddy system, we set PageBuddy.
770  * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
771  *
772  * For recording page's order, we use page_private(page).
773  */
774 static inline int page_is_buddy(struct page *page, struct page *buddy,
775                                                         unsigned int order)
776 {
777         if (page_is_guard(buddy) && page_order(buddy) == order) {
778                 if (page_zone_id(page) != page_zone_id(buddy))
779                         return 0;
780 
781                 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
782 
783                 return 1;
784         }
785 
786         if (PageBuddy(buddy) && page_order(buddy) == order) {
787                 /*
788                  * zone check is done late to avoid uselessly
789                  * calculating zone/node ids for pages that could
790                  * never merge.
791                  */
792                 if (page_zone_id(page) != page_zone_id(buddy))
793                         return 0;
794 
795                 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
796 
797                 return 1;
798         }
799         return 0;
800 }
801 
802 #ifdef CONFIG_COMPACTION
803 static inline struct capture_control *task_capc(struct zone *zone)
804 {
805         struct capture_control *capc = current->capture_control;
806 
807         return capc &&
808                 !(current->flags & PF_KTHREAD) &&
809                 !capc->page &&
810                 capc->cc->zone == zone &&
811                 capc->cc->direct_compaction ? capc : NULL;
812 }
813 
814 static inline bool
815 compaction_capture(struct capture_control *capc, struct page *page,
816                    int order, int migratetype)
817 {
818         if (!capc || order != capc->cc->order)
819                 return false;
820 
821         /* Do not accidentally pollute CMA or isolated regions */
822         if (is_migrate_cma(migratetype) ||
823             is_migrate_isolate(migratetype))
824                 return false;
825 
826         /*
827          * Do not let lower order allocations pollute a movable pageblock.
828          * This might let an unmovable request use a reclaimable pageblock
829          * and vice-versa but no more than normal fallback logic which can
830          * have trouble finding a high-order free page.
831          */
832         if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
833                 return false;
834 
835         capc->page = page;
836         return true;
837 }
838 
839 #else
840 static inline struct capture_control *task_capc(struct zone *zone)
841 {
842         return NULL;
843 }
844 
845 static inline bool
846 compaction_capture(struct capture_control *capc, struct page *page,
847                    int order, int migratetype)
848 {
849         return false;
850 }
851 #endif /* CONFIG_COMPACTION */
852 
853 /*
854  * Freeing function for a buddy system allocator.
855  *
856  * The concept of a buddy system is to maintain a direct-mapped table
857  * (containing bit values) for memory blocks of various "orders".
858  * The bottom level table contains the map for the smallest allocatable
859  * units of memory (here, pages), and each level above it describes
860  * pairs of units from the levels below, hence, "buddies".
861  * At a high level, all that happens here is marking the table entry
862  * at the bottom level available, and propagating the changes upward
863  * as necessary, plus some accounting needed to play nicely with other
864  * parts of the VM system.
865  * At each level, we keep a list of pages, which are heads of continuous
866  * free pages of length (1 << order) and marked with PageBuddy.
867  * A page's order is recorded in the page_private(page) field.
868  * So when we are allocating or freeing one, we can derive the state of the
869  * other.  That is, if we allocate a small block, and both were
870  * free, the remainder of the region must be split into blocks.
871  * If a block is freed, and its buddy is also free, then this
872  * triggers coalescing into a block of larger size.
873  *
874  * -- nyc
875  */
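    /*
     * Worked example of the buddy arithmetic used below (a sketch;
     * __find_buddy_pfn() lives in mm/internal.h): the buddy of a block is
     * found by flipping the order bit of its pfn,
     *
     *         buddy_pfn    = pfn ^ (1 << order);
     *         combined_pfn = buddy_pfn & pfn;      // start of the merged block
     *
     * e.g. freeing the order-2 block at pfn 12 gives buddy_pfn = 12 ^ 4 = 8
     * and combined_pfn = 8, so if the block at pfn 8 is a free order-2 buddy
     * the two merge into the order-3 block starting at pfn 8, and the scan
     * repeats one order higher.
     */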
876 
877 static inline void __free_one_page(struct page *page,
878                 unsigned long pfn,
879                 struct zone *zone, unsigned int order,
880                 int migratetype)
881 {
882         unsigned long combined_pfn;
883         unsigned long uninitialized_var(buddy_pfn);
884         struct page *buddy;
885         unsigned int max_order;
886         struct capture_control *capc = task_capc(zone);
887 
888         max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
889 
890         VM_BUG_ON(!zone_is_initialized(zone));
891         VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
892 
893         VM_BUG_ON(migratetype == -1);
894         if (likely(!is_migrate_isolate(migratetype)))
895                 __mod_zone_freepage_state(zone, 1 << order, migratetype);
896 
897         VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
898         VM_BUG_ON_PAGE(bad_range(zone, page), page);
899 
900 continue_merging:
901         while (order < max_order - 1) {
902                 if (compaction_capture(capc, page, order, migratetype)) {
903                         __mod_zone_freepage_state(zone, -(1 << order),
904                                                                 migratetype);
905                         return;
906                 }
907                 buddy_pfn = __find_buddy_pfn(pfn, order);
908                 buddy = page + (buddy_pfn - pfn);
909 
910                 if (!pfn_valid_within(buddy_pfn))
911                         goto done_merging;
912                 if (!page_is_buddy(page, buddy, order))
913                         goto done_merging;
914                 /*
915                  * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
916                  * merge with it and move up one order.
917                  */
918                 if (page_is_guard(buddy))
919                         clear_page_guard(zone, buddy, order, migratetype);
920                 else
921                         del_page_from_free_area(buddy, &zone->free_area[order]);
922                 combined_pfn = buddy_pfn & pfn;
923                 page = page + (combined_pfn - pfn);
924                 pfn = combined_pfn;
925                 order++;
926         }
927         if (max_order < MAX_ORDER) {
928                 /* If we are here, it means order is >= pageblock_order.
929                  * We want to prevent merge between freepages on isolate
930                  * pageblock and normal pageblock. Without this, pageblock
931                  * isolation could cause incorrect freepage or CMA accounting.
932                  *
933                  * We don't want to hit this code for the more frequent
934                  * low-order merging.
935                  */
936                 if (unlikely(has_isolate_pageblock(zone))) {
937                         int buddy_mt;
938 
939                         buddy_pfn = __find_buddy_pfn(pfn, order);
940                         buddy = page + (buddy_pfn - pfn);
941                         buddy_mt = get_pageblock_migratetype(buddy);
942 
943                         if (migratetype != buddy_mt
944                                         && (is_migrate_isolate(migratetype) ||
945                                                 is_migrate_isolate(buddy_mt)))
946                                 goto done_merging;
947                 }
948                 max_order++;
949                 goto continue_merging;
950         }
951 
952 done_merging:
953         set_page_order(page, order);
954 
955         /*
956          * If this is not the largest possible page, check if the buddy
957          * of the next-highest order is free. If it is, it's possible
958          * that pages are being freed that will coalesce soon. In that
959          * case, add the free page to the tail of the list
960          * so it's less likely to be used soon and more likely to be merged
961          * as a higher order page
962          */
963         if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)
964                         && !is_shuffle_order(order)) {
965                 struct page *higher_page, *higher_buddy;
966                 combined_pfn = buddy_pfn & pfn;
967                 higher_page = page + (combined_pfn - pfn);
968                 buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
969                 higher_buddy = higher_page + (buddy_pfn - combined_pfn);
970                 if (pfn_valid_within(buddy_pfn) &&
971                     page_is_buddy(higher_page, higher_buddy, order + 1)) {
972                         add_to_free_area_tail(page, &zone->free_area[order],
973                                               migratetype);
974                         return;
975                 }
976         }
977 
978         if (is_shuffle_order(order))
979                 add_to_free_area_random(page, &zone->free_area[order],
980                                 migratetype);
981         else
982                 add_to_free_area(page, &zone->free_area[order], migratetype);
983 
984 }
985 
986 /*
987  * A bad page could be due to a number of fields. Instead of multiple branches,
988  * try and check multiple fields with one check. The caller must do a detailed
989  * check if necessary.
990  */
991 static inline bool page_expected_state(struct page *page,
992                                         unsigned long check_flags)
993 {
994         if (unlikely(atomic_read(&page->_mapcount) != -1))
995                 return false;
996 
997         if (unlikely((unsigned long)page->mapping |
998                         page_ref_count(page) |
999 #ifdef CONFIG_MEMCG
1000                         (unsigned long)page->mem_cgroup |
1001 #endif
1002                         (page->flags & check_flags)))
1003                 return false;
1004 
1005         return true;
1006 }
1007 
1008 static void free_pages_check_bad(struct page *page)
1009 {
1010         const char *bad_reason;
1011         unsigned long bad_flags;
1012 
1013         bad_reason = NULL;
1014         bad_flags = 0;
1015 
1016         if (unlikely(atomic_read(&page->_mapcount) != -1))
1017                 bad_reason = "nonzero mapcount";
1018         if (unlikely(page->mapping != NULL))
1019                 bad_reason = "non-NULL mapping";
1020         if (unlikely(page_ref_count(page) != 0))
1021                 bad_reason = "nonzero _refcount";
1022         if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
1023                 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
1024                 bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
1025         }
1026 #ifdef CONFIG_MEMCG
1027         if (unlikely(page->mem_cgroup))
1028                 bad_reason = "page still charged to cgroup";
1029 #endif
1030         bad_page(page, bad_reason, bad_flags);
1031 }
1032 
1033 static inline int free_pages_check(struct page *page)
1034 {
1035         if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
1036                 return 0;
1037 
1038         /* Something has gone sideways, find it */
1039         free_pages_check_bad(page);
1040         return 1;
1041 }
1042 
1043 static int free_tail_pages_check(struct page *head_page, struct page *page)
1044 {
1045         int ret = 1;
1046 
1047         /*
1048          * We rely on page->lru.next never having bit 0 set, unless the page
1049          * is PageTail(). Let's make sure that's true even for poisoned ->lru.
1050          */
1051         BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
1052 
1053         if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
1054                 ret = 0;
1055                 goto out;
1056         }
1057         switch (page - head_page) {
1058         case 1:
1059                 /* the first tail page: ->mapping may be compound_mapcount() */
1060                 if (unlikely(compound_mapcount(page))) {
1061                         bad_page(page, "nonzero compound_mapcount", 0);
1062                         goto out;
1063                 }
1064                 break;
1065         case 2:
1066                 /*
1067                  * the second tail page: ->mapping is
1068                  * deferred_list.next -- ignore value.
1069                  */
1070                 break;
1071         default:
1072                 if (page->mapping != TAIL_MAPPING) {
1073                         bad_page(page, "corrupted mapping in tail page", 0);
1074                         goto out;
1075                 }
1076                 break;
1077         }
1078         if (unlikely(!PageTail(page))) {
1079                 bad_page(page, "PageTail not set", 0);
1080                 goto out;
1081         }
1082         if (unlikely(compound_head(page) != head_page)) {
1083                 bad_page(page, "compound_head not consistent", 0);
1084                 goto out;
1085         }
1086         ret = 0;
1087 out:
1088         page->mapping = NULL;
1089         clear_compound_head(page);
1090         return ret;
1091 }
1092 
1093 static __always_inline bool free_pages_prepare(struct page *page,
1094                                         unsigned int order, bool check_free)
1095 {
1096         int bad = 0;
1097 
1098         VM_BUG_ON_PAGE(PageTail(page), page);
1099 
1100         trace_mm_page_free(page, order);
1101 
1102         /*
1103          * Check tail pages before head page information is cleared to
1104          * avoid checking PageCompound for order-0 pages.
1105          */
1106         if (unlikely(order)) {
1107                 bool compound = PageCompound(page);
1108                 int i;
1109 
1110                 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
1111 
1112                 if (compound)
1113                         ClearPageDoubleMap(page);
1114                 for (i = 1; i < (1 << order); i++) {
1115                         if (compound)
1116                                 bad += free_tail_pages_check(page, page + i);
1117                         if (unlikely(free_pages_check(page + i))) {
1118                                 bad++;
1119                                 continue;
1120                         }
1121                         (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1122                 }
1123         }
1124         if (PageMappingFlags(page))
1125                 page->mapping = NULL;
1126         if (memcg_kmem_enabled() && PageKmemcg(page))
1127                 __memcg_kmem_uncharge(page, order);
1128         if (check_free)
1129                 bad += free_pages_check(page);
1130         if (bad)
1131                 return false;
1132 
1133         page_cpupid_reset_last(page);
1134         page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1135         reset_page_owner(page, order);
1136 
1137         if (!PageHighMem(page)) {
1138                 debug_check_no_locks_freed(page_address(page),
1139                                            PAGE_SIZE << order);
1140                 debug_check_no_obj_freed(page_address(page),
1141                                            PAGE_SIZE << order);
1142         }
1143         arch_free_page(page, order);
1144         kernel_poison_pages(page, 1 << order, 0);
1145         if (debug_pagealloc_enabled())
1146                 kernel_map_pages(page, 1 << order, 0);
1147 
1148         kasan_free_nondeferred_pages(page, order);
1149 
1150         return true;
1151 }
1152 
1153 #ifdef CONFIG_DEBUG_VM
1154 static inline bool free_pcp_prepare(struct page *page)
1155 {
1156         return free_pages_prepare(page, 0, true);
1157 }
1158 
1159 static inline bool bulkfree_pcp_prepare(struct page *page)
1160 {
1161         return false;
1162 }
1163 #else
1164 static bool free_pcp_prepare(struct page *page)
1165 {
1166         return free_pages_prepare(page, 0, false);
1167 }
1168 
1169 static bool bulkfree_pcp_prepare(struct page *page)
1170 {
1171         return free_pages_check(page);
1172 }
1173 #endif /* CONFIG_DEBUG_VM */
1174 
1175 static inline void prefetch_buddy(struct page *page)
1176 {
1177         unsigned long pfn = page_to_pfn(page);
1178         unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
1179         struct page *buddy = page + (buddy_pfn - pfn);
1180 
1181         prefetch(buddy);
1182 }
1183 
1184 /*
1185  * Frees a number of pages from the PCP lists.
1186  * Assumes all pages on the list are in the same zone, and of the same order.
1187  * count is the number of pages to free.
1188  *
1189  * If the zone was previously in an "all pages pinned" state then look to
1190  * see if this freeing clears that state.
1191  *
1192  * And clear the zone's pages_scanned counter, to hold off the "all pages are
1193  * pinned" detection logic.
1194  */
1195 static void free_pcppages_bulk(struct zone *zone, int count,
1196                                         struct per_cpu_pages *pcp)
1197 {
1198         int migratetype = 0;
1199         int batch_free = 0;
1200         int prefetch_nr = 0;
1201         bool isolated_pageblocks;
1202         struct page *page, *tmp;
1203         LIST_HEAD(head);
1204 
1205         while (count) {
1206                 struct list_head *list;
1207 
1208                 /*
1209                  * Remove pages from lists in a round-robin fashion. A
1210                  * batch_free count is maintained that is incremented when an
1211                  * empty list is encountered.  This is so more pages are freed
1212                  * off fuller lists instead of spinning excessively around empty
1213                  * lists.
1214                  */
1215                 do {
1216                         batch_free++;
1217                         if (++migratetype == MIGRATE_PCPTYPES)
1218                                 migratetype = 0;
1219                         list = &pcp->lists[migratetype];
1220                 } while (list_empty(list));
1221 
1222                 /* This is the only non-empty list. Free them all. */
1223                 if (batch_free == MIGRATE_PCPTYPES)
1224                         batch_free = count;
1225 
1226                 do {
1227                         page = list_last_entry(list, struct page, lru);
1228                         /* must delete to avoid corrupting pcp list */
1229                         list_del(&page->lru);
1230                         pcp->count--;
1231 
1232                         if (bulkfree_pcp_prepare(page))
1233                                 continue;
1234 
1235                         list_add_tail(&page->lru, &head);
1236 
1237                         /*
1238                          * We are going to put the page back to the global
1239                          * pool, prefetch its buddy to speed up later access
1240                          * under zone->lock. It is believed the overhead of
1241                          * an additional test and calculating buddy_pfn here
1242                          * can be offset by reduced memory latency later. To
1243                          * avoid excessive prefetching due to large count, only
1244                          * prefetch buddy for the first pcp->batch nr of pages.
1245                          */
1246                         if (prefetch_nr++ < pcp->batch)
1247                                 prefetch_buddy(page);
1248                 } while (--count && --batch_free && !list_empty(list));
1249         }
1250 
1251         spin_lock(&zone->lock);
1252         isolated_pageblocks = has_isolate_pageblock(zone);
1253 
1254         /*
1255          * Use safe version since after __free_one_page(),
1256          * page->lru.next will not point to original list.
1257          */
1258         list_for_each_entry_safe(page, tmp, &head, lru) {
1259                 int mt = get_pcppage_migratetype(page);
1260                 /* MIGRATE_ISOLATE page should not go to pcplists */
1261                 VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1262                 /* Pageblock could have been isolated meanwhile */
1263                 if (unlikely(isolated_pageblocks))
1264                         mt = get_pageblock_migratetype(page);
1265 
1266                 __free_one_page(page, page_to_pfn(page), zone, 0, mt);
1267                 trace_mm_page_pcpu_drain(page, 0, mt);
1268         }
1269         spin_unlock(&zone->lock);
1270 }
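     /*
      * Sketch of a typical caller (simplified; the real call sites are later
      * in this file): when freeing a page pushes a pcplist past its high
      * watermark, one batch is handed back to the buddy lists in a single
      * zone->lock section:
      *
      *         list_add(&page->lru, &pcp->lists[migratetype]);
      *         pcp->count++;
      *         if (pcp->count >= pcp->high)
      *                 free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp);
      *
      * which keeps the time spent under zone->lock bounded by pcp->batch.
      */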
1271 
1272 static void free_one_page(struct zone *zone,
1273                                 struct page *page, unsigned long pfn,
1274                                 unsigned int order,
1275                                 int migratetype)
1276 {
1277         spin_lock(&zone->lock);
1278         if (unlikely(has_isolate_pageblock(zone) ||
1279                 is_migrate_isolate(migratetype))) {
1280                 migratetype = get_pfnblock_migratetype(page, pfn);
1281         }
1282         __free_one_page(page, pfn, zone, order, migratetype);
1283         spin_unlock(&zone->lock);
1284 }
1285 
1286 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
1287                                 unsigned long zone, int nid)
1288 {
1289         mm_zero_struct_page(page);
1290         set_page_links(page, zone, nid, pfn);
1291         init_page_count(page);
1292         page_mapcount_reset(page);
1293         page_cpupid_reset_last(page);
1294         page_kasan_tag_reset(page);
1295 
1296         INIT_LIST_HEAD(&page->lru);
1297 #ifdef WANT_PAGE_VIRTUAL
1298         /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1299         if (!is_highmem_idx(zone))
1300                 set_page_address(page, __va(pfn << PAGE_SHIFT));
1301 #endif
1302 }
1303 
1304 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1305 static void __meminit init_reserved_page(unsigned long pfn)
1306 {
1307         pg_data_t *pgdat;
1308         int nid, zid;
1309 
1310         if (!early_page_uninitialised(pfn))
1311                 return;
1312 
1313         nid = early_pfn_to_nid(pfn);
1314         pgdat = NODE_DATA(nid);
1315 
1316         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1317                 struct zone *zone = &pgdat->node_zones[zid];
1318 
1319                 if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
1320                         break;
1321         }
1322         __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
1323 }
1324 #else
1325 static inline void init_reserved_page(unsigned long pfn)
1326 {
1327 }
1328 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1329 
1330 /*
1331  * Initialised pages do not have PageReserved set. This function is
1332  * called for each range allocated by the bootmem allocator and
1333  * marks the pages PageReserved. The remaining valid pages are later
1334  * sent to the buddy page allocator.
1335  */
1336 void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
1337 {
1338         unsigned long start_pfn = PFN_DOWN(start);
1339         unsigned long end_pfn = PFN_UP(end);
1340 
1341         for (; start_pfn < end_pfn; start_pfn++) {
1342                 if (pfn_valid(start_pfn)) {
1343                         struct page *page = pfn_to_page(start_pfn);
1344 
1345                         init_reserved_page(start_pfn);
1346 
1347                         /* Avoid false-positive PageTail() */
1348                         INIT_LIST_HEAD(&page->lru);
1349 
1350                         /*
1351                          * no need for atomic set_bit because the struct
1352                          * page is not visible yet so nobody should
1353                          * access it yet.
1354                          */
1355                         __SetPageReserved(page);
1356                 }
1357         }
1358 }
1359 
1360 static void __free_pages_ok(struct page *page, unsigned int order)
1361 {
1362         unsigned long flags;
1363         int migratetype;
1364         unsigned long pfn = page_to_pfn(page);
1365 
1366         if (!free_pages_prepare(page, order, true))
1367                 return;
1368 
1369         migratetype = get_pfnblock_migratetype(page, pfn);
1370         local_irq_save(flags);
1371         __count_vm_events(PGFREE, 1 << order);
1372         free_one_page(page_zone(page), page, pfn, order, migratetype);
1373         local_irq_restore(flags);
1374 }
1375 
1376 void __free_pages_core(struct page *page, unsigned int order)
1377 {
1378         unsigned int nr_pages = 1 << order;
1379         struct page *p = page;
1380         unsigned int loop;
1381 
1382         prefetchw(p);
1383         for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
1384                 prefetchw(p + 1);
1385                 __ClearPageReserved(p);
1386                 set_page_count(p, 0);
1387         }
1388         __ClearPageReserved(p);
1389         set_page_count(p, 0);
1390 
1391         atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
1392         set_page_refcounted(page);
1393         __free_pages(page, order);
1394 }
1395 
1396 #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
1397         defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
1398 
1399 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
1400 
1401 int __meminit early_pfn_to_nid(unsigned long pfn)
1402 {
1403         static DEFINE_SPINLOCK(early_pfn_lock);
1404         int nid;
1405 
1406         spin_lock(&early_pfn_lock);
1407         nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
1408         if (nid < 0)
1409                 nid = first_online_node;
1410         spin_unlock(&early_pfn_lock);
1411 
1412         return nid;
1413 }
1414 #endif
1415 
1416 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
1417 /* Only safe to use early in boot when initialisation is single-threaded */
1418 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1419 {
1420         int nid;
1421 
1422         nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
1423         if (nid >= 0 && nid != node)
1424                 return false;
1425         return true;
1426 }
1427 
1428 #else
1429 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1430 {
1431         return true;
1432 }
1433 #endif
1434 
1435 
1436 void __init memblock_free_pages(struct page *page, unsigned long pfn,
1437                                                         unsigned int order)
1438 {
1439         if (early_page_uninitialised(pfn))
1440                 return;
1441         __free_pages_core(page, order);
1442 }
1443 
1444 /*
1445  * Check that the whole pageblock (or a subset of it) given by the interval
1446  * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
1447  * with the migration or free compaction scanner. The scanners then need to
1448  * use only pfn_valid_within() check for arches that allow holes within
1449  * pageblocks.
1450  *
1451  * Return struct page pointer of start_pfn, or NULL if checks were not passed.
1452  *
1453  * It's possible on some configurations to have a setup like node0 node1 node0
1454  * i.e. it's possible that all pages within a zone's range of pages do not
1455  * belong to a single zone. We assume that a border between node0 and node1
1456  * can occur within a single pageblock, but not a node0 node1 node0
1457  * interleaving within a single pageblock. It is therefore sufficient to check
1458  * the first and last page of a pageblock and avoid checking each individual
1459  * page in a pageblock.
1460  */
1461 struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
1462                                      unsigned long end_pfn, struct zone *zone)
1463 {
1464         struct page *start_page;
1465         struct page *end_page;
1466 
1467         /* end_pfn is one past the range we are checking */
1468         end_pfn--;
1469 
1470         if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
1471                 return NULL;
1472 
1473         start_page = pfn_to_online_page(start_pfn);
1474         if (!start_page)
1475                 return NULL;
1476 
1477         if (page_zone(start_page) != zone)
1478                 return NULL;
1479 
1480         end_page = pfn_to_page(end_pfn);
1481 
1482         /* This gives shorter code than deriving page_zone(end_page) */
1483         if (page_zone_id(start_page) != page_zone_id(end_page))
1484                 return NULL;
1485 
1486         return start_page;
1487 }
1488 
1489 void set_zone_contiguous(struct zone *zone)
1490 {
1491         unsigned long block_start_pfn = zone->zone_start_pfn;
1492         unsigned long block_end_pfn;
1493 
1494         block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
1495         for (; block_start_pfn < zone_end_pfn(zone);
1496                         block_start_pfn = block_end_pfn,
1497                          block_end_pfn += pageblock_nr_pages) {
1498 
1499                 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
1500 
1501                 if (!__pageblock_pfn_to_page(block_start_pfn,
1502                                              block_end_pfn, zone))
1503                         return;
1504         }
1505 
1506         /* We confirm that there is no hole */
1507         zone->contiguous = true;
1508 }
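
/*
 * Illustrative walk of the loop above (not kernel code), assuming
 * pageblock_nr_pages == 512 and a zone spanning pfns [100, 2000):
 *
 *   pass 1: [100, 512)     block_end_pfn = ALIGN(101, 512) = 512
 *   pass 2: [512, 1024)
 *   pass 3: [1024, 1536)
 *   pass 4: [1536, 2000)   clamped by zone_end_pfn()
 *
 * Only if every interval passes __pageblock_pfn_to_page() is
 * zone->contiguous set, which lets later pageblock scans (e.g. by
 * compaction) skip the validity checks.
 */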
1509 
1510 void clear_zone_contiguous(struct zone *zone)
1511 {
1512         zone->contiguous = false;
1513 }
1514 
1515 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1516 static void __init deferred_free_range(unsigned long pfn,
1517                                        unsigned long nr_pages)
1518 {
1519         struct page *page;
1520         unsigned long i;
1521 
1522         if (!nr_pages)
1523                 return;
1524 
1525         page = pfn_to_page(pfn);
1526 
1527         /* Free a large naturally-aligned chunk if possible */
1528         if (nr_pages == pageblock_nr_pages &&
1529             (pfn & (pageblock_nr_pages - 1)) == 0) {
1530                 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1531                 __free_pages_core(page, pageblock_order);
1532                 return;
1533         }
1534 
1535         for (i = 0; i < nr_pages; i++, page++, pfn++) {
1536                 if ((pfn & (pageblock_nr_pages - 1)) == 0)
1537                         set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1538                 __free_pages_core(page, 0);
1539         }
1540 }
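
/*
 * Example of the two paths above (illustrative, assuming
 * pageblock_order == 9, i.e. pageblock_nr_pages == 512): a call with a
 * 512-aligned pfn and nr_pages == 512 does a single
 * __free_pages_core(page, 9), while nr_pages == 300 or an unaligned pfn
 * falls back to freeing each page individually at order 0.
 */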
1541 
1542 /* Completion tracking for deferred_init_memmap() threads */
1543 static atomic_t pgdat_init_n_undone __initdata;
1544 static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
1545 
1546 static inline void __init pgdat_init_report_one_done(void)
1547 {
1548         if (atomic_dec_and_test(&pgdat_init_n_undone))
1549                 complete(&pgdat_init_all_done_comp);
1550 }
1551 
1552 /*
1553  * Returns true if page needs to be initialized or freed to buddy allocator.
1554  *
1555  * First we check if pfn is valid on architectures where it is possible to have
1556  * holes within pageblock_nr_pages. On systems where it is not possible, this
1557  * function is optimized out.
1558  *
1559  * Then, we check if a current large page is valid by only checking the validity
1560  * of the head pfn.
1561  */
1562 static inline bool __init deferred_pfn_valid(unsigned long pfn)
1563 {
1564         if (!pfn_valid_within(pfn))
1565                 return false;
1566         if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
1567                 return false;
1568         return true;
1569 }
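
/*
 * Note on the mask test above: "pfn & (pageblock_nr_pages - 1)" is zero
 * only at a pageblock head (power-of-two alignment check), so the more
 * expensive pfn_valid() is consulted once per pageblock and every other
 * pfn pays only pfn_valid_within().
 */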
1570 
1571 /*
1572  * Free pages to buddy allocator. Try to free aligned pages in
1573  * pageblock_nr_pages sizes.
1574  */
1575 static void __init deferred_free_pages(unsigned long pfn,
1576                                        unsigned long end_pfn)
1577 {
1578         unsigned long nr_pgmask = pageblock_nr_pages - 1;
1579         unsigned long nr_free = 0;
1580 
1581         for (; pfn < end_pfn; pfn++) {
1582                 if (!deferred_pfn_valid(pfn)) {
1583                         deferred_free_range(pfn - nr_free, nr_free);
1584                         nr_free = 0;
1585                 } else if (!(pfn & nr_pgmask)) {
1586                         deferred_free_range(pfn - nr_free, nr_free);
1587                         nr_free = 1;
1588                         touch_nmi_watchdog();
1589                 } else {
1590                         nr_free++;
1591                 }
1592         }
1593         /* Free the last block of pages to allocator */
1594         deferred_free_range(pfn - nr_free, nr_free);
1595 }
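
/*
 * Sketch of the accounting above: when an invalid pfn is hit, the run
 * accumulated so far, [pfn - nr_free, pfn), is flushed and nr_free
 * restarts at 0; when a pageblock boundary is hit, the previous run is
 * flushed and nr_free restarts at 1 so the boundary pfn itself opens
 * the next run.
 */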
1596 
1597 /*
1598  * Initialize struct pages.  We minimize pfn page lookups and scheduler checks
1599  * by performing them only once every pageblock_nr_pages.
1600  * Return number of pages initialized.
1601  */
1602 static unsigned long  __init deferred_init_pages(struct zone *zone,
1603                                                  unsigned long pfn,
1604                                                  unsigned long end_pfn)
1605 {
1606         unsigned long nr_pgmask = pageblock_nr_pages - 1;
1607         int nid = zone_to_nid(zone);
1608         unsigned long nr_pages = 0;
1609         int zid = zone_idx(zone);
1610         struct page *page = NULL;
1611 
1612         for (; pfn < end_pfn; pfn++) {
1613                 if (!deferred_pfn_valid(pfn)) {
1614                         page = NULL;
1615                         continue;
1616                 } else if (!page || !(pfn & nr_pgmask)) {
1617                         page = pfn_to_page(pfn);
1618                         touch_nmi_watchdog();
1619                 } else {
1620                         page++;
1621                 }
1622                 __init_single_page(page, pfn, zid, nid);
1623                 nr_pages++;
1624         }
1625         return nr_pages;
1626 }
1627 
1628 /*
1629  * This function is meant to pre-load the iterator for the zone init.
1630  * Specifically it walks through the ranges until we are caught up to the
1631  * first_init_pfn value and exits there. If we never encounter the value we
1632  * return false indicating there are no valid ranges left.
1633  */
1634 static bool __init
1635 deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
1636                                     unsigned long *spfn, unsigned long *epfn,
1637                                     unsigned long first_init_pfn)
1638 {
1639         u64 j;
1640 
1641         /*
1642          * Start out by walking through the ranges in this zone that have
1643          * already been initialized. We don't need to do anything with them
1644          * so we just need to flush them out of the system.
1645          */
1646         for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
1647                 if (*epfn <= first_init_pfn)
1648                         continue;
1649                 if (*spfn < first_init_pfn)
1650                         *spfn = first_init_pfn;
1651                 *i = j;
1652                 return true;
1653         }
1654 
1655         return false;
1656 }
1657 
1658 /*
1659  * Initialize and free pages. We do it in two loops: first we initialize
1660  * struct page, then free to buddy allocator, because while we are
1661  * freeing pages we can access pages that are ahead (computing buddy
1662  * page in __free_one_page()).
1663  *
1664  * In order to try and keep some memory in the cache we have the loop
1665  * broken along max page order boundaries. This way we will not cause
1666  * any issues with the buddy page computation.
1667  */
1668 static unsigned long __init
1669 deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
1670                        unsigned long *end_pfn)
1671 {
1672         unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
1673         unsigned long spfn = *start_pfn, epfn = *end_pfn;
1674         unsigned long nr_pages = 0;
1675         u64 j = *i;
1676 
1677         /* First we loop through and initialize the page values */
1678         for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
1679                 unsigned long t;
1680 
1681                 if (mo_pfn <= *start_pfn)
1682                         break;
1683 
1684                 t = min(mo_pfn, *end_pfn);
1685                 nr_pages += deferred_init_pages(zone, *start_pfn, t);
1686 
1687                 if (mo_pfn < *end_pfn) {
1688                         *start_pfn = mo_pfn;
1689                         break;
1690                 }
1691         }
1692 
1693         /* Reset values and now loop through freeing pages as needed */
1694         swap(j, *i);
1695 
1696         for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
1697                 unsigned long t;
1698 
1699                 if (mo_pfn <= spfn)
1700                         break;
1701 
1702                 t = min(mo_pfn, epfn);
1703                 deferred_free_pages(spfn, t);
1704 
1705                 if (mo_pfn <= epfn)
1706                         break;
1707         }
1708 
1709         return nr_pages;
1710 }
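
/*
 * Worked example (illustrative, assuming MAX_ORDER_NR_PAGES == 1024):
 * with *start_pfn == 4096, mo_pfn = ALIGN(4097, 1024) == 5120, so one
 * call initialises and then frees at most pfns [4096, 5120) before
 * returning, keeping the just-initialised struct pages cache-warm when
 * __free_one_page() later walks their buddies.
 */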
1711 
1712 /* Initialise remaining memory on a node */
1713 static int __init deferred_init_memmap(void *data)
1714 {
1715         pg_data_t *pgdat = data;
1716         const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1717         unsigned long spfn = 0, epfn = 0, nr_pages = 0;
1718         unsigned long first_init_pfn, flags;
1719         unsigned long start = jiffies;
1720         struct zone *zone;
1721         int zid;
1722         u64 i;
1723 
1724         /* Bind memory initialisation thread to a local node if possible */
1725         if (!cpumask_empty(cpumask))
1726                 set_cpus_allowed_ptr(current, cpumask);
1727 
1728         pgdat_resize_lock(pgdat, &flags);
1729         first_init_pfn = pgdat->first_deferred_pfn;
1730         if (first_init_pfn == ULONG_MAX) {
1731                 pgdat_resize_unlock(pgdat, &flags);
1732                 pgdat_init_report_one_done();
1733                 return 0;
1734         }
1735 
1736         /* Sanity check boundaries */
1737         BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
1738         BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
1739         pgdat->first_deferred_pfn = ULONG_MAX;
1740 
1741         /* Only the highest zone is deferred so find it */
1742         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1743                 zone = pgdat->node_zones + zid;
1744                 if (first_init_pfn < zone_end_pfn(zone))
1745                         break;
1746         }
1747 
1748         /* If the zone is empty somebody else may have cleared out the zone */
1749         if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
1750                                                  first_init_pfn))
1751                 goto zone_empty;
1752 
1753         /*
1754          * Initialize and free pages in MAX_ORDER sized increments so
1755          * that we can avoid introducing any issues with the buddy
1756          * allocator.
1757          */
1758         while (spfn < epfn)
1759                 nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
1760 zone_empty:
1761         pgdat_resize_unlock(pgdat, &flags);
1762 
1763         /* Sanity check that the next zone really is unpopulated */
1764         WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
1765 
1766         pr_info("node %d initialised, %lu pages in %ums\n",
1767                 pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start));
1768 
1769         pgdat_init_report_one_done();
1770         return 0;
1771 }
1772 
1773 /*
1774  * If this zone has deferred pages, try to grow it by initializing enough
1775  * deferred pages to satisfy the allocation specified by order, rounded up to
1776  * the nearest PAGES_PER_SECTION boundary.  So we're adding memory in increments
1777  * of SECTION_SIZE bytes by initializing struct pages in increments of
1778  * PAGES_PER_SECTION * sizeof(struct page) bytes.
1779  *
1780  * Return true when zone was grown, otherwise return false. We return true even
1781  * when we grow less than requested, to let the caller decide if there are
1782  * enough pages to satisfy the allocation.
1783  *
1784  * Note: We use noinline because this function is needed only during boot, and
1785  * it is called from a __ref function _deferred_grow_zone. This way we are
1786  * making sure that it is not inlined into the permanent text section.
1787  */
1788 static noinline bool __init
1789 deferred_grow_zone(struct zone *zone, unsigned int order)
1790 {
1791         unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
1792         pg_data_t *pgdat = zone->zone_pgdat;
1793         unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
1794         unsigned long spfn, epfn, flags;
1795         unsigned long nr_pages = 0;
1796         u64 i;
1797 
1798         /* Only the last zone may have deferred pages */
1799         if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
1800                 return false;
1801 
1802         pgdat_resize_lock(pgdat, &flags);
1803 
1804         /*
1805          * If deferred pages have been initialized while we were waiting for
1806          * the lock, return true, as the zone was grown.  The caller will retry
1807          * this zone.  We won't return to this function since the caller also
1808          * has this static branch.
1809          */
1810         if (!static_branch_unlikely(&deferred_pages)) {
1811                 pgdat_resize_unlock(pgdat, &flags);
1812                 return true;
1813         }
1814 
1815         /*
1816          * If someone grew this zone while we were waiting for spinlock, return
1817          * true, as there might be enough pages already.
1818          */
1819         if (first_deferred_pfn != pgdat->first_deferred_pfn) {
1820                 pgdat_resize_unlock(pgdat, &flags);
1821                 return true;
1822         }
1823 
1824         /* If the zone is empty somebody else may have cleared out the zone */
1825         if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
1826                                                  first_deferred_pfn)) {
1827                 pgdat->first_deferred_pfn = ULONG_MAX;
1828                 pgdat_resize_unlock(pgdat, &flags);
1829                 /* Retry only once. */
1830                 return first_deferred_pfn != ULONG_MAX;
1831         }
1832 
1833         /*
1834          * Initialize and free pages in MAX_ORDER sized increments so
1835          * that we can avoid introducing any issues with the buddy
1836          * allocator.
1837          */
1838         while (spfn < epfn) {
1839                 /* update our first deferred PFN for this section */
1840                 first_deferred_pfn = spfn;
1841 
1842                 nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
1843 
1844                 /* We should only stop along section boundaries */
1845                 if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
1846                         continue;
1847 
1848                 /* If our quota has been met we can stop here */
1849                 if (nr_pages >= nr_pages_needed)
1850                         break;
1851         }
1852 
1853         pgdat->first_deferred_pfn = spfn;
1854         pgdat_resize_unlock(pgdat, &flags);
1855 
1856         return nr_pages > 0;
1857 }
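
/*
 * The "(first_deferred_pfn ^ spfn) < PAGES_PER_SECTION" test above
 * relies on PAGES_PER_SECTION being a power of two: the XOR is smaller
 * than the section size only while both pfns agree on every bit at or
 * above log2(PAGES_PER_SECTION), i.e. while they are still in the same
 * memory section, so the quota is only checked once a section boundary
 * has been crossed.
 */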
1858 
1859 /*
1860  * deferred_grow_zone() is __init, but it is called from
1861  * get_page_from_freelist() during early boot until deferred_pages permanently
1862  * disables this call. This is why we have the __ref wrapper, to avoid a
1863  * section mismatch warning and to ensure that the function body gets unloaded.
1864  */
1865 static bool __ref
1866 _deferred_grow_zone(struct zone *zone, unsigned int order)
1867 {
1868         return deferred_grow_zone(zone, order);
1869 }
1870 
1871 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1872 
1873 void __init page_alloc_init_late(void)
1874 {
1875         struct zone *zone;
1876         int nid;
1877 
1878 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1879 
1880         /* There will be num_node_state(N_MEMORY) threads */
1881         atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
1882         for_each_node_state(nid, N_MEMORY) {
1883                 kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
1884         }
1885 
1886         /* Block until all are initialised */
1887         wait_for_completion(&pgdat_init_all_done_comp);
1888 
1889         /*
1890          * We initialized the rest of the deferred pages.  Permanently disable
1891          * on-demand struct page initialization.
1892          */
1893         static_branch_disable(&deferred_pages);
1894 
1895         /* Reinit limits that are based on free pages after the kernel is up */
1896         files_maxfiles_init();
1897 #endif
1898 
1899         /* Discard memblock private memory */
1900         memblock_discard();
1901 
1902         for_each_node_state(nid, N_MEMORY)
1903                 shuffle_free_memory(NODE_DATA(nid));
1904 
1905         for_each_populated_zone(zone)
1906                 set_zone_contiguous(zone);
1907 }
1908 
1909 #ifdef CONFIG_CMA
1910 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
1911 void __init init_cma_reserved_pageblock(struct page *page)
1912 {
1913         unsigned i = pageblock_nr_pages;
1914         struct page *p = page;
1915 
1916         do {
1917                 __ClearPageReserved(p);
1918                 set_page_count(p, 0);
1919         } while (++p, --i);
1920 
1921         set_pageblock_migratetype(page, MIGRATE_CMA);
1922 
1923         if (pageblock_order >= MAX_ORDER) {
1924                 i = pageblock_nr_pages;
1925                 p = page;
1926                 do {
1927                         set_page_refcounted(p);
1928                         __free_pages(p, MAX_ORDER - 1);
1929                         p += MAX_ORDER_NR_PAGES;
1930                 } while (i -= MAX_ORDER_NR_PAGES);
1931         } else {
1932                 set_page_refcounted(page);
1933                 __free_pages(page, pageblock_order);
1934         }
1935 
1936         adjust_managed_page_count(page, pageblock_nr_pages);
1937 }
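
/*
 * The pageblock_order >= MAX_ORDER branch above matters on
 * configurations where a pageblock is larger than the largest buddy
 * order (possible with very large huge page sizes); the reserved block
 * is then handed to the allocator in MAX_ORDER - 1 sized chunks rather
 * than as a single pageblock_order page.
 */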
1938 #endif
1939 
1940 /*
1941  * The order of subdivision here is critical for the IO subsystem.
1942  * Please do not alter this order without good reasons and regression
1943  * testing. Specifically, as large blocks of memory are subdivided,
1944  * the order in which smaller blocks are delivered depends on the order
1945  * they're subdivided in this function. This is the primary factor
1946  * influencing the order in which pages are delivered to the IO
1947  * subsystem according to empirical testing, and this is also justified
1948  * by considering the behavior of a buddy system containing a single
1949  * large block of memory acted on by a series of small allocations.
1950  * This behavior is a critical factor in sglist merging's success.
1951  *
1952  * -- nyc
1953  */
1954 static inline void expand(struct zone *zone, struct page *page,
1955         int low, int high, struct free_area *area,
1956         int migratetype)
1957 {
1958         unsigned long size = 1 << high;
1959 
1960         while (high > low) {
1961                 area--;
1962                 high--;
1963                 size >>= 1;
1964                 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
1965 
1966                 /*
1967                  * Mark as guard pages (or a guard page), which allows them
1968                  * to merge back into the allocator when the buddy is freed.
1969                  * The corresponding page table entries are not touched;
1970                  * the pages remain not present in the virtual address space.
1971                  */
1972                 if (set_page_guard(zone, &page[size], high, migratetype))
1973                         continue;
1974 
1975                 add_to_free_area(&page[size], area, migratetype);
1976                 set_page_order(&page[size], high);
1977         }
1978 }
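
/*
 * Worked example of the split above (illustrative): serving an order-0
 * request (low == 0) from an order-3 block (high == 3):
 *
 *   size 8 -> page[4] goes to the order-2 free list
 *          -> page[2] goes to the order-1 free list
 *          -> page[1] goes to the order-0 free list
 *
 * leaving page[0] as the order-0 page returned to the caller.
 */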
1979 
1980 static void check_new_page_bad(struct page *page)
1981 {
1982         const char *bad_reason = NULL;
1983         unsigned long bad_flags = 0;
1984 
1985         if (unlikely(atomic_read(&page->_mapcount) != -1))
1986                 bad_reason = "nonzero mapcount";
1987         if (unlikely(page->mapping != NULL))
1988                 bad_reason = "non-NULL mapping";
1989         if (unlikely(page_ref_count(page) != 0))
1990                 bad_reason = "nonzero _refcount";
1991         if (unlikely(page->flags & __PG_HWPOISON)) {
1992                 bad_reason = "HWPoisoned (hardware-corrupted)";
1993                 bad_flags = __PG_HWPOISON;
1994                 /* Don't complain about hwpoisoned pages */
1995                 page_mapcount_reset(page); /* remove PageBuddy */
1996                 return;
1997         }
1998         if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
1999                 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
2000                 bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
2001         }
2002 #ifdef CONFIG_MEMCG
2003         if (unlikely(page->mem_cgroup))
2004                 bad_reason = "page still charged to cgroup";
2005 #endif
2006         bad_page(page, bad_reason, bad_flags);
2007 }
2008 
2009 /*
2010  * This page is about to be returned from the page allocator
2011  */
2012 static inline int check_new_page(struct page *page)
2013 {
2014         if (likely(page_expected_state(page,
2015                                 PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
2016                 return 0;
2017 
2018         check_new_page_bad(page);
2019         return 1;
2020 }
2021 
2022 static inline bool free_pages_prezeroed(void)
2023 {
2024         return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
2025                 page_poisoning_enabled();
2026 }
2027 
2028 #ifdef CONFIG_DEBUG_VM
2029 static bool check_pcp_refill(struct page *page)
2030 {
2031         return false;
2032 }
2033 
2034 static bool check_new_pcp(struct page *page)
2035 {
2036         return check_new_page(page);
2037 }
2038 #else
2039 static bool check_pcp_refill(struct page *page)
2040 {
2041         return check_new_page(page);
2042 }
2043 static bool check_new_pcp(struct page *page)
2044 {
2045         return false;
2046 }
2047 #endif /* CONFIG_DEBUG_VM */
2048 
2049 static bool check_new_pages(struct page *page, unsigned int order)
2050 {
2051         int i;
2052         for (i = 0; i < (1 << order); i++) {
2053                 struct page *p = page + i;
2054 
2055                 if (unlikely(check_new_page(p)))
2056                         return true;
2057         }
2058 
2059         return false;
2060 }
2061 
2062 inline void post_alloc_hook(struct page *page, unsigned int order,
2063                                 gfp_t gfp_flags)
2064 {
2065         set_page_private(page, 0);
2066         set_page_refcounted(page);
2067 
2068         arch_alloc_page(page, order);
2069         if (debug_pagealloc_enabled())
2070                 kernel_map_pages(page, 1 << order, 1);
2071         kasan_alloc_pages(page, order);
2072         kernel_poison_pages(page, 1 << order, 1);
2073         set_page_owner(page, order, gfp_flags);
2074 }
2075 
2076 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
2077                                                         unsigned int alloc_flags)
2078 {
2079         int i;
2080 
2081         post_alloc_hook(page, order, gfp_flags);
2082 
2083         if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
2084                 for (i = 0; i < (1 << order); i++)
2085                         clear_highpage(page + i);
2086 
2087         if (order && (gfp_flags & __GFP_COMP))
2088                 prep_compound_page(page, order);
2089 
2090         /*
2091          * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
2092          * allocate the page. The expectation is that the caller is taking
2093          * steps that will free more memory. The caller should avoid the page
2094          * being used for !PFMEMALLOC purposes.
2095          */
2096         if (alloc_flags & ALLOC_NO_WATERMARKS)
2097                 set_page_pfmemalloc(page);
2098         else
2099                 clear_page_pfmemalloc(page);
2100 }
2101 
2102 /*
2103  * Go through the free lists for the given migratetype and remove
2104  * the smallest available page from the freelists
2105  */
2106 static __always_inline
2107 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
2108                                                 int migratetype)
2109 {
2110         unsigned int current_order;
2111         struct free_area *area;
2112         struct page *page;
2113 
2114         /* Find a page of the appropriate size in the preferred list */
2115         for (current_order = order; current_order < MAX_ORDER; ++current_order) {
2116                 area = &(zone->free_area[current_order]);
2117                 page = get_page_from_free_area(area, migratetype);
2118                 if (!page)
2119                         continue;
2120                 del_page_from_free_area(page, area);
2121                 expand(zone, page, order, current_order, area, migratetype);
2122                 set_pcppage_migratetype(page, migratetype);
2123                 return page;
2124         }
2125 
2126         return NULL;
2127 }
2128 
2129 
2130 /*
2131  * This array describes the order in which free lists are fallen back to
2132  * when the free lists for the desired migratetype are depleted.
2133  */
2134 static int fallbacks[MIGRATE_TYPES][4] = {
2135         [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
2136         [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
2137         [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
2138 #ifdef CONFIG_CMA
2139         [MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
2140 #endif
2141 #ifdef CONFIG_MEMORY_ISOLATION
2142         [MIGRATE_ISOLATE]     = { MIGRATE_TYPES }, /* Never used */
2143 #endif
2144 };
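
/*
 * Example lookup (illustrative): an unmovable allocation that cannot be
 * served from the MIGRATE_UNMOVABLE free lists tries MIGRATE_RECLAIMABLE
 * first, then MIGRATE_MOVABLE; the trailing MIGRATE_TYPES entry
 * terminates the walk in find_suitable_fallback().
 */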
2145 
2146 #ifdef CONFIG_CMA
2147 static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
2148                                         unsigned int order)
2149 {
2150         return __rmqueue_smallest(zone, order, MIGRATE_CMA);
2151 }
2152 #else
2153 static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
2154                                         unsigned int order) { return NULL; }
2155 #endif
2156 
2157 /*
2158  * Move the free pages in a range to the free lists of the requested type.
2159  * Note that start_page and end_page are not aligned on a pageblock
2160  * boundary. If alignment is required, use move_freepages_block()
2161  */
2162 static int move_freepages(struct zone *zone,
2163                           struct page *start_page, struct page *end_page,
2164                           int migratetype, int *num_movable)
2165 {
2166         struct page *page;
2167         unsigned int order;
2168         int pages_moved = 0;
2169 
2170         for (page = start_page; page <= end_page;) {
2171                 if (!pfn_valid_within(page_to_pfn(page))) {
2172                         page++;
2173                         continue;
2174                 }
2175 
2176                 if (!PageBuddy(page)) {
2177                         /*
2178                          * We assume that pages that could be isolated for
2179                          * migration are movable. But we don't actually try
2180                          * isolating, as that would be expensive.
2181                          */
2182                         if (num_movable &&
2183                                         (PageLRU(page) || __PageMovable(page)))
2184                                 (*num_movable)++;
2185 
2186                         page++;
2187                         continue;
2188                 }
2189 
2190                 /* Make sure we are not inadvertently changing nodes */
2191                 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
2192                 VM_BUG_ON_PAGE(page_zone(page) != zone, page);
2193 
2194                 order = page_order(page);
2195                 move_to_free_area(page, &zone->free_area[order], migratetype);
2196                 page += 1 << order;
2197                 pages_moved += 1 << order;
2198         }
2199 
2200         return pages_moved;
2201 }
2202 
2203 int move_freepages_block(struct zone *zone, struct page *page,
2204                                 int migratetype, int *num_movable)
2205 {
2206         unsigned long start_pfn, end_pfn;
2207         struct page *start_page, *end_page;
2208 
2209         if (num_movable)
2210                 *num_movable = 0;
2211 
2212         start_pfn = page_to_pfn(page);
2213         start_pfn = start_pfn & ~(pageblock_nr_pages-1);
2214         start_page = pfn_to_page(start_pfn);
2215         end_page = start_page + pageblock_nr_pages - 1;
2216         end_pfn = start_pfn + pageblock_nr_pages - 1;
2217 
2218         /* Do not cross zone boundaries */
2219         if (!zone_spans_pfn(zone, start_pfn))
2220                 start_page = page;
2221         if (!zone_spans_pfn(zone, end_pfn))
2222                 return 0;
2223 
2224         return move_freepages(zone, start_page, end_page, migratetype,
2225                                                                 num_movable);
2226 }
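
/*
 * The masking above rounds down to a pageblock boundary: assuming
 * pageblock_nr_pages == 512 (2MB with 4K pages), a page at pfn 0x12345
 * yields start_pfn == 0x12200 and end_pfn == 0x123ff, i.e. the whole
 * pageblock containing the page.
 */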
2227 
2228 static void change_pageblock_range(struct page *pageblock_page,
2229                                         int start_order, int migratetype)
2230 {
2231         int nr_pageblocks = 1 << (start_order - pageblock_order);
2232 
2233         while (nr_pageblocks--) {
2234                 set_pageblock_migratetype(pageblock_page, migratetype);
2235                 pageblock_page += pageblock_nr_pages;
2236         }
2237 }
2238 
2239 /*
2240  * When we are falling back to another migratetype during allocation, try to
2241  * steal extra free pages from the same pageblocks to satisfy further
2242  * allocations, instead of polluting multiple pageblocks.
2243  *
2244  * If we are stealing a relatively large buddy page, it is likely there will
2245  * be more free pages in the pageblock, so try to steal them all. For
2246  * reclaimable and unmovable allocations, we steal regardless of page size,
2247  * as fragmentation caused by those allocations polluting movable pageblocks
2248  * is worse than movable allocations stealing from unmovable and reclaimable
2249  * pageblocks.
2250  */
2251 static bool can_steal_fallback(unsigned int order, int start_mt)
2252 {
2253         /*
2254          * This order check is intentionally kept, even though the next
2255          * check uses a more relaxed order test. The reason is that we can
2256          * actually steal the whole pageblock if this condition is met,
2257          * but the check below does not guarantee it; that one is just a
2258          * heuristic and could be changed at any time.
2259          */
2260         if (order >= pageblock_order)
2261                 return true;
2262 
2263         if (order >= pageblock_order / 2 ||
2264                 start_mt == MIGRATE_RECLAIMABLE ||
2265                 start_mt == MIGRATE_UNMOVABLE ||
2266                 page_group_by_mobility_disabled)
2267                 return true;
2268 
2269         return false;
2270 }
2271 
2272 static inline void boost_watermark(struct zone *zone)
2273 {
2274         unsigned long max_boost;
2275 
2276         if (!watermark_boost_factor)
2277                 return;
2278 
2279         max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
2280                         watermark_boost_factor, 10000);
2281 
2282         /*
2283          * The high watermark may be uninitialised if fragmentation occurs
2284          * very early in boot, so do not boost. We do not fall
2285          * through and boost by pageblock_nr_pages because failing
2286          * allocations that early means that reclaim is not going
2287          * to help, and it may even be impossible to reclaim up to the
2288          * boosted watermark, resulting in a hang.
2289          */
2290         if (!max_boost)
2291                 return;
2292 
2293         max_boost = max(pageblock_nr_pages, max_boost);
2294 
2295         zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
2296                 max_boost);
2297 }
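
/*
 * Worked example (illustrative): with the default watermark_boost_factor
 * of 15000 and a high watermark of 10000 pages, max_boost becomes
 * mult_frac(10000, 15000, 10000) == 15000 pages; each fallback event
 * then raises zone->watermark_boost by pageblock_nr_pages until that
 * cap is reached.
 */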
2298 
2299 /*
2300  * This function implements actual steal behaviour. If order is large enough,
2301  * we can steal whole pageblock. If not, we first move freepages in this
2302  * pageblock to our migratetype and determine how many already-allocated pages
2303  * are there in the pageblock with a compatible migratetype. If at least half
2304  * of pages are free or compatible, we can change migratetype of the pageblock
2305  * itself, so pages freed in the future will be put on the correct free list.
2306  */
2307 static void steal_suitable_fallback(struct zone *zone, struct page *page,
2308                 unsigned int alloc_flags, int start_type, bool whole_block)
2309 {
2310         unsigned int current_order = page_order(page);
2311         struct free_area *area;
2312         int free_pages, movable_pages, alike_pages;
2313         int old_block_type;
2314 
2315         old_block_type = get_pageblock_migratetype(page);
2316 
2317         /*
2318          * This can happen due to races and we want to prevent broken
2319          * highatomic accounting.
2320          */
2321         if (is_migrate_highatomic(old_block_type))
2322                 goto single_page;
2323 
2324         /* Take ownership for orders >= pageblock_order */
2325         if (current_order >= pageblock_order) {
2326                 change_pageblock_range(page, current_order, start_type);
2327                 goto single_page;
2328         }
2329 
2330         /*
2331          * Boost watermarks to increase reclaim pressure to reduce the
2332          * likelihood of future fallbacks. Wake kswapd now as the node
2333          * may be balanced overall and kswapd will not wake naturally.
2334          */
2335         boost_watermark(zone);
2336         if (alloc_flags & ALLOC_KSWAPD)
2337                 set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
2338 
2339         /* We are not allowed to try stealing from the whole block */
2340         if (!whole_block)
2341                 goto single_page;
2342 
2343         free_pages = move_freepages_block(zone, page, start_type,
2344                                                 &movable_pages);
2345         /*
2346          * Determine how many pages are compatible with our allocation.
2347          * For movable allocation, it's the number of movable pages which
2348          * we just obtained. For other types it's a bit more tricky.
2349          */
2350         if (start_type == MIGRATE_MOVABLE) {
2351                 alike_pages = movable_pages;
2352         } else {
2353                 /*
2354                  * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
2355                  * to MOVABLE pageblock, consider all non-movable pages as
2356                  * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
2357                  * vice versa, be conservative since we can't distinguish the
2358                  * exact migratetype of non-movable pages.
2359                  */
2360                 if (old_block_type == MIGRATE_MOVABLE)
2361                         alike_pages = pageblock_nr_pages
2362                                                 - (free_pages + movable_pages);
2363                 else
2364                         alike_pages = 0;
2365         }
2366 
2367         /* moving whole block can fail due to zone boundary conditions */
2368         if (!free_pages)
2369                 goto single_page;
2370 
2371         /*
2372          * If a sufficient number of pages in the block are either free or of
2373          * comparable migratability as our allocation, claim the whole block.
2374          */
2375         if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
2376                         page_group_by_mobility_disabled)
2377                 set_pageblock_migratetype(page, start_type);
2378 
2379         return;
2380 
2381 single_page:
2382         area = &zone->free_area[current_order];
2383         move_to_free_area(page, area, start_type);
2384 }
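
/*
 * The claim threshold above, worked out (illustrative, assuming
 * pageblock_order == 9): the block is re-typed once
 * free_pages + alike_pages >= 1 << 8 == 256, i.e. once at least half of
 * the 512-page block is free or already of a compatible migratetype.
 */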
2385 
2386 /*
2387  * Check whether there is a suitable fallback freepage with requested order.
2388  * If only_stealable is true, this function returns fallback_mt only if
2389  * we can steal other freepages all together. This would help to reduce
2390  * fragmentation due to mixed migratetype pages in one pageblock.
2391  */
2392 int find_suitable_fallback(struct free_area *area, unsigned int order,
2393                         int migratetype, bool only_stealable, bool *can_steal)
2394 {
2395         int i;
2396         int fallback_mt;
2397 
2398         if (area->nr_free == 0)
2399                 return -1;
2400 
2401         *can_steal = false;
2402         for (i = 0;; i++) {
2403                 fallback_mt = fallbacks[migratetype][i];
2404                 if (fallback_mt == MIGRATE_TYPES)
2405                         break;
2406 
2407                 if (free_area_empty(area, fallback_mt))
2408                         continue;
2409 
2410                 if (can_steal_fallback(order, migratetype))
2411                         *can_steal = true;
2412 
2413                 if (!only_stealable)
2414                         return fallback_mt;
2415 
2416                 if (*can_steal)
2417                         return fallback_mt;
2418         }
2419 
2420         return -1;
2421 }
2422 
2423 /*
2424  * Reserve a pageblock for exclusive use of high-order atomic allocations if
2425  * there are no empty page blocks that contain a page with a suitable order
2426  */
2427 static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
2428                                 unsigned int alloc_order)
2429 {
2430         int mt;
2431         unsigned long max_managed, flags;
2432 
2433         /*
2434          * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
2435          * Check is race-prone but harmless.
2436          */
2437         max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
2438         if (zone->nr_reserved_highatomic >= max_managed)
2439                 return;
2440 
2441         spin_lock_irqsave(&zone->lock, flags);
2442 
2443         /* Recheck the nr_reserved_highatomic limit under the lock */
2444         if (zone->nr_reserved_highatomic >= max_managed)
2445                 goto out_unlock;
2446 
2447         /* Yoink! */
2448         mt = get_pageblock_migratetype(page);
2449         if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
2450             && !is_migrate_cma(mt)) {
2451                 zone->nr_reserved_highatomic += pageblock_nr_pages;
2452                 set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
2453                 move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
2454         }
2455 
2456 out_unlock:
2457         spin_unlock_irqrestore(&zone->lock, flags);
2458 }
2459 
2460 /*
2461  * Used when an allocation is about to fail under memory pressure. This
2462  * potentially hurts the reliability of high-order allocations when under
2463  * intense memory pressure but failed atomic allocations should be easier
2464  * to recover from than an OOM.
2465  *
2466  * If @force is true, try to unreserve a pageblock even though highatomic
2467  * pageblock is exhausted.
2468  */
2469 static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2470                                                 bool force)
2471 {
2472         struct zonelist *zonelist = ac->zonelist;
2473         unsigned long flags;
2474         struct zoneref *z;
2475         struct zone *zone;
2476         struct page *page;
2477         int order;
2478         bool ret;
2479 
2480         for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2481                                                                 ac->nodemask) {
2482                 /*
2483                  * Preserve at least one pageblock unless memory pressure
2484                  * is really high.
2485                  */
2486                 if (!force && zone->nr_reserved_highatomic <=
2487                                         pageblock_nr_pages)
2488                         continue;
2489 
2490                 spin_lock_irqsave(&zone->lock, flags);
2491                 for (order = 0; order < MAX_ORDER; order++) {
2492                         struct free_area *area = &(zone->free_area[order]);
2493 
2494                         page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
2495                         if (!page)
2496                                 continue;
2497 
2498                         /*
2499                          * In the page freeing path, the migratetype change is
2500                          * racy, so we can encounter several free pages in a
2501                          * pageblock in this loop although we changed the
2502                          * pageblock type from highatomic to ac->migratetype.
2503                          * So we should adjust the count once.
2504                          */
2505                         if (is_migrate_highatomic_page(page)) {
2506                                 /*
2507                                  * It should never happen but changes to
2508                                  * locking could inadvertently allow a per-cpu
2509                                  * drain to add pages to MIGRATE_HIGHATOMIC
2510                                  * while unreserving so be safe and watch for
2511                                  * underflows.
2512                                  */
2513                                 zone->nr_reserved_highatomic -= min(
2514                                                 pageblock_nr_pages,
2515                                                 zone->nr_reserved_highatomic);
2516                         }
2517 
2518                         /*
2519                          * Convert to ac->migratetype and avoid the normal
2520                          * pageblock stealing heuristics. Minimally, the caller
2521                          * is doing the work and needs the pages. More
2522                          * importantly, if the block was always converted to
2523                          * MIGRATE_UNMOVABLE or another type then the number
2524                          * of pageblocks that cannot be completely freed
2525                          * may increase.
2526                          */
2527                         set_pageblock_migratetype(page, ac->migratetype);
2528                         ret = move_freepages_block(zone, page, ac->migratetype,
2529                                                                         NULL);
2530                         if (ret) {
2531                                 spin_unlock_irqrestore(&zone->lock, flags);
2532                                 return ret;
2533                         }
2534                 }
2535                 spin_unlock_irqrestore(&zone->lock, flags);
2536         }
2537 
2538         return false;
2539 }
2540 
2541 /*
2542  * Try finding a free buddy page on the fallback list and put it on the free
2543  * list of requested migratetype, possibly along with other pages from the same
2544  * block, depending on fragmentation avoidance heuristics. Returns true if
2545  * fallback was found so that __rmqueue_smallest() can grab it.
2546  *
2547  * The use of signed ints for order and current_order is a deliberate
2548  * deviation from the rest of this file, to make the for loop
2549  * condition simpler.
2550  */
2551 static __always_inline bool
2552 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
2553                                                 unsigned int alloc_flags)
2554 {
2555         struct free_area *area;
2556         int current_order;
2557         int min_order = order;
2558         struct page *page;
2559         int fallback_mt;
2560         bool can_steal;
2561 
2562         /*
2563          * Do not steal pages from freelists belonging to other pageblocks
2564          * i.e. orders < pageblock_order. If there are no local zones free,
2565          * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
2566          */
2567         if (alloc_flags & ALLOC_NOFRAGMENT)
2568                 min_order = pageblock_order;
2569 
2570         /*
2571          * Find the largest available free page in the other list. This roughly
2572          * approximates finding the pageblock with the most free pages, which
2573          * would be too costly to do exactly.
2574          */
2575         for (current_order = MAX_ORDER - 1; current_order >= min_order;
2576                                 --current_order) {
2577                 area = &(zone->free_area[current_order]);
2578                 fallback_mt = find_suitable_fallback(area, current_order,
2579                                 start_migratetype, false, &can_steal);
2580                 if (fallback_mt == -1)
2581                         continue;
2582 
2583                 /*
2584                  * We cannot steal all free pages from the pageblock and the
2585                  * requested migratetype is movable. In that case it's better to
2586                  * steal and split the smallest available page instead of the
2587                  * largest available page, because even if the next movable
2588                  * allocation falls back into a different pageblock than this
2589                  * one, it won't cause permanent fragmentation.
2590                  */
2591                 if (!can_steal && start_migratetype == MIGRATE_MOVABLE
2592                                         && current_order > order)
2593                         goto find_smallest;
2594 
2595                 goto do_steal;
2596         }
2597 
2598         return false;
2599 
2600 find_smallest:
2601         for (current_order = order; current_order < MAX_ORDER;
2602                                                         current_order++) {
2603                 area = &(zone->free_area[current_order]);
2604                 fallback_mt = find_suitable_fallback(area, current_order,
2605                                 start_migratetype, false, &can_steal);
2606                 if (fallback_mt != -1)
2607                         break;
2608         }
2609 
2610         /*
2611          * This should not happen - we already found a suitable fallback
2612          * when looking for the largest page.
2613          */
2614         VM_BUG_ON(current_order == MAX_ORDER);
2615 
2616 do_steal:
2617         page = get_page_from_free_area(area, fallback_mt);
2618 
2619         steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
2620                                                                 can_steal);
2621 
2622         trace_mm_page_alloc_extfrag(page, order, current_order,
2623                 start_migratetype, fallback_mt);
2624 
2625         return true;
2626 
2627 }
2628 
2629 /*
2630  * Do the hard work of removing an element from the buddy allocator.
2631  * Call me with the zone->lock already held.
2632  */
2633 static __always_inline struct page *
2634 __rmqueue(struct zone *zone, unsigned int order, int migratetype,
2635                                                 unsigned int alloc_flags)
2636 {
2637         struct page *page;
2638 
2639 retry:
2640         page = __rmqueue_smallest(zone, order, migratetype);
2641         if (unlikely(!page)) {
2642                 if (migratetype == MIGRATE_MOVABLE)
2643                         page = __rmqueue_cma_fallback(zone, order);
2644 
2645                 if (!page && __rmqueue_fallback(zone, order, migratetype,
2646                                                                 alloc_flags))
2647                         goto retry;
2648         }
2649 
2650         trace_mm_page_alloc_zone_locked(page, order, migratetype);
2651         return page;
2652 }
2653 
2654 /*
2655  * Obtain a specified number of elements from the buddy allocator, all under
2656  * a single hold of the lock, for efficiency.  Add them to the supplied list.
2657  * Returns the number of new pages which were placed at *list.
2658  */
2659 static int rmqueue_bulk(struct zone *zone, unsigned int order,
2660                         unsigned long count, struct list_head *list,
2661                         int migratetype, unsigned int alloc_flags)
2662 {
2663         int i, alloced = 0;
2664 
2665         spin_lock(&zone->lock);
2666         for (i = 0; i < count; ++i) {
2667                 struct page *page = __rmqueue(zone, order, migratetype,
2668                                                                 alloc_flags);
2669                 if (unlikely(page == NULL))
2670                         break;
2671 
2672                 if (unlikely(check_pcp_refill(page)))
2673                         continue;
2674 
2675                 /*
2676                  * Split buddy pages returned by expand() are received here in
2677                  * physical page order. The page is added to the tail of the
2678                  * caller's list. From the caller's perspective, the linked
2679                  * list is ordered by page number under some conditions. This
2680                  * is useful for IO devices that walk the list forward from
2681                  * the head, thus also in physical page order, and it is
2682                  * useful for IO devices that can merge IO requests if the
2683                  * physical pages are ordered properly.
2684                  */
2685                 list_add_tail(&page->lru, list);
2686                 alloced++;
2687                 if (is_migrate_cma(get_pcppage_migratetype(page)))
2688                         __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
2689                                               -(1 << order));
2690         }
2691 
2692         /*
2693          * i pages were removed from the buddy list even if some leaked due
2694          * to check_pcp_refill() failing, so adjust NR_FREE_PAGES based
2695          * on i. Do not confuse this with 'alloced', which is the number of
2696          * pages actually added to the pcp list.
2697          */
2698         __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
2699         spin_unlock(&zone->lock);
2700         return alloced;
2701 }
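
/*
 * For orientation (not a new code path): the per-cpu page lists are
 * refilled through this function with order == 0 and count ==
 * pcp->batch, so a single zone->lock acquisition pulls a whole batch of
 * order-0 pages onto the pcp list.
 */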
2702 
2703 #ifdef CONFIG_NUMA
2704 /*
2705  * Called from the vmstat counter updater to drain pagesets of this
2706  * currently executing processor on remote nodes after they have
2707  * expired.
2708  *
2709  * Note that this function must be called with the thread pinned to
2710  * a single processor.
2711  */
2712 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
2713 {
2714         unsigned long flags;
2715         int to_drain, batch;
2716 
2717         local_irq_save(flags);
2718         batch = READ_ONCE(pcp->batch);
2719         to_drain = min(pcp->count, batch);
2720         if (to_drain > 0)
2721                 free_pcppages_bulk(zone, to_drain, pcp);
2722         local_irq_restore(flags);
2723 }
2724 #endif
2725 
2726 /*
2727  * Drain pcplists of the indicated processor and zone.
2728  *
2729  * The processor must either be the current processor and the
2730  * thread pinned to the current processor or a processor that
2731  * is not online.
2732  */
2733 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
2734 {
2735         unsigned long flags;
2736         struct per_cpu_pageset *pset;
2737         struct per_cpu_pages *pcp;
2738 
2739         local_irq_save(flags);
2740         pset = per_cpu_ptr(zone->pageset, cpu);
2741 
2742         pcp = &pset->pcp;
2743         if (pcp->count)
2744                 free_pcppages_bulk(zone, pcp->count, pcp);
2745         local_irq_restore(flags);
2746 }
2747 
2748 /*
2749  * Drain pcplists of all zones on the indicated processor.
2750  *
2751  * The processor must either be the current processor and the
2752  * thread pinned to the current processor or a processor that
2753  * is not online.
2754  */
2755 static void drain_pages(unsigned int cpu)
2756 {
2757         struct zone *zone;
2758 
2759         for_each_populated_zone(zone) {
2760                 drain_pages_zone(cpu, zone);
2761         }
2762 }
2763 
2764 /*
2765  * Spill all of this CPU's per-cpu pages back into the buddy allocator.
2766  *
2767  * The CPU has to be pinned. When zone parameter is non-NULL, spill just
2768  * the single zone's pages.
2769  */
2770 void drain_local_pages(struct zone *zone)
2771 {
2772         int cpu = smp_processor_id();
2773 
2774         if (zone)
2775                 drain_pages_zone(cpu, zone);
2776         else
2777                 drain_pages(cpu);
2778 }
2779 
2780 static void drain_local_pages_wq(struct work_struct *work)
2781 {
2782         struct pcpu_drain *drain;
2783 
2784         drain = container_of(work, struct pcpu_drain, work);
2785 
2786         /*
2787          * drain_all_pages doesn't use proper cpu hotplug protection, so
2788          * we can race with cpu offline when the WQ moves this work from
2789          * a cpu-pinned worker to an unbound one. Running on a different
2790          * cpu is all right, but we also have to make sure not to migrate
2791          * to yet another cpu while draining.
2792          */
2793         preempt_disable();
2794         drain_local_pages(drain->zone);
2795         preempt_enable();
2796 }
2797 
2798 /*
2799  * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
2800  *
2801  * When zone parameter is non-NULL, spill just the single zone's pages.
2802  *
2803  * Note that this can be extremely slow as the draining happens in a workqueue.
2804  */
2805 void drain_all_pages(struct zone *zone)
2806 {
2807         int cpu;
2808 
2809         /*
2810          * Allocate in the BSS so we won't require allocation in the
2811          * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y.
2812          */
2813         static cpumask_t cpus_with_pcps;
2814 
2815         /*
2816          * Make sure nobody triggers this path before mm_percpu_wq is fully
2817          * initialized.
2818          */
2819         if (WARN_ON_ONCE(!mm_percpu_wq))
2820                 return;
2821 
2822         /*
2823          * Do not drain if one is already in progress unless it's specific to
2824          * a zone. Such callers are primarily CMA and memory hotplug and need
2825          * the drain to be complete when the call returns.
2826          */
2827         if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
2828                 if (!zone)
2829                         return;
2830                 mutex_lock(&pcpu_drain_mutex);
2831         }
2832 
2833         /*
2834          * We don't care about racing with a CPU hotplug event
2835          * as the offline notification will cause the notified
2836          * cpu to drain that CPU's pcps, and on_each_cpu_mask
2837          * disables preemption as part of its processing.
2838          */
2839         for_each_online_cpu(cpu) {
2840                 struct per_cpu_pageset *pcp;
2841                 struct zone *z;
2842                 bool has_pcps = false;
2843 
2844                 if (zone) {
2845                         pcp = per_cpu_ptr(zone->pageset, cpu);
2846                         if (pcp->pcp.count)
2847                                 has_pcps = true;
2848                 } else {
2849                         for_each_populated_zone(z) {
2850                                 pcp = per_cpu_ptr(z->pageset, cpu);
2851                                 if (pcp->pcp.count) {
2852                                         has_pcps = true;
2853                                         break;
2854                                 }
2855                         }
2856                 }
2857 
2858                 if (has_pcps)
2859                         cpumask_set_cpu(cpu, &cpus_with_pcps);
2860                 else
2861                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
2862         }
2863 
2864         for_each_cpu(cpu, &cpus_with_pcps) {
2865                 struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
2866 
2867                 drain->zone = zone;
2868                 INIT_WORK(&drain->work, drain_local_pages_wq);
2869                 queue_work_on(cpu, mm_percpu_wq, &drain->work);
2870         }
2871         for_each_cpu(cpu, &cpus_with_pcps)
2872                 flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
2873 
2874         mutex_unlock(&pcpu_drain_mutex);
2875 }
2876 
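/*
 * Illustrative sketch (not part of page_alloc.c): a zone-specific caller
 * such as memory hotplug or CMA typically flushes the pcp lists before it
 * checks or isolates a range, so that pages parked on per-cpu lists are
 * returned to the buddy free lists first. The helper name below is made
 * up purely for illustration.
 */
static void example_flush_zone_pcps(struct zone *zone)
{
        /* Return this zone's per-cpu pages to the buddy allocator ... */
        drain_all_pages(zone);
        /* ... or pass NULL instead to drain every populated zone. */
}
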
2877 #ifdef CONFIG_HIBERNATION
2878 
2879 /*
2880  * Touch the watchdog for every WD_PAGE_COUNT pages.
2881  */
2882 #define WD_PAGE_COUNT   (128*1024)
2883 
2884 void mark_free_pages(struct zone *zone)
2885 {
2886         unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
2887         unsigned long flags;
2888         unsigned int order, t;
2889         struct page *page;
2890 
2891         if (zone_is_empty(zone))
2892                 return;
2893 
2894         spin_lock_irqsave(&zone->lock, flags);
2895 
2896         max_zone_pfn = zone_end_pfn(zone);
2897         for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
2898                 if (pfn_valid(pfn)) {
2899                         page = pfn_to_page(pfn);
2900 
2901                         if (!--page_count) {
2902                                 touch_nmi_watchdog();
2903                                 page_count = WD_PAGE_COUNT;
2904                         }
2905 
2906                         if (page_zone(page) != zone)
2907                                 continue;
2908 
2909                         if (!swsusp_page_is_forbidden(page))
2910                                 swsusp_unset_page_free(page);
2911                 }
2912 
2913         for_each_migratetype_order(order, t) {
2914                 list_for_each_entry(page,
2915                                 &zone->free_area[order].free_list[t], lru) {
2916                         unsigned long i;
2917 
2918                         pfn = page_to_pfn(page);
2919                         for (i = 0; i < (1UL << order); i++) {
2920                                 if (!--page_count) {
2921                                         touch_nmi_watchdog();
2922                                         page_count = WD_PAGE_COUNT;
2923                                 }
2924                                 swsusp_set_page_free(pfn_to_page(pfn + i));
2925                         }
2926                 }
2927         }
2928         spin_unlock_irqrestore(&zone->lock, flags);
2929 }
2930 #endif /* CONFIG_HIBERNATION */
2931 
2932 static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
2933 {
2934         int migratetype;
2935 
2936         if (!free_pcp_prepare(page))
2937                 return false;
2938 
2939         migratetype = get_pfnblock_migratetype(page, pfn);
2940         set_pcppage_migratetype(page, migratetype);
2941         return true;
2942 }
2943 
2944 static void free_unref_page_commit(struct page *page, unsigned long pfn)
2945 {
2946         struct zone *zone = page_zone(page);
2947         struct per_cpu_pages *pcp;
2948         int migratetype;
2949 
2950         migratetype = get_pcppage_migratetype(page);
2951         __count_vm_event(PGFREE);
2952 
2953         /*
2954          * We only track unmovable, reclaimable and movable on pcp lists.
2955          * Free ISOLATE pages back to the allocator because they are being
2956          * offlined, but treat HIGHATOMIC as movable pages so we can get those
2957          * areas back if necessary. Otherwise, we may have to free
2958          * excessively into the page allocator.
2959          */
2960         if (migratetype >= MIGRATE_PCPTYPES) {
2961                 if (unlikely(is_migrate_isolate(migratetype))) {
2962                         free_one_page(zone, page, pfn, 0, migratetype);
2963                         return;
2964                 }
2965                 migratetype = MIGRATE_MOVABLE;
2966         }
2967 
2968         pcp = &this_cpu_ptr(zone->pageset)->pcp;
2969         list_add(&page->lru, &pcp->lists[migratetype]);
2970         pcp->count++;
2971         if (pcp->count >= pcp->high) {
2972                 unsigned long batch = READ_ONCE(pcp->batch);
2973                 free_pcppages_bulk(zone, batch, pcp);
2974         }
2975 }
2976 
2977 /*
2978  * Free a 0-order page
2979  */
2980 void free_unref_page(struct page *page)
2981 {
2982         unsigned long flags;
2983         unsigned long pfn = page_to_pfn(page);
2984 
2985         if (!free_unref_page_prepare(page, pfn))
2986                 return;
2987 
2988         local_irq_save(flags);
2989         free_unref_page_commit(page, pfn);
2990         local_irq_restore(flags);
2991 }
2992 
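/*
 * Illustrative sketch (not part of page_alloc.c): an order-0 page freed
 * with __free_page() ends up in free_unref_page() above and is parked on
 * the per-cpu list rather than going straight back to the buddy lists.
 * The function name is made up for illustration only.
 */
static int example_order0_roundtrip(void)
{
        struct page *page = alloc_page(GFP_KERNEL);

        if (!page)
                return -ENOMEM;
        /* ... use page_address(page) ... */
        __free_page(page);      /* order-0: takes the pcp fast path above */
        return 0;
}
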
2993 /*
2994  * Free a list of 0-order pages
2995  */
2996 void free_unref_page_list(struct list_head *list)
2997 {
2998         struct page *page, *next;
2999         unsigned long flags, pfn;
3000         int batch_count = 0;
3001 
3002         /* Prepare pages for freeing */
3003         list_for_each_entry_safe(page, next, list, lru) {
3004                 pfn = page_to_pfn(page);
3005                 if (!free_unref_page_prepare(page, pfn))
3006                         list_del(&page->lru);
3007                 set_page_private(page, pfn);
3008         }
3009 
3010         local_irq_save(flags);
3011         list_for_each_entry_safe(page, next, list, lru) {
3012                 unsigned long pfn = page_private(page);
3013 
3014                 set_page_private(page, 0);
3015                 trace_mm_page_free_batched(page);
3016                 free_unref_page_commit(page, pfn);
3017 
3018                 /*
3019                  * Guard against excessive IRQ disabled times when we get
3020                  * a large list of pages to free.
3021                  */
3022                 if (++batch_count == SWAP_CLUSTER_MAX) {
3023                         local_irq_restore(flags);
3024                         batch_count = 0;
3025                         local_irq_save(flags);
3026                 }
3027         }
3028         local_irq_restore(flags);
3029 }
3030 
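/*
 * Illustrative sketch (not part of page_alloc.c; note free_unref_page_list()
 * is mm-internal): a batch-freeing caller such as release_pages() collects
 * order-0 pages whose refcount has already dropped to zero on a list linked
 * through page->lru, then hands the whole list over in one call to amortize
 * the IRQ off/on cost. The helper name is made up for illustration.
 */
static void example_free_page_batch(struct page **pages, int nr)
{
        LIST_HEAD(to_free);
        int i;

        for (i = 0; i < nr; i++)
                list_add(&pages[i]->lru, &to_free);

        free_unref_page_list(&to_free);
}
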
3031 /*
3032  * split_page takes a non-compound higher-order page, and splits it into
3033  * n (1<<order) sub-pages: page[0..n-1].
3034  * Each sub-page must be freed individually.
3035  *
3036  * Note: this is probably too low level an operation for use in drivers.
3037  * Please consult with lkml before using this in your driver.
3038  */
3039 void split_page(struct page *page, unsigned int order)
3040 {
3041         int i;
3042 
3043         VM_BUG_ON_PAGE(PageCompound(page), page);
3044         VM_BUG_ON_PAGE(!page_count(page), page);
3045 
3046         for (i = 1; i < (1 << order); i++)
3047                 set_page_refcounted(page + i);
3048         split_page_owner(page, order);
3049 }
3050 EXPORT_SYMBOL_GPL(split_page);
3051 
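/*
 * Illustrative sketch (not part of page_alloc.c): the usual split_page()
 * pattern is to allocate a non-compound higher-order block and then hand
 * out or free the resulting order-0 pages individually. The function name
 * is made up for illustration only.
 */
static struct page *example_alloc_split(unsigned int order)
{
        struct page *page = alloc_pages(GFP_KERNEL, order);

        if (!page)
                return NULL;

        split_page(page, order);        /* now (1 << order) order-0 pages */
        /* e.g. free the last sub-page and keep the rest */
        __free_page(page + (1 << order) - 1);
        return page;
}
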
3052 int __isolate_free_page(struct page *page, unsigned int order)
3053 {
3054         struct free_area *area = &page_zone(page)->free_area[order];
3055         unsigned long watermark;
3056         struct zone *zone;
3057         int mt;
3058 
3059         BUG_ON(!PageBuddy(page));
3060 
3061         zone = page_zone(page);
3062         mt = get_pageblock_migratetype(page);
3063 
3064         if (!is_migrate_isolate(mt)) {
3065                 /*
3066                  * Obey watermarks as if the page was being allocated. We can
3067                  * emulate a high-order watermark check with a raised order-0
3068                  * watermark, because we already know our high-order page
3069                  * exists.
3070                  */
3071                 watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
3072                 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
3073                         return 0;
3074 
3075                 __mod_zone_freepage_state(zone, -(1UL << order), mt);
3076         }
3077 
3078         /* Remove page from free list */
3079 
3080         del_page_from_free_area(page, area);
3081 
3082         /*
3083          * Set the pageblock's migratetype if the isolated page covers at
3084          * least half of a pageblock.
3085          */
3086         if (order >= pageblock_order - 1) {
3087                 struct page *endpage = page + (1 << order) - 1;
3088                 for (; page < endpage; page += pageblock_nr_pages) {
3089                         int mt = get_pageblock_migratetype(page);
3090                         if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
3091                             && !is_migrate_highatomic(mt))
3092                                 set_pageblock_migratetype(page,
3093                                                           MIGRATE_MOVABLE);
3094                 }
3095         }
3096 
3097 
3098         return 1UL << order;
3099 }
3100 
3101 /*
3102  * Update NUMA hit/miss statistics
3103  *
3104  * Must be called with interrupts disabled.
3105  */
3106 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
3107 {
3108 #ifdef CONFIG_NUMA
3109         enum numa_stat_item local_stat = NUMA_LOCAL;
3110 
3111         /* skip NUMA counter updates if NUMA stats are disabled */
3112         if (!static_branch_likely(&vm_numa_stat_key))
3113                 return;
3114 
3115         if (zone_to_nid(z) != numa_node_id())
3116                 local_stat = NUMA_OTHER;
3117 
3118         if (zone_to_nid(z) == zone_to_nid(preferred_zone))
3119                 __inc_numa_state(z, NUMA_HIT);
3120         else {
3121                 __inc_numa_state(z, NUMA_MISS);
3122                 __inc_numa_state(preferred_zone, NUMA_FOREIGN);
3123         }
3124         __inc_numa_state(z, local_stat);
3125 #endif
3126 }
3127 
3128 /* Remove page from the per-cpu list, caller must protect the list */
3129 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
3130                         unsigned int alloc_flags,
3131                         struct per_cpu_pages *pcp,
3132                         struct list_head *list)
3133 {
3134         struct page *page;
3135 
3136         do {
3137                 if (list_empty(list)) {
3138                         pcp->count += rmqueue_bulk(zone, 0,
3139                                         pcp->batch, list,
3140                                         migratetype, alloc_flags);
3141                         if (unlikely(list_empty(list)))
3142                                 return NULL;
3143                 }
3144 
3145                 page = list_first_entry(list, struct page, lru);
3146                 list_del(&page->lru);
3147                 pcp->count--;
3148         } while (check_new_pcp(page));
3149 
3150         return page;
3151 }
3152 
3153 /* Lock and remove page from the per-cpu list */
3154 static struct page *rmqueue_pcplist(struct zone *preferred_zone,
3155                         struct zone *zone, gfp_t gfp_flags,
3156                         int migratetype, unsigned int alloc_flags)
3157 {
3158         struct per_cpu_pages *pcp;
3159         struct list_head *list;
3160         struct page *page;
3161         unsigned long flags;
3162 
3163         local_irq_save(flags);
3164         pcp = &this_cpu_ptr(zone->pageset)->pcp;
3165         list = &pcp->lists[migratetype];
3166         page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
3167         if (page) {
3168                 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
3169                 zone_statistics(preferred_zone, zone);
3170         }
3171         local_irq_restore(flags);
3172         return page;
3173 }
3174 
3175 /*
3176  * Allocate a page from the given zone. Use pcplists for order-0 allocations.
3177  */
3178 static inline
3179 struct page *rmqueue(struct zone *preferred_zone,
3180                         struct zone *zone, unsigned int order,
3181                         gfp_t gfp_flags, unsigned int alloc_flags,
3182                         int migratetype)
3183 {
3184         unsigned long flags;
3185         struct page *page;
3186 
3187         if (likely(order == 0)) {
3188                 page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
3189                                         migratetype, alloc_flags);
3190                 goto out;
3191         }
3192 
3193         /*
3194          * We most definitely don't want callers attempting to
3195          * allocate greater than order-1 page units with __GFP_NOFAIL.
3196          */
3197         WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
3198         spin_lock_irqsave(&zone->lock, flags);
3199 
3200         do {
3201                 page = NULL;
3202                 if (alloc_flags & ALLOC_HARDER) {
3203                         page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
3204                         if (page)
3205                                 trace_mm_page_alloc_zone_locked(page, order, migratetype);
3206                 }
3207                 if (!page)
3208                         page = __rmqueue(zone, order, migratetype, alloc_flags);
3209         } while (page && check_new_pages(page, order));
3210         spin_unlock(&zone->lock);
3211         if (!page)
3212                 goto failed;
3213         __mod_zone_freepage_state(zone, -(1 << order),
3214                                   get_pcppage_migratetype(page));
3215 
3216         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
3217         zone_statistics(preferred_zone, zone);
3218         local_irq_restore(flags);
3219 
3220 out:
3221         /* Separate test+clear to avoid unnecessary atomics */
3222         if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
3223                 clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
3224                 wakeup_kswapd(zone, 0, 0, zone_idx(zone));
3225         }
3226 
3227         VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
3228         return page;
3229 
3230 failed:
3231         local_irq_restore(flags);
3232         return NULL;
3233 }
3234 
3235 #ifdef CONFIG_FAIL_PAGE_ALLOC
3236 
3237 static struct {
3238         struct fault_attr attr;
3239 
3240         bool ignore_gfp_highmem;
3241         bool ignore_gfp_reclaim;
3242         u32 min_order;
3243 } fail_page_alloc = {
3244         .attr = FAULT_ATTR_INITIALIZER,
3245         .ignore_gfp_reclaim = true,
3246         .ignore_gfp_highmem = true,
3247         .min_order = 1,
3248 };
3249 
3250 static int __init setup_fail_page_alloc(char *str)
3251 {
3252         return setup_fault_attr(&fail_page_alloc.attr, str);
3253 }
3254 __setup("fail_page_alloc=", setup_fail_page_alloc);
3255 
3256 static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3257 {
3258         if (order < fail_page_alloc.min_order)
3259                 return false;
3260         if (gfp_mask & __GFP_NOFAIL)
3261                 return false;
3262         if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
3263                 return false;
3264         if (fail_page_alloc.ignore_gfp_reclaim &&
3265                         (gfp_mask & __GFP_DIRECT_RECLAIM))
3266                 return false;
3267 
3268         return should_fail(&fail_page_alloc.attr, 1 << order);
3269 }
3270 
3271 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3272 
3273 static int __init fail_page_alloc_debugfs(void)
3274 {
3275         umode_t mode = S_IFREG | 0600;
3276         struct dentry *dir;
3277 
3278         dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
3279                                         &fail_page_alloc.attr);
3280 
3281         debugfs_create_bool("ignore-gfp-wait", mode, dir,
3282                             &fail_page_alloc.ignore_gfp_reclaim);
3283         debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3284                             &fail_page_alloc.ignore_gfp_highmem);
3285         debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
3286 
3287         return 0;
3288 }
3289 
3290 late_initcall(fail_page_alloc_debugfs);
3291 
3292 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
3293 
3294 #else /* CONFIG_FAIL_PAGE_ALLOC */
3295 
3296 static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3297 {
3298         return false;
3299 }
3300 
3301 #endif /* CONFIG_FAIL_PAGE_ALLOC */
3302 
3303 static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3304 {
3305         return __should_fail_alloc_page(gfp_mask, order);
3306 }
3307 ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
3308 
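/*
 * Usage note (illustrative, not part of this file): with
 * CONFIG_FAIL_PAGE_ALLOC the fault attributes above are tuned either via
 * the "fail_page_alloc=<interval>,<probability>,<space>,<times>" boot
 * parameter parsed by setup_fault_attr(), or at runtime through the
 * debugfs directory created by fail_page_alloc_debugfs() above (the
 * ignore-gfp-wait, ignore-gfp-highmem and min-order knobs, plus the
 * generic fault-injection attributes).
 */
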
3309 /*
3310  * Return true if free base pages are above 'mark'. For high-order checks it
3311  * will return true if the order-0 watermark is reached and there is at least
3312  * one free page of a suitable size. Checking now avoids taking the zone lock
3313  * to check in the allocation paths if no pages are free.
3314  */
3315 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3316                          int classzone_idx, unsigned int alloc_flags,
3317                          long free_pages)
3318 {
3319         long min = mark;
3320         int o;
3321         const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
3322 
3323         /* free_pages may go negative - that's OK */
3324         free_pages -= (1 << order) - 1;
3325 
3326         if (alloc_flags & ALLOC_HIGH)
3327                 min -= min / 2;
3328 
3329         /*
3330          * If the caller does not have rights to ALLOC_HARDER then subtract
3331          * the high-atomic reserves. This will over-estimate the size of the
3332          * atomic reserve but it avoids a search.
3333          */
3334         if (likely(!alloc_harder)) {
3335                 free_pages -= z->nr_reserved_highatomic;
3336         } else {
3337                 /*
3338                  * OOM victims can try even harder than normal ALLOC_HARDER
3339                  * users on the grounds that they are definitely going to be in
3340                  * the exit path shortly and will free memory. Any allocation they
3341                  * make during the free path will be small and short-lived.
3342                  */
3343                 if (alloc_flags & ALLOC_OOM)
3344                         min -= min / 2;
3345                 else
3346                         min -= min / 4;
3347         }
3348 
3349 
3350 #ifdef CONFIG_CMA
3351         /* If allocation can't use CMA areas don't use free CMA pages */
3352         if (!(alloc_flags & ALLOC_CMA))
3353                 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
3354 #endif
3355 
3356         /*
3357          * Check watermarks for an order-0 allocation request. If these
3358          * are not met, then a high-order request also cannot go ahead
3359          * even if a suitable page happened to be free.
3360          */
3361         if (free_pages <= min + z->lowmem_reserve[classzone_idx])
3362                 return false;
3363 
3364         /* If this is an order-0 request then the watermark is fine */
3365         if (!order)
3366                 return true;
3367 
3368         /* For a high-order request, check that at least one suitable page is free */
3369         for (o = order; o < MAX_ORDER; o++) {
3370                 struct free_area *area = &z->free_area[o];
3371                 int mt;
3372 
3373                 if (!area->nr_free)
3374                         continue;
3375 
3376                 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
3377                         if (!free_area_empty(area, mt))
3378                                 return true;
3379                 }
3380 
3381 #ifdef CONFIG_CMA
3382                 if ((alloc_flags & ALLOC_CMA) &&
3383                     !free_area_empty(area, MIGRATE_CMA)) {
3384                         return true;
3385                 }
3386 #endif
3387                 if (alloc_harder &&
3388                         !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
3389                         return true;
3390         }
3391         return false;
3392 }
3393 
3394 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3395                       int classzone_idx, unsigned int alloc_flags)
3396 {
3397         return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3398                                         zone_page_state(z, NR_FREE_PAGES));
3399 }
3400 
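/*
 * Illustrative sketch (not part of page_alloc.c): a reclaim-side check in
 * the style of kswapd or compaction asks whether a zone could satisfy an
 * order-0 request at its min watermark, without taking the zone lock. The
 * helper name is made up for illustration only.
 */
static bool example_zone_has_room(struct zone *zone)
{
        return zone_watermark_ok(zone, 0, min_wmark_pages(zone),
                                 zone_idx(zone), 0);
}
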
3401 static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
3402                 unsigned long mark, int classzone_idx, unsigned int alloc_flags)
3403 {
3404         long free_pages = zone_page_state(z, NR_FREE_PAGES);
3405         long cma_pages = 0;
3406 
3407 #ifdef CONFIG_CMA
3408         /* If allocation can't use CMA areas don't use free CMA pages */
3409         if (!(alloc_flags & ALLOC_CMA))
3410                 cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
3411 #endif
3412 
3413         /*
3414          * Fast check for order-0 only. If this fails then the reserves
3415          * need to be calculated. There is a corner case where the check
3416          * passes but only the high-order atomic reserves are free. If
3417          * the caller is !atomic then it'll uselessly search the free
3418          * list. That corner case is then slower but it is harmless.
3419          */
3420         if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
3421                 return true;
3422 
3423         return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3424                                         free_pages);
3425 }
3426 
3427 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
3428                         unsigned long mark, int classzone_idx)
3429 {
3430         long free_pages = zone_page_state(z, NR_FREE_PAGES);
3431 
3432         if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
3433                 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
3434 
3435         return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
3436                                                                 free_pages);
3437 }
3438 
3439 #ifdef CONFIG_NUMA
3440 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3441 {
3442         return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
3443                                 RECLAIM_DISTANCE;
3444 }
3445 #else   /* CONFIG_NUMA */
3446 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3447 {
3448         return true;
3449 }
3450 #endif  /* CONFIG_NUMA */
3451 
3452 /*
3453  * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
3454  * fragmentation is subtle. If the preferred zone was HIGHMEM then
3455  * premature use of a lower zone may cause lowmem pressure problems that
3456  * are worse than fragmentation. If the next zone is ZONE_DMA then it is
3457  * probably too small. It only makes sense to spread allocations to avoid
3458  * fragmentation between the Normal and DMA32 zones.
3459  */
3460 static inline unsigned int
3461 alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
3462 {
3463         unsigned int alloc_flags = 0;
3464 
3465         if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3466                 alloc_flags |= ALLOC_KSWAPD;
3467 
3468 #ifdef CONFIG_ZONE_DMA32
3469         if (!zone)
3470                 return alloc_flags;
3471 
3472         if (zone_idx(zone) != ZONE_NORMAL)
3473                 return alloc_flags;
3474 
3475         /*
3476          * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
3477          * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
3478          * on UMA that if Normal is populated then so is DMA32.
3479          */
3480         BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
3481         if (nr_online_nodes > 1 && !populated_zone(--zone))
3482                 return alloc_flags;
3483 
3484         alloc_flags |= ALLOC_NOFRAGMENT;
3485 #endif /* CONFIG_ZONE_DMA32 */
3486         return alloc_flags;
3487 }
3488 
3489 /*
3490  * get_page_from_freelist goes through the zonelist trying to allocate
3491  * a page.
3492  */
3493 static struct page *
3494 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3495                                                 const struct alloc_context *ac)
3496 {
3497         struct zoneref *z;
3498         struct zone *zone;
3499         struct pglist_data *last_pgdat_dirty_limit = NULL;
3500         bool no_fallback;
3501 
3502 retry:
3503         /*
3504          * Scan zonelist, looking for a zone with enough free pages.
3505          * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
3506          */
3507         no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
3508         z = ac->preferred_zoneref;
3509         for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3510                                                                 ac->nodemask) {
3511                 struct page *page;
3512                 unsigned long mark;
3513 
3514                 if (cpusets_enabled() &&
3515                         (alloc_flags & ALLOC_CPUSET) &&
3516                         !__cpuset_zone_allowed(zone, gfp_mask))
3517                                 continue;
3518                 /*
3519                  * When allocating a page cache page for writing, we
3520                  * want to get it from a node that is within its dirty
3521                  * limit, such that no single node holds more than its
3522                  * proportional share of globally allowed dirty pages.
3523                  * The dirty limits take into account the node's
3524                  * lowmem reserves and high watermark so that kswapd
3525                  * should be able to balance it without having to
3526                  * write pages from its LRU list.
3527                  *
3528                  * XXX: For now, allow allocations to potentially
3529                  * exceed the per-node dirty limit in the slowpath
3530                  * (spread_dirty_pages unset) before going into reclaim,
3531                  * which is important when on a NUMA setup the allowed
3532                  * nodes are together not big enough to reach the
3533                  * global limit.  The proper fix for these situations
3534                  * will require awareness of nodes in the
3535                  * dirty-throttling and the flusher threads.
3536                  */
3537                 if (ac->spread_dirty_pages) {
3538                         if (last_pgdat_dirty_limit == zone->zone_pgdat)
3539                                 continue;
3540 
3541                         if (!node_dirty_ok(zone->zone_pgdat)) {
3542                                 last_pgdat_dirty_limit = zone->zone_pgdat;
3543                                 continue;
3544                         }
3545                 }
3546 
3547                 if (no_fallback && nr_online_nodes > 1 &&
3548                     zone != ac->preferred_zoneref->zone) {
3549                         int local_nid;
3550 
3551                         /*
3552                          * If moving to a remote node, retry but allow
3553                          * fragmenting fallbacks. Locality is more important
3554                          * than fragmentation avoidance.
3555                          */
3556                         local_nid = zone_to_nid(ac->preferred_zoneref->zone);
3557                         if (zone_to_nid(zone) != local_nid) {
3558                                 alloc_flags &= ~ALLOC_NOFRAGMENT;
3559                                 goto retry;
3560                         }
3561                 }
3562 
3563                 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
3564                 if (!zone_watermark_fast(zone, order, mark,
3565                                        ac_classzone_idx(ac), alloc_flags)) {
3566                         int ret;
3567 
3568 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
3569                         /*
3570                          * Watermark failed for this zone, but see if we can
3571                          * grow this zone if it contains deferred pages.
3572                          */
3573                         if (static_branch_unlikely(&deferred_pages)) {
3574                                 if (_deferred_grow_zone(zone, order))
3575                                         goto try_this_zone;
3576                         }
3577 #endif
3578                         /* Checked here to keep the fast path fast */
3579                         BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
3580                         if (alloc_flags & ALLOC_NO_WATERMARKS)
3581                                 goto try_this_zone;
3582 
3583                         if (node_reclaim_mode == 0 ||
3584                             !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
3585                                 continue;
3586 
3587                         ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
3588                         switch (ret) {
3589                         case NODE_RECLAIM_NOSCAN:
3590                                 /* did not scan */
3591                                 continue;
3592                         case NODE_RECLAIM_FULL:
3593                                 /* scanned but unreclaimable */
3594                                 continue;
3595                         default:
3596                                 /* did we reclaim enough */
3597                                 if (zone_watermark_ok(zone, order, mark,
3598                                                 ac_classzone_idx(ac), alloc_flags))
3599                                         goto try_this_zone;
3600 
3601                                 continue;
3602                         }
3603                 }
3604 
3605 try_this_zone:
3606                 page = rmqueue(ac->preferred_zoneref->zone, zone, order,
3607                                 gfp_mask, alloc_flags, ac->migratetype);
3608                 if (page) {
3609                         prep_new_page(page, order, gfp_mask, alloc_flags);
3610 
3611                         /*
3612                          * If this is a high-order atomic allocation then check
3613                          * if the pageblock should be reserved for the future
3614                          */
3615                         if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
3616                                 reserve_highatomic_pageblock(page, zone, order);
3617 
3618                         return page;
3619                 } else {
3620 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
3621                         /* Try again if zone has deferred pages */
3622                         if (static_branch_unlikely(&deferred_pages)) {
3623                                 if (_deferred_grow_zone(zone, order))
3624                                         goto try_this_zone;
3625                         }
3626 #endif
3627                 }
3628         }
3629 
3630         /*
3631          * It's possible on a UMA machine to get through all zones that are
3632          * fragmented. If avoiding fragmentation, reset and try again.
3633          */
3634         if (no_fallback) {
3635                 alloc_flags &= ~ALLOC_NOFRAGMENT;
3636                 goto retry;
3637         }
3638 
3639         return NULL;
3640 }
3641 
3642 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
3643 {
3644         unsigned int filter = SHOW_MEM_FILTER_NODES;
3645         static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
3646 
3647         if (!__ratelimit(&show_mem_rs))
3648                 return;
3649 
3650         /*
3651          * This documents exceptions given to allocations in certain
3652          * contexts that are allowed to allocate outside current's set
3653          * of allowed nodes.
3654          */
3655         if (!(gfp_mask & __GFP_NOMEMALLOC))
3656                 if (tsk_is_oom_victim(current) ||
3657                     (current->flags & (PF_MEMALLOC | PF_EXITING)))
3658                         filter &= ~SHOW_MEM_FILTER_NODES;
3659         if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
3660                 filter &= ~SHOW_MEM_FILTER_NODES;
3661 
3662         show_mem(filter, nodemask);
3663 }
3664 
3665 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
3666 {
3667         struct va_format vaf;
3668         va_list args;
3669         static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
3670                                       DEFAULT_RATELIMIT_BURST);
3671 
3672         if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
3673                 return;
3674 
3675         va_start(args, fmt);
3676         vaf.fmt = fmt;
3677         vaf.va = &args;
3678         pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
3679                         current->comm, &vaf, gfp_mask, &gfp_mask,
3680                         nodemask_pr_args(nodemask));
3681         va_end(args);
3682 
3683         cpuset_print_current_mems_allowed();
3684         pr_cont("\n");
3685         dump_stack();
3686         warn_alloc_show_mem(gfp_mask, nodemask);
3687 }
3688 
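/*
 * Illustrative sketch (not part of this file's real call sites): this is
 * essentially how the slowpath reports a final failure before returning
 * NULL; the output is rate-limited and suppressed by __GFP_NOWARN. The
 * wrapper name is made up for illustration only.
 */
static void example_report_failure(gfp_t gfp_mask, unsigned int order)
{
        warn_alloc(gfp_mask, NULL,
                   "page allocation failure: order:%u", order);
}
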
3689 static inline struct page *
3690 __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
3691                               unsigned int alloc_flags,
3692                               const struct alloc_context *ac)
3693 {
3694         struct page *page;
3695 
3696         page = get_page_from_freelist(gfp_mask, order,
3697                         alloc_flags|ALLOC_CPUSET, ac);
3698         /*
3699          * fallback to ignore cpuset restriction if our nodes
3700          * are depleted
3701          */
3702         if (!page)
3703                 page = get_page_from_freelist(gfp_mask, order,
3704                                 alloc_flags, ac);
3705 
3706         return page;
3707 }
3708 
3709 static inline struct page *
3710 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
3711         const struct alloc_context *ac, unsigned long *did_some_progress)
3712 {
3713         struct oom_control oc = {
3714                 .zonelist = ac->zonelist,
3715                 .nodemask = ac->nodemask,
3716                 .memcg = NULL,
3717                 .gfp_mask = gfp_mask,
3718                 .order = order,
3719         };
3720         struct page *page;
3721 
3722         *did_some_progress = 0;
3723 
3724         /*
3725          * Acquire the oom lock.  If that fails, somebody else is
3726          * making progress for us.
3727          */
3728         if (!mutex_trylock(&oom_lock)) {
3729                 *did_some_progress = 1;
3730                 schedule_timeout_uninterruptible(1);
3731                 return NULL;
3732         }
3733 
3734         /*
3735          * Go through the zonelist yet one more time, keeping a very high
3736          * watermark here; this only catches a parallel oom killing, and we
3737          * must fail if we're still under heavy pressure. But make sure this
3738          * attempt does not rely on a __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
3739          * allocation, which would never fail while oom_lock is already held.
3740          */
3741         page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
3742                                       ~__GFP_DIRECT_RECLAIM, order,
3743                                       ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
3744         if (page)
3745                 goto out;
3746 
3747         /* Coredumps can quickly deplete all memory reserves */
3748         if (current->flags & PF_DUMPCORE)
3749                 goto out;
3750         /* The OOM killer will not help higher order allocs */
3751         if (order > PAGE_ALLOC_COSTLY_ORDER)
3752                 goto out;
3753         /*
3754          * We have already exhausted all our reclaim opportunities without any
3755          * success so it is time to admit defeat. We will skip the OOM killer
3756          * because it is very likely that the caller has a more reasonable
3757          * fallback than shooting a random task.
3758          */
3759         if (gfp_mask & __GFP_RETRY_MAYFAIL)
3760                 goto out;
3761         /* The OOM killer does not needlessly kill tasks for lowmem */
3762         if (ac->high_zoneidx < ZONE_NORMAL)
3763                 goto out;
3764         if (pm_suspended_storage())
3765                 goto out;
3766         /*
3767          * XXX: GFP_NOFS allocations should rather fail than rely on
3768          * other request to make a forward progress.
3769          * We are in an unfortunate situation where out_of_memory cannot
3770          * do much for this context but let's try it to at least get
3771          * access to memory reserved if the current task is killed (see
3772          * out_of_memory). Once filesystems are ready to handle allocation
3773          * failures more gracefully we should just bail out here.
3774          */
3775 
3776         /* The OOM killer may not free memory on a specific node */
3777         if (gfp_mask & __GFP_THISNODE)
3778                 goto out;
3779 
3780         /* Exhausted what can be done so it's blame time */
3781         if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
3782                 *did_some_progress = 1;
3783 
3784                 /*
3785                  * Help non-failing allocations by giving them access to memory
3786                  * reserves
3787                  */
3788                 if (gfp_mask & __GFP_NOFAIL)
3789                         page = __alloc_pages_cpuset_fallback(gfp_mask, order,
3790                                         ALLOC_NO_WATERMARKS, ac);
3791         }
3792 out:
3793         mutex_unlock(&oom_lock);
3794         return page;
3795 }
3796 
3797 /*
3798  * Maximum number of compaction retries with progress before the OOM
3799  * killer is considered the only way to move forward.
3800  */
3801 #define MAX_COMPACT_RETRIES 16
3802 
3803 #ifdef CONFIG_COMPACTION
3804 /* Try memory compaction for high-order allocations before reclaim */
3805 static struct page *
3806 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3807                 unsigned int alloc_flags, const struct alloc_context *ac,
3808                 enum compact_priority prio, enum compact_result *compact_result)
3809 {
3810         struct page *page = NULL;
3811         unsigned long pflags;
3812         unsigned int noreclaim_flag;
3813 
3814         if (!order)
3815                 return NULL;
3816 
3817         psi_memstall_enter(&pflags);
3818         noreclaim_flag = memalloc_noreclaim_save();
3819 
3820         *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3821                                                                 prio, &page);
3822 
3823         memalloc_noreclaim_restore(noreclaim_flag);
3824         psi_memstall_leave(&pflags);
3825 
3826         /*
3827          * Compaction wasn't deferred or skipped in at least one zone, so let's
3828          * count a compaction stall.
3829          */
3830         count_vm_event(COMPACTSTALL);
3831 
3832         /* Prep a captured page if available */
3833         if (page)
3834                 prep_new_page(page, order, gfp_mask, alloc_flags);
3835 
3836         /* Try to get a page from the freelist if available */
3837         if (!page)
3838                 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3839 
3840         if (page) {
3841                 struct zone *zone = page_zone(page);
3842 
3843                 zone->compact_blockskip_flush = false;
3844                 compaction_defer_reset(zone, order, true);
3845                 count_vm_event(COMPACTSUCCESS);
3846                 return page;
3847         }
3848 
3849         /*
3850          * It's bad if a compaction run occurs and fails. The most likely reason
3851          * is that pages exist, but not enough to satisfy watermarks.
3852          */
3853         count_vm_event(COMPACTFAIL);
3854 
3855         cond_resched();
3856 
3857         return NULL;
3858 }
3859 
3860 static inline bool
3861 should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3862                      enum compact_result compact_result,
3863                      enum compact_priority *compact_priority,
3864                      int *compaction_retries)
3865 {
3866         int max_retries = MAX_COMPACT_RETRIES;
3867         int min_priority;
3868         bool ret = false;
3869         int retries = *compaction_retries;
3870         enum compact_priority priority = *compact_priority;
3871 
3872         if (!order)
3873                 return false;
3874 
3875         if (compaction_made_progress(compact_result))
3876                 (*compaction_retries)++;
3877 
3878         /*
3879          * compaction considers all the zones as desperately out of memory,
3880          * so it doesn't really make much sense to retry except when the
3881          * failure could be caused by insufficient priority.
3882          */
3883         if (compaction_failed(compact_result))
3884                 goto check_priority;
3885 
3886         /*
3887          * make sure the compaction wasn't deferred or didn't bail out early
3888          * due to lock contention before we declare that we should give up.
3889          * But do not retry if the given zonelist is not suitable for
3890          * compaction.
3891          */
3892         if (compaction_withdrawn(compact_result)) {
3893                 ret = compaction_zonelist_suitable(ac, order, alloc_flags);
3894                 goto out;
3895         }
3896 
3897         /*
3898          * !costly requests are much more important than __GFP_RETRY_MAYFAIL
3899          * costly ones because they are de facto nofail and invoke the OOM
3900          * killer to move on, while costly ones can fail and their users are
3901          * ready to cope with that. Allowing 1/4 of the retries is rather
3902          * arbitrary, but we would need much more detailed feedback from
3903          * compaction to make a better decision.
3904          */
3905         if (order > PAGE_ALLOC_COSTLY_ORDER)
3906                 max_retries /= 4;
3907         if (*compaction_retries <= max_retries) {
3908                 ret = true;
3909                 goto out;
3910         }
3911 
3912         /*
3913          * Make sure there are attempts at the highest priority if we exhausted
3914          * all retries or failed at the lower priorities.
3915          */
3916 check_priority:
3917         min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
3918                         MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
3919 
3920         if (*compact_priority > min_priority) {
3921                 (*compact_priority)--;
3922                 *compaction_retries = 0;
3923                 ret = true;
3924         }
3925 out:
3926         trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
3927         return ret;
3928 }
3929 #else
3930 static inline struct page *
3931 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3932                 unsigned int alloc_flags, const struct alloc_context *ac,
3933                 enum compact_priority prio, enum compact_result *compact_result)
3934 {
3935         *compact_result = COMPACT_SKIPPED;
3936         return NULL;
3937 }
3938 
3939 static inline bool
3940 should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
3941                      enum compact_result compact_result,
3942                      enum compact_priority *compact_priority,
3943                      int *compaction_retries)
3944 {
3945         struct zone *zone;
3946         struct zoneref *z;
3947 
3948         if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
3949                 return false;
3950 
3951         /*
3952          * There are setups with compaction disabled which would prefer to loop
3953          * inside the allocator rather than hit the oom killer prematurely.
3954          * Let's give them some hope and keep retrying while the order-0
3955          * watermarks are OK.
3956          */
3957         for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3958                                         ac->nodemask) {
3959                 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
3960                                         ac_classzone_idx(ac), alloc_flags))
3961                         return true;
3962         }
3963         return false;
3964 }
3965 #endif /* CONFIG_COMPACTION */
3966 
3967 #ifdef CONFIG_LOCKDEP
3968 static struct lockdep_map __fs_reclaim_map =
3969         STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
3970 
3971 static bool __need_fs_reclaim(gfp_t gfp_mask)
3972 {
3973         gfp_mask = current_gfp_context(gfp_mask);
3974 
3975         /* no reclaim without waiting on it */
3976         if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
3977                 return false;
3978 
3979         /* this guy won't enter reclaim */
3980         if (current->flags & PF_MEMALLOC)
3981                 return false;
3982 
3983         /* We're only interested in __GFP_FS allocations for now */
3984         if (!(gfp_mask & __GFP_FS))
3985                 return false;
3986 
3987         if (gfp_mask & __GFP_NOLOCKDEP)
3988                 return false;
3989 
3990         return true;
3991 }
3992 
3993 void __fs_reclaim_acquire(void)
3994 {
3995         lock_map_acquire(&__fs_reclaim_map);
3996 }
3997 
3998 void __fs_reclaim_release(void)
3999 {
4000         lock_map_release(&__fs_reclaim_map);
4001 }
4002 
4003 void fs_reclaim_acquire(gfp_t gfp_mask)
4004 {
4005         if (__need_fs_reclaim(gfp_mask))
4006                 __fs_reclaim_acquire();
4007 }
4008 EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
4009 
4010 void fs_reclaim_release(gfp_t gfp_mask)
4011 {
4012         if (__need_fs_reclaim(gfp_mask))
4013                 __fs_reclaim_release();
4014 }
4015 EXPORT_SYMBOL_GPL(fs_reclaim_release);
4016 #endif
4017 
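/*
 * Illustrative sketch (not part of page_alloc.c): a filesystem holding
 * locks that its own reclaim path could take marks the region with
 * memalloc_nofs_save(), so allocations inside it are treated as !__GFP_FS
 * by current_gfp_context() and __need_fs_reclaim()/lockdep do not flag a
 * reclaim recursion. The wrapper name is made up for illustration only.
 */
static void *example_fs_locked_alloc(size_t size)
{
        unsigned int nofs_flags = memalloc_nofs_save();
        void *p = kmalloc(size, GFP_KERNEL);    /* effectively GFP_NOFS here */

        memalloc_nofs_restore(nofs_flags);
        return p;
}
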
4018 /* Perform direct synchronous page reclaim */
4019 static int
4020 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
4021                                         const struct alloc_context *ac)
4022 {
4023         struct reclaim_state reclaim_state;
4024         int progress;
4025         unsigned int noreclaim_flag;
4026         unsigned long pflags;
4027 
4028         cond_resched();
4029 
4030         /* We now go into synchronous reclaim */
4031         cpuset_memory_pressure_bump();
4032         psi_memstall_enter(&pflags);
4033         fs_reclaim_acquire(gfp_mask);
4034         noreclaim_flag = memalloc_noreclaim_save();
4035         reclaim_state.reclaimed_slab = 0;
4036         current->reclaim_state = &reclaim_state;
4037 
4038         progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
4039                                                                 ac->nodemask);
4040 
4041         current->reclaim_state = NULL;
4042         memalloc_noreclaim_restore(noreclaim_flag);
4043         fs_reclaim_release(gfp_mask);
4044         psi_memstall_leave(&pflags);
4045 
4046         cond_resched();
4047 
4048         return progress;
4049 }
4050 
4051 /* The really slow allocator path where we enter direct reclaim */
4052 static inline struct page *
4053 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
4054                 unsigned int alloc_flags, const struct alloc_context *ac,
4055                 unsigned long *did_some_progress)
4056 {
4057         struct page *page = NULL;
4058         bool drained = false;
4059 
4060         *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
4061         if (unlikely(!(*did_some_progress)))
4062                 return NULL;
4063 
4064 retry:
4065         page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4066 
4067         /*
4068          * If an allocation failed after direct reclaim, it could be because
4069          * pages are pinned on the per-cpu lists or in high alloc reserves.
4070          * Shrink them and try again.
4071          */
4072         if (!page && !drained) {
4073                 unreserve_highatomic_pageblock(ac, false);
4074                 drain_all_pages(NULL);
4075                 drained = true;
4076                 goto retry;
4077         }
4078 
4079         return page;
4080 }
4081 
4082 static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
4083                              const struct alloc_context *ac)
4084 {
4085         struct zoneref *z;
4086         struct zone *zone;
4087         pg_data_t *last_pgdat = NULL;
4088         enum zone_type high_zoneidx = ac->high_zoneidx;
4089 
4090         for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
4091                                         ac->nodemask) {
4092                 if (last_pgdat != zone->zone_pgdat)
4093                         wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
4094                 last_pgdat = zone->zone_pgdat;
4095         }
4096 }
4097 
4098 static inline unsigned int
4099 gfp_to_alloc_flags(gfp_t gfp_mask)
4100 {
4101         unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
4102 
4103         /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
4104         BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
4105 
4106         /*
4107          * The caller may dip into page reserves a bit more if it cannot run
4108          * direct reclaim, or if it has a realtime scheduling policy or is
4109          * asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
4110          * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
4111          */
4112         alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
4113 
4114         if (gfp_mask & __GFP_ATOMIC) {
4115                 /*
4116                  * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
4117                  * if it can't schedule.
4118                  */
4119                 if (!(gfp_mask & __GFP_NOMEMALLOC))
4120                         alloc_flags |= ALLOC_HARDER;
4121                 /*
4122                  * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
4123                  * comment for __cpuset_node_allowed().
4124                  */
4125                 alloc_flags &= ~ALLOC_CPUSET;
4126         } else if (unlikely(rt_task(current)) && !in_interrupt())
4127                 alloc_flags |= ALLOC_HARDER;
4128 
4129         if (gfp_mask & __GFP_KSWAPD_RECLAIM)
4130                 alloc_flags |= ALLOC_KSWAPD;
4131 
4132 #ifdef CONFIG_CMA
4133         if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
4134                 alloc_flags |= ALLOC_CMA;
4135 #endif
4136         return alloc_flags;
4137 }
4138 
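/*
 * Worked example (illustrative, derived from the function above): a
 * GFP_ATOMIC request, i.e. __GFP_HIGH | __GFP_ATOMIC | __GFP_KSWAPD_RECLAIM,
 * yields ALLOC_WMARK_MIN | ALLOC_HIGH | ALLOC_HARDER | ALLOC_KSWAPD and
 * clears ALLOC_CPUSET, while a plain GFP_KERNEL request keeps the
 * conservative ALLOC_WMARK_MIN | ALLOC_CPUSET | ALLOC_KSWAPD set
 * (plus ALLOC_CMA for movable allocations when CONFIG_CMA is enabled).
 */
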
4139 static bool oom_reserves_allowed(struct task_struct *tsk)
4140 {
4141         if (!tsk_is_oom_victim(tsk))
4142                 return false;
4143 
4144         /*
4145          * !MMU doesn't have oom reaper so give access to memory reserves
4146          * only to the thread with TIF_MEMDIE set
4147          */
4148         if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
4149                 return false;
4150 
4151         return true;
4152 }
4153 
4154 /*
4155  * Distinguish requests which really need access to full memory
4156  * reserves from oom victims which can live with a portion of it
4157  */
4158 static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
4159 {
4160         if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
4161                 return 0;
4162         if (gfp_mask & __GFP_MEMALLOC)
4163                 return ALLOC_NO_WATERMARKS;
4164         if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
4165                 return ALLOC_NO_WATERMARKS;
4166         if (!in_interrupt()) {
4167                 if (current->flags & PF_MEMALLOC)
4168                         return ALLOC_NO_WATERMARKS;
4169                 else if (oom_reserves_allowed(current))
4170                         return ALLOC_OOM;
4171         }
4172 
4173         return 0;
4174 }
4175 
4176 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
4177 {
4178         return !!__gfp_pfmemalloc_flags(gfp_mask);
4179 }
4180 
4181 /*
4182  * Checks whether it makes sense to retry the reclaim to make forward progress
4183  * for the given allocation request.
4184  *
4185  * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
4186  * without success, or when we couldn't even meet the watermark if we
4187  * reclaimed all remaining pages on the LRU lists.
4188  *
4189  * Returns true if a retry is viable or false to enter the oom path.
4190  */
4191 static inline bool
4192 should_reclaim_retry(gfp_t gfp_mask, unsigned order,
4193                      struct alloc_context *ac, int alloc_flags,
4194                      bool did_some_progress, int *no_progress_loops)
4195 {
4196         struct zone *zone;
4197         struct zoneref *z;
4198         bool ret = false;
4199 
4200         /*
4201          * Costly allocations might have made some progress, but due to high
4202          * fragmentation that doesn't mean a page of their order will become
4203          * available, so always increment the no-progress counter for them.
4204          */
4205         if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
4206                 *no_progress_loops = 0;
4207         else
4208                 (*no_progress_loops)++;
4209 
4210         /*
4211          * Make sure we converge to OOM if we cannot make any progress
4212          * several times in the row.
4213          * several times in a row.
4214         if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
4215                 /* Before OOM, exhaust highatomic_reserve */
4216                 return unreserve_highatomic_pageblock(ac, true);
4217         }
4218 
4219         /*
4220          * Keep reclaiming pages while there is a chance this will lead
4221          * somewhere.  If none of the target zones can satisfy our allocation
4222          * request even if all reclaimable pages are considered then we are
4223          * screwed and have to go OOM.
4224          */
4225         for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
4226                                         ac->nodemask) {
4227                 unsigned long available;
4228                 unsigned long reclaimable;
4229                 unsigned long min_wmark = min_wmark_pages(zone);
4230                 bool wmark;
4231 
4232                 available = reclaimable = zone_reclaimable_pages(zone);
4233                 available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
4234 
4235                 /*
4236                  * Would the allocation succeed if we reclaimed all
4237                  * reclaimable pages?
4238                  */
4239                 wmark = __zone_watermark_ok(zone, order, min_wmark,
4240                                 ac_classzone_idx(ac), alloc_flags, available);
4241                 trace_reclaim_retry_zone(z, order, reclaimable,
4242                                 available, min_wmark, *no_progress_loops, wmark);
4243                 if (wmark) {
4244                         /*
4245                          * If we didn't make any progress and have a lot of
4246                          * dirty + writeback pages then we should wait for
4247                          * an IO to complete to slow down the reclaim and
4248                          * prevent premature OOM.
4249                          */
4250                         if (!did_some_progress) {
4251                                 unsigned long write_pending;
4252 
4253                                 write_pending = zone_page_state_snapshot(zone,
4254                                                         NR_ZONE_WRITE_PENDING);
4255 
4256                                 if (2 * write_pending > reclaimable) {
4257                                         congestion_wait(BLK_RW_ASYNC, HZ/10);
4258                                         return true;
4259                                 }
4260                         }
4261 
4262                         ret = true;
4263                         goto out;
4264                 }
4265         }
4266 
4267 out:
4268         /*
4269          * Memory allocation/reclaim might be called from a WQ context and the
4270          * current implementation of the WQ concurrency control doesn't
4271          * recognize that a particular WQ is congested if the worker thread is
4272          * looping without ever sleeping. Therefore we have to do a short sleep
4273          * here rather than calling cond_resched().
4274          */
4275         if (current->flags & PF_WQ_WORKER)
4276                 schedule_timeout_uninterruptible(1);
4277         else
4278                 cond_resched();
4279         return ret;
4280 }
4281 
4282 static inline bool
4283 check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
4284 {
4285         /*
4286          * It's possible that cpuset's mems_allowed and the nodemask from
4287          * mempolicy don't intersect. This should be normally dealt with by
4288          * policy_nodemask(), but it's possible to race with cpuset update in
4289          * such a way the check therein was true, and then it became false
4290          * before we got our cpuset_mems_cookie here.
4291          * This assumes that for all allocations, ac->nodemask can come only
4292          * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
4293          * when it does not intersect with the cpuset restrictions) or the
4294          * caller can deal with a violated nodemask.
4295          */
4296         if (cpusets_enabled() && ac->nodemask &&
4297                         !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
4298                 ac->nodemask = NULL;
4299                 return true;
4300         }
4301 
4302         /*
4303          * When updating a task's mems_allowed or mempolicy nodemask, it is
4304          * possible to race with parallel threads in such a way that our
4305          * allocation can fail while the mask is being updated. If we are about
4306          * to fail, check if the cpuset changed during allocation and if so,
4307          * retry.
4308          */
4309         if (read_mems_allowed_retry(cpuset_mems_cookie))
4310                 return true;
4311 
4312         return false;
4313 }
4314 
4315 static inline struct page *
4316 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
4317                                                 struct alloc_context *ac)
4318 {
4319         bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
4320         const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
4321         struct page *page = NULL;
4322         unsigned int alloc_flags;
4323         unsigned long did_some_progress;
4324         enum compact_priority compact_priority;
4325         enum compact_result compact_result;
4326         int compaction_retries;
4327         int no_progress_loops;
4328         unsigned int cpuset_mems_cookie;
4329         int reserve_flags;
4330 
4331         /*
4332          * We also sanity check to catch abuse of atomic reserves being used by
4333          * callers that are not in atomic context.
4334          */
4335         if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
4336                                 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
4337                 gfp_mask &= ~__GFP_ATOMIC;
4338 
4339 retry_cpuset:
4340         compaction_retries = 0;
4341         no_progress_loops = 0;
4342         compact_priority = DEF_COMPACT_PRIORITY;
4343         cpuset_mems_cookie = read_mems_allowed_begin();
4344 
4345         /*
4346          * The fast path uses conservative alloc_flags to succeed only until
4347          * kswapd needs to be woken up, and to avoid the cost of setting up
4348          * alloc_flags precisely. So we do that now.
4349          */
4350         alloc_flags = gfp_to_alloc_flags(gfp_mask);
4351 
4352         /*
4353          * We need to recalculate the starting point for the zonelist iterator
4354          * because we might have used different nodemask in the fast path, or
4355          * there was a cpuset modification and we are retrying - otherwise we
4356          * could end up iterating over non-eligible zones endlessly.
4357          */
4358         ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4359                                         ac->high_zoneidx, ac->nodemask);
4360         if (!ac->preferred_zoneref->zone)
4361                 goto nopage;
4362 
4363         if (alloc_flags & ALLOC_KSWAPD)
4364                 wake_all_kswapds(order, gfp_mask, ac);
4365 
4366         /*
4367          * The adjusted alloc_flags might result in immediate success, so try
4368          * that first
4369          */
4370         page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4371         if (page)
4372                 goto got_pg;
4373 
4374         /*
4375          * For costly allocations, try direct compaction first, as it's likely
4376          * that we have enough base pages and don't need to reclaim. For non-
4377          * movable high-order allocations, do that as well, as compaction will
4378          * try to prevent permanent fragmentation by migrating from blocks of the
4379          * same migratetype.
4380          * Don't try this for allocations that are allowed to ignore
4381          * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
4382          */
4383         if (can_direct_reclaim &&
4384                         (costly_order ||
4385                            (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
4386                         && !gfp_pfmemalloc_allowed(gfp_mask)) {
4387                 page = __alloc_pages_direct_compact(gfp_mask, order,
4388                                                 alloc_flags, ac,
4389                                                 INIT_COMPACT_PRIORITY,
4390                                                 &compact_result);
4391                 if (page)
4392                         goto got_pg;
4393 
4394                 /*
4395                  * Checks for costly allocations with __GFP_NORETRY, which
4396                  * includes THP page fault allocations
4397                  */
4398                 if (costly_order && (gfp_mask & __GFP_NORETRY)) {
4399                         /*
4400                          * If compaction is deferred for high-order allocations,
4401                          * it is because sync compaction recently failed. If
4402                          * this is the case and the caller requested a THP
4403                          * allocation, we do not want to heavily disrupt the
4404                          * system, so we fail the allocation instead of entering
4405                          * direct reclaim.
4406                          */
4407                         if (compact_result == COMPACT_DEFERRED)
4408                                 goto nopage;
4409 
4410                         /*
4411                          * Looks like reclaim/compaction is worth trying, but
4412                          * sync compaction could be very expensive, so keep
4413                          * using async compaction.
4414                          */
4415                         compact_priority = INIT_COMPACT_PRIORITY;
4416                 }
4417         }
4418 
4419 retry:
4420         /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
4421         if (alloc_flags & ALLOC_KSWAPD)
4422                 wake_all_kswapds(order, gfp_mask, ac);
4423 
4424         reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
4425         if (reserve_flags)
4426                 alloc_flags = reserve_flags;
4427 
4428         /*
4429          * Reset the nodemask and zonelist iterators if memory policies can be
4430          * ignored. These allocations are high priority and system-oriented
4431          * rather than user-oriented.
4432          */
4433         if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
4434                 ac->nodemask = NULL;
4435                 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4436                                         ac->high_zoneidx, ac->nodemask);
4437         }
4438 
4439         /* Attempt with potentially adjusted zonelist and alloc_flags */
4440         page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4441         if (page)
4442                 goto got_pg;
4443 
4444         /* Caller is not willing to reclaim, we can't balance anything */
4445         if (!can_direct_reclaim)
4446                 goto nopage;
4447 
4448         /* Avoid recursion of direct reclaim */
4449         if (current->flags & PF_MEMALLOC)
4450                 goto nopage;
4451 
4452         /* Try direct reclaim and then allocating */
4453         page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
4454                                                         &did_some_progress);
4455         if (page)
4456                 goto got_pg;
4457 
4458         /* Try direct compaction and then allocating */
4459         page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
4460                                         compact_priority, &compact_result);
4461         if (page)
4462                 goto got_pg;
4463 
4464         /* Do not loop if specifically requested */
4465         if (gfp_mask & __GFP_NORETRY)
4466                 goto nopage;
4467 
4468         /*
4469          * Do not retry costly high order allocations unless they are
4470          * __GFP_RETRY_MAYFAIL
4471          */
4472         if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
4473                 goto nopage;
4474 
4475         if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
4476                                  did_some_progress > 0, &no_progress_loops))
4477                 goto retry;
4478 
4479         /*
4480          * It doesn't make any sense to retry compaction if the order-0
4481          * reclaim is not able to make any progress because the current
4482          * implementation of compaction depends on a sufficient amount
4483          * of free memory (see __compaction_suitable).
4484          */
4485         if (did_some_progress > 0 &&
4486                         should_compact_retry(ac, order, alloc_flags,
4487                                 compact_result, &compact_priority,
4488                                 &compaction_retries))
4489                 goto retry;
4490 
4491 
4492         /* Deal with possible cpuset update races before we start OOM killing */
4493         if (check_retry_cpuset(cpuset_mems_cookie, ac))
4494                 goto retry_cpuset;
4495 
4496         /* Reclaim has failed us, start killing things */
4497         page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
4498         if (page)
4499                 goto got_pg;
4500 
4501         /* Avoid allocations with no watermarks from looping endlessly */
4502         if (tsk_is_oom_victim(current) &&
4503             (alloc_flags == ALLOC_OOM ||
4504              (gfp_mask & __GFP_NOMEMALLOC)))
4505                 goto nopage;
4506 
4507         /* Retry as long as the OOM killer is making progress */
4508         if (did_some_progress) {
4509                 no_progress_loops = 0;
4510                 goto retry;
4511         }
4512 
4513 nopage:
4514         /* Deal with possible cpuset update races before we fail */
4515         if (check_retry_cpuset(cpuset_mems_cookie, ac))
4516                 goto retry_cpuset;
4517 
4518         /*
4519          * Make sure that a __GFP_NOFAIL request doesn't leak out and that
4520          * we always retry
4521          */
4522         if (gfp_mask & __GFP_NOFAIL) {
4523                 /*
4524                  * All existing users of __GFP_NOFAIL are blockable, so warn
4525                  * of any new users that actually require GFP_NOWAIT
4526                  */
4527                 if (WARN_ON_ONCE(!can_direct_reclaim))
4528                         goto fail;
4529 
4530                 /*
4531                  * A PF_MEMALLOC request from this context is rather bizarre
4532                  * because we cannot reclaim anything and can only loop waiting
4533                  * for somebody else to do the work for us
4534                  */
4535                 WARN_ON_ONCE(current->flags & PF_MEMALLOC);
4536 
4537                 /*
4538                  * Non-failing costly orders are a hard requirement which we
4539                  * are not really prepared for, so let's warn about these users
4540                  * so that we can identify them and convert them to something
4541                  * else.
4542                  */
4543                 WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
4544 
4545                 /*
4546                  * Help non-failing allocations by giving them access to memory
4547                  * reserves but do not use ALLOC_NO_WATERMARKS because this
4548                  * could deplete the whole memory reserves, which would just
4549                  * make the situation worse
4550                  */
4551                 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
4552                 if (page)
4553                         goto got_pg;
4554 
4555                 cond_resched();
4556                 goto retry;
4557         }
4558 fail:
4559         warn_alloc(gfp_mask, ac->nodemask,
4560                         "page allocation failure: order:%u", order);
4561 got_pg:
4562         return page;
4563 }
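
/*
 * [Editor's illustrative sketch: not part of mm/page_alloc.c]
 * The slowpath above is steered almost entirely by the gfp flags: a costly
 * order (> PAGE_ALLOC_COSTLY_ORDER) combined with __GFP_NORETRY bails out
 * after a single compaction/reclaim attempt instead of looping.  A
 * hypothetical caller that opportunistically asks for a large block and
 * falls back to order-0 pages might look like this (the function name is
 * made up, and the usual <linux/gfp.h> declarations are assumed):
 */
static struct page *try_large_then_small_example(void)
{
        struct page *page;

        /* One cheap attempt at a big block; do not retry, do not warn. */
        page = alloc_pages(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN,
                           PAGE_ALLOC_COSTLY_ORDER + 1);
        if (page)
                return page;

        /* Fall back to a single page, which may use the full slowpath. */
        return alloc_pages(GFP_KERNEL, 0);
}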
4564 
4565 static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
4566                 int preferred_nid, nodemask_t *nodemask,
4567                 struct alloc_context *ac, gfp_t *alloc_mask,
4568                 unsigned int *alloc_flags)
4569 {
4570         ac->high_zoneidx = gfp_zone(gfp_mask);
4571         ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
4572         ac->nodemask = nodemask;
4573         ac->migratetype = gfpflags_to_migratetype(gfp_mask);
4574 
4575         if (cpusets_enabled()) {
4576                 *alloc_mask |= __GFP_HARDWALL;
4577                 if (!ac->nodemask)
4578                         ac->nodemask = &cpuset_current_mems_allowed;
4579                 else
4580                         *alloc_flags |= ALLOC_CPUSET;
4581         }
4582 
4583         fs_reclaim_acquire(gfp_mask);
4584         fs_reclaim_release(gfp_mask);
4585 
4586         might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
4587 
4588         if (should_fail_alloc_page(gfp_mask, order))
4589                 return false;
4590 
4591         if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
4592                 *alloc_flags |= ALLOC_CMA;
4593 
4594         return true;
4595 }
4596 
4597 /* Determine whether to spread dirty pages and what the first usable zone is */
4598 static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
4599 {
4600         /* Dirty zone balancing only done in the fast path */
4601         ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
4602 
4603         /*
4604          * The preferred zone is used for statistics but crucially it is
4605          * also used as the starting point for the zonelist iterator. It
4606          * may get reset for allocations that ignore memory policies.
4607          */
4608         ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4609                                         ac->high_zoneidx, ac->nodemask);
4610 }
4611 
4612 /*
4613  * This is the 'heart' of the zoned buddy allocator.
4614  */
4615 struct page *
4616 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
4617                                                         nodemask_t *nodemask)
4618 {
4619         struct page *page;
4620         unsigned int alloc_flags = ALLOC_WMARK_LOW;
4621         gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
4622         struct alloc_context ac = { };
4623 
4624         /*
4625          * There are several places where we assume that the order value is sane
4626          * so bail out early if the request is out of bound.
4627          */
4628         if (unlikely(order >= MAX_ORDER)) {
4629                 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
4630                 return NULL;
4631         }
4632 
4633         gfp_mask &= gfp_allowed_mask;
4634         alloc_mask = gfp_mask;
4635         if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
4636                 return NULL;
4637 
4638         finalise_ac(gfp_mask, &ac);
4639 
4640         /*
4641          * Forbid the first pass from falling back to types that fragment
4642          * memory until all local zones are considered.
4643          */
4644         alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
4645 
4646         /* First allocation attempt */
4647         page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
4648         if (likely(page))
4649                 goto out;
4650 
4651         /*
4652          * Apply scoped allocation constraints. This is mainly about GFP_NOFS
4653          * and GFP_NOIO, which have to be inherited by all allocation requests
4654          * from a particular context which has been marked by
4655          * memalloc_no{fs,io}_{save,restore}.
4656          */
4657         alloc_mask = current_gfp_context(gfp_mask);
4658         ac.spread_dirty_pages = false;
4659 
4660         /*
4661          * Restore the original nodemask if it was potentially replaced with
4662          * &cpuset_current_mems_allowed to optimize the fast-path attempt.
4663          */
4664         if (unlikely(ac.nodemask != nodemask))
4665                 ac.nodemask = nodemask;
4666 
4667         page = __alloc_pages_slowpath(alloc_mask, order, &ac);
4668 
4669 out:
4670         if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
4671             unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) {
4672                 __free_pages(page, order);
4673                 page = NULL;
4674         }
4675 
4676         trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
4677 
4678         return page;
4679 }
4680 EXPORT_SYMBOL(__alloc_pages_nodemask);
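
/*
 * [Editor's illustrative sketch: not part of mm/page_alloc.c]
 * __alloc_pages_nodemask() is normally reached through the alloc_pages()
 * and alloc_pages_node() wrappers in <linux/gfp.h>.  A minimal,
 * hypothetical user of the struct-page based API (name made up):
 */
static void *grab_two_pages_example(void)
{
        /* order 1 means two physically contiguous pages; NULL on failure */
        struct page *page = alloc_pages(GFP_KERNEL, 1);

        if (!page)
                return NULL;

        /* The caller owns the pages until a matching __free_pages(page, 1). */
        return page_address(page);
}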
4681 
4682 /*
4683  * Common helper functions. Never use with __GFP_HIGHMEM because the returned
4684  * address cannot represent highmem pages. Use alloc_pages and then kmap if
4685  * you need to access high mem.
4686  */
4687 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
4688 {
4689         struct page *page;
4690 
4691         page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
4692         if (!page)
4693                 return 0;
4694         return (unsigned long) page_address(page);
4695 }
4696 EXPORT_SYMBOL(__get_free_pages);
4697 
4698 unsigned long get_zeroed_page(gfp_t gfp_mask)
4699 {
4700         return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
4701 }
4702 EXPORT_SYMBOL(get_zeroed_page);
4703 
4704 static inline void free_the_page(struct page *page, unsigned int order)
4705 {
4706         if (order == 0)         /* Via pcp? */
4707                 free_unref_page(page);
4708         else
4709                 __free_pages_ok(page, order);
4710 }
4711 
4712 void __free_pages(struct page *page, unsigned int order)
4713 {
4714         if (put_page_testzero(page))
4715                 free_the_page(page, order);
4716 }
4717 EXPORT_SYMBOL(__free_pages);
4718 
4719 void free_pages(unsigned long addr, unsigned int order)
4720 {
4721         if (addr != 0) {
4722                 VM_BUG_ON(!virt_addr_valid((void *)addr));
4723                 __free_pages(virt_to_page((void *)addr), order);
4724         }
4725 }
4726 
4727 EXPORT_SYMBOL(free_pages);
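
/*
 * [Editor's illustrative sketch: not part of mm/page_alloc.c]
 * The address-based helpers above must be paired with a free of the same
 * order.  A hypothetical user (names and sizes are made up):
 */
static int scratch_buffers_example(void)
{
        unsigned long zeroed, buf;

        zeroed = get_zeroed_page(GFP_KERNEL);   /* one pre-zeroed page */
        buf = __get_free_pages(GFP_KERNEL, 2);  /* four contiguous pages */
        if (!zeroed || !buf) {
                free_page(zeroed);      /* freeing address 0 is a no-op */
                free_pages(buf, 2);
                return -ENOMEM;
        }

        /* ... use the buffers ... */

        free_page(zeroed);
        free_pages(buf, 2);
        return 0;
}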
4728 
4729 /*
4730  * Page Fragment:
4731  *  An arbitrary-length arbitrary-offset area of memory which resides
4732  *  within a 0 or higher order page.  Multiple fragments within that page
4733  *  are individually refcounted, in the page's reference counter.
4734  *
4735  * The page_frag functions below provide a simple allocation framework for
4736  * page fragments.  This is used by the network stack and network device
4737  * drivers to provide a backing region of memory for use either as an
4738  * sk_buff->head or in the "frags" portion of skb_shared_info.
4739  */
4740 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
4741                                              gfp_t gfp_mask)
4742 {
4743         struct page *page = NULL;
4744         gfp_t gfp = gfp_mask;
4745 
4746 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4747         gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
4748                     __GFP_NOMEMALLOC;
4749         page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
4750                                 PAGE_FRAG_CACHE_MAX_ORDER);
4751         nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
4752 #endif
4753         if (unlikely(!page))
4754                 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
4755 
4756         nc->va = page ? page_address(page) : NULL;
4757 
4758         return page;
4759 }
4760 
4761 void __page_frag_cache_drain(struct page *page, unsigned int count)
4762 {
4763         VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
4764 
4765         if (page_ref_sub_and_test(page, count))
4766                 free_the_page(page, compound_order(page));
4767 }
4768 EXPORT_SYMBOL(__page_frag_cache_drain);
4769 
4770 void *page_frag_alloc(struct page_frag_cache *nc,
4771                       unsigned int fragsz, gfp_t gfp_mask)
4772 {
4773         unsigned int size = PAGE_SIZE;
4774         struct page *page;
4775         int offset;
4776 
4777         if (unlikely(!nc->va)) {
4778 refill:
4779                 page = __page_frag_cache_refill(nc, gfp_mask);
4780                 if (!page)
4781                         return NULL;
4782 
4783 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4784                 /* if size can vary use size else just use PAGE_SIZE */
4785                 size = nc->size;
4786 #endif
4787                 /* Even if we own the page, we do not use atomic_set().
4788                  * This would break get_page_unless_zero() users.
4789                  */
4790                 page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
4791 
4792                 /* reset page count bias and offset to start of new frag */
4793                 nc->pfmemalloc = page_is_pfmemalloc(page);
4794                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
4795                 nc->offset = size;
4796         }
4797 
4798         offset = nc->offset - fragsz;
4799         if (unlikely(offset < 0)) {
4800                 page = virt_to_page(nc->va);
4801 
4802                 if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
4803                         goto refill;
4804 
4805 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4806                 /* if size can vary use size else just use PAGE_SIZE */
4807                 size = nc->size;
4808 #endif
4809                 /* OK, page count is 0, we can safely set it */
4810                 set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
4811 
4812                 /* reset page count bias and offset to start of new frag */
4813                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
4814                 offset = size - fragsz;
4815         }
4816 
4817         nc->pagecnt_bias--;
4818         nc->offset = offset;
4819 
4820         return nc->va + offset;
4821 }
4822 EXPORT_SYMBOL(page_frag_alloc);
4823 
4824 /*
4825  * Frees a page fragment allocated out of either a compound or order 0 page.
4826  */
4827 void page_frag_free(void *addr)
4828 {
4829         struct page *page = virt_to_head_page(addr);
4830 
4831         if (unlikely(put_page_testzero(page)))
4832                 free_the_page(page, compound_order(page));
4833 }
4834 EXPORT_SYMBOL(page_frag_free);
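
/*
 * [Editor's illustrative sketch: not part of mm/page_alloc.c]
 * A page_frag_cache hands out small refcounted slices of a larger page; a
 * zero-initialised cache is valid because nc->va == NULL triggers the first
 * refill.  Hypothetical single-threaded user (real users such as the
 * networking core keep these caches per cpu, see napi_alloc_frag()):
 */
static struct page_frag_cache example_frag_cache;

static void *alloc_small_frag_example(unsigned int len)
{
        /* Each returned fragment is released with page_frag_free(). */
        return page_frag_alloc(&example_frag_cache, len, GFP_ATOMIC);
}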
4835 
4836 static void *make_alloc_exact(unsigned long addr, unsigned int order,
4837                 size_t size)
4838 {
4839         if (addr) {
4840                 unsigned long alloc_end = addr + (PAGE_SIZE << order);
4841                 unsigned long used = addr + PAGE_ALIGN(size);
4842 
4843                 split_page(virt_to_page((void *)addr), order);
4844                 while (used < alloc_end) {
4845                         free_page(used);
4846                         used += PAGE_SIZE;
4847                 }
4848         }
4849         return (void *)addr;
4850 }
4851 
4852 /**
4853  * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
4854  * @size: the number of bytes to allocate
4855  * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
4856  *
4857  * This function is similar to alloc_pages(), except that it allocates the
4858  * minimum number of pages to satisfy the request.  alloc_pages() can only
4859  * allocate memory in power-of-two pages.
4860  *
4861  * This function is also limited by MAX_ORDER.
4862  *
4863  * Memory allocated by this function must be released by free_pages_exact().
4864  *
4865  * Return: pointer to the allocated area or %NULL in case of error.
4866  */
4867 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
4868 {
4869         unsigned int order = get_order(size);
4870         unsigned long addr;
4871 
4872         if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
4873                 gfp_mask &= ~__GFP_COMP;
4874 
4875         addr = __get_free_pages(gfp_mask, order);
4876         return make_alloc_exact(addr, order, size);
4877 }
4878 EXPORT_SYMBOL(alloc_pages_exact);
4879 
4880 /**
4881  * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
4882  *                         pages on a node.
4883  * @nid: the preferred node ID where memory should be allocated
4884  * @size: the number of bytes to allocate
4885  * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
4886  *
4887  * Like alloc_pages_exact(), but try to allocate on node nid first before falling
4888  * back.
4889  *
4890  * Return: pointer to the allocated area or %NULL in case of error.
4891  */
4892 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
4893 {
4894         unsigned int order = get_order(size);
4895         struct page *p;
4896 
4897         if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
4898                 gfp_mask &= ~__GFP_COMP;
4899 
4900         p = alloc_pages_node(nid, gfp_mask, order);
4901         if (!p)
4902                 return NULL;
4903         return make_alloc_exact((unsigned long)page_address(p), order, size);
4904 }
4905 
4906 /**
4907  * free_pages_exact - release memory allocated via alloc_pages_exact()
4908  * @virt: the value returned by alloc_pages_exact.
4909  * @size: size of allocation, same value as passed to alloc_pages_exact().
4910  *
4911  * Release the memory allocated by a previous call to alloc_pages_exact.
4912  */
4913 void free_pages_exact(void *virt, size_t size)
4914 {
4915         unsigned long addr = (unsigned long)virt;
4916         unsigned long end = addr + PAGE_ALIGN(size);
4917 
4918         while (addr < end) {
4919                 free_page(addr);
4920                 addr += PAGE_SIZE;
4921         }
4922 }
4923 EXPORT_SYMBOL(free_pages_exact);
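
/*
 * [Editor's illustrative sketch: not part of mm/page_alloc.c]
 * alloc_pages_exact() avoids the power-of-two rounding of alloc_pages():
 * asking for three pages' worth of memory allocates an order-2 block
 * internally and hands the unused fourth page straight back to the buddy
 * allocator.  Hypothetical user (name and size are made up):
 */
static void *three_page_buffer_example(void)
{
        size_t size = 3 * PAGE_SIZE;
        void *buf = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);

        /* Must eventually be released with free_pages_exact(buf, size). */
        return buf;
}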
4924 
4925 /**
4926  * nr_free_zone_pages - count number of pages beyond high watermark
4927  * @offset: The zone index of the highest zone
4928  *
4929  * nr_free_zone_pages() counts the number of pages which are beyond the
4930  * high watermark within all zones at or below a given zone index.  For each
4931  * zone, the number of pages is calculated as:
4932  *
4933  *     nr_free_zone_pages = managed_pages - high_pages
4934  *
4935  * Return: number of pages beyond high watermark.
4936  */
4937 static unsigned long nr_free_zone_pages(int offset)
4938 {
4939         struct zoneref *z;
4940         struct zone *zone;
4941 
4942         /* Just pick one node, since fallback list is circular */
4943         unsigned long sum = 0;
4944 
4945         struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
4946 
4947         for_each_zone_zonelist(zone, z, zonelist, offset) {
4948                 unsigned long size = zone_managed_pages(zone);
4949                 unsigned long high = high_wmark_pages(zone);
4950                 if (size > high)
4951                         sum += size - high;
4952         }
4953 
4954         return sum;
4955 }
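
/*
 * [Editor's worked example: not part of mm/page_alloc.c]
 * A zone with 1,000,000 managed pages and a high watermark of 12,288 pages
 * contributes 1,000,000 - 12,288 = 987,712 pages to the sum above; a zone
 * already at or below its high watermark contributes nothing.
 */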
4956 
4957 /**
4958  * nr_free_buffer_pages - count number of pages beyond high watermark
4959  *
4960  * nr_free_buffer_pages() counts the number of pages which are beyond the high
4961  * watermark within ZONE_DMA and ZONE_NORMAL.
4962  *
4963  * Return: number of pages beyond high watermark within ZONE_DMA and
4964  * ZONE_NORMAL.
4965  */
4966 unsigned long nr_free_buffer_pages(void)
4967 {
4968         return nr_free_zone_pages(gfp_zone(GFP_USER));
4969 }
4970 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
4971 
4972 /**
4973  * nr_free_pagecache_pages - count number of pages beyond high watermark
4974  *
4975  * nr_free_pagecache_pages() counts the number of pages which are beyond the
4976  * high watermark within all zones.
4977  *
4978  * Return: number of pages beyond high watermark within all zones.
4979  */
4980 unsigned long nr_free_pagecache_pages(void)
4981 {
4982         return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
4983 }
4984 
4985 static inline void show_node(struct zone *zone)
4986 {
4987         if (IS_ENABLED(CONFIG_NUMA))
4988                 printk("Node %d ", zone_to_nid(zone));
4989 }
4990 
4991 long si_mem_available(void)
4992 {
4993         long available;
4994         unsigned long pagecache;
4995         unsigned long wmark_low = 0;
4996         unsigned long pages[NR_LRU_LISTS];
4997         unsigned long reclaimable;
4998         struct zone *zone;
4999         int lru;
5000 
5001         for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
5002                 pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
5003 
5004         for_each_zone(zone)
5005                 wmark_low += low_wmark_pages(zone);
5006 
5007         /*
5008          * Estimate the amount of memory available for userspace allocations,
5009          * without causing swapping.
5010          */
5011         available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
5012 
5013         /*
5014          * Not all the page cache can be freed, otherwise the system will
5015          * start swapping. Assume at least half of the page cache, or the
5016          * low watermark worth of cache, needs to stay.
5017          */
5018         pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
5019         pagecache -= min(pagecache / 2, wmark_low);
5020         available += pagecache;
5021 
5022         /*
5023          * Part of the reclaimable slab and other kernel memory consists of
5024          * items that are in use, and cannot be freed. Cap this estimate at the
5025          * low watermark.
5026          */
5027         reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
5028                         global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
5029         available += reclaimable - min(reclaimable / 2, wmark_low);
5030 
5031         if (available < 0)
5032                 available = 0;
5033         return available;
5034 }
5035 EXPORT_SYMBOL_GPL(si_mem_available);
5036 
5037 void si_meminfo(struct sysinfo *val)
5038 {
5039         val->totalram = totalram_pages();
5040         val->sharedram = global_node_page_state(NR_SHMEM);
5041         val->freeram = global_zone_page_state(NR_FREE_PAGES);
5042         val->bufferram = nr_blockdev_pages();
5043         val->totalhigh = totalhigh_pages();
5044         val->freehigh = nr_free_highpages();
5045         val->mem_unit = PAGE_SIZE;
5046 }
5047 
5048 EXPORT_SYMBOL(si_meminfo);
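
/*
 * [Editor's illustrative sketch: not part of mm/page_alloc.c]
 * si_meminfo() and si_mem_available() report in pages (mem_unit is set to
 * PAGE_SIZE above); /proc/meminfo scales the same numbers to kB.  A
 * hypothetical reporting helper (name made up):
 */
static void report_memory_example(void)
{
        struct sysinfo si;
        long avail = si_mem_available();

        si_meminfo(&si);
        pr_info("total %lu kB, free %lu kB, available %ld kB\n",
                si.totalram << (PAGE_SHIFT - 10),
                si.freeram << (PAGE_SHIFT - 10),
                avail << (PAGE_SHIFT - 10));
}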
5049 
5050 #ifdef CONFIG_NUMA
5051 void si_meminfo_node(struct sysinfo *val, int nid)
5052 {
5053         int zone_type;          /* needs to be signed */
5054         unsigned long managed_pages = 0;
5055         unsigned long managed_highpages = 0;
5056         unsigned long free_highpages = 0;
5057         pg_data_t *pgdat = NODE_DATA(nid);
5058 
5059         for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
5060                 managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
5061         val->totalram = managed_pages;
5062         val->sharedram = node_page_state(pgdat, NR_SHMEM);
5063         val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
5064 #ifdef CONFIG_HIGHMEM
5065         for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
5066                 struct zone *zone = &pgdat->node_zones[zone_type];
5067 
5068                 if (is_highmem(zone)) {
5069                         managed_highpages += zone_managed_pages(zone);
5070                         free_highpages += zone_page_state(zone, NR_FREE_PAGES);
5071                 }
5072         }
5073         val->totalhigh = managed_highpages;
5074         val->freehigh = free_highpages;
5075 #else
5076         val->totalhigh = managed_highpages;
5077         val->freehigh = free_highpages;
5078 #endif
5079         val->mem_unit = PAGE_SIZE;
5080 }
5081 #endif
5082 
5083 /*
5084  * Determine whether the node should be displayed or not, depending on whether
5085  * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
5086  */
5087 static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
5088 {
5089         if (!(flags & SHOW_MEM_FILTER_NODES))
5090                 return false;
5091 
5092         /*
5093          * No nodemask - i.e. the implicit memory NUMA policy. Do not bother with
5094          * the synchronization - read_mems_allowed_begin - because we do not
5095          * have to be precise here.
5096          */
5097         if (!nodemask)
5098                 nodemask = &cpuset_current_mems_allowed;
5099 
5100         return !node_isset(nid, *nodemask);
5101 }
5102 
5103 #define K(x) ((x) << (PAGE_SHIFT-10))
5104 
5105 static void show_migration_types(unsigned char type)
5106 {
5107         static const char types[MIGRATE_TYPES] = {
5108                 [MIGRATE_UNMOVABLE]     = 'U',
5109                 [MIGRATE_MOVABLE]       = 'M',
5110                 [MIGRATE_RECLAIMABLE]   = 'E',
5111                 [MIGRATE_HIGHATOMIC]    = 'H',
5112 #ifdef CONFIG_CMA
5113                 [MIGRATE_CMA]           = 'C',
5114 #endif
5115 #ifdef CONFIG_MEMORY_ISOLATION
5116                 [MIGRATE_ISOLATE]       = 'I',
5117 #endif
5118         };
5119         char tmp[MIGRATE_TYPES + 1];
5120         char *p = tmp;
5121         int i;
5122 
5123         for (i = 0; i < MIGRATE_TYPES; i++) {
5124                 if (type & (1 << i))
5125                         *p++ = types[i];
5126         }
5127 
5128         *p = '\0';
5129         printk(KERN_CONT "(%s) ", tmp);
5130 }
5131 
5132 /*
5133  * Show free area list (used inside shift_scroll-lock stuff)
5134  * We also calculate the percentage fragmentation. We do this by counting the
5135  * memory on each free list with the exception of the first item on the list.
5136  *
5137  * Bits in @filter:
5138  * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
5139  *   cpuset.
5140  */
5141 void show_free_areas(unsigned int filter, nodemask_t *nodemask)
5142 {
5143         unsigned long free_pcp = 0;
5144         int cpu;
5145         struct zone *zone;
5146         pg_data_t *pgdat;
5147 
5148         for_each_populated_zone(zone) {
5149                 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
5150                         continue;
5151 
5152                 for_each_online_cpu(cpu)
5153                         free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
5154         }
5155 
5156         printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
5157                 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
5158                 " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
5159                 " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
5160                 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
5161                 " free:%lu free_pcp:%lu free_cma:%lu\n",
5162                 global_node_page_state(NR_ACTIVE_ANON),
5163                 global_node_page_state(NR_INACTIVE_ANON),
5164                 global_node_page_state(NR_ISOLATED_ANON),
5165                 global_node_page_state(NR_ACTIVE_FILE),
5166                 global_node_page_state(NR_INACTIVE_FILE),
5167                 global_node_page_state(NR_ISOLATED_FILE),
5168                 global_node_page_state(NR_UNEVICTABLE),
5169                 global_node_page_state(NR_FILE_DIRTY),
5170                 global_node_page_state(NR_WRITEBACK),
5171                 global_node_page_state(NR_UNSTABLE_NFS),
5172                 global_node_page_state(NR_SLAB_RECLAIMABLE),
5173                 global_node_page_state(NR_SLAB_UNRECLAIMABLE),
5174                 global_node_page_state(NR_FILE_MAPPED),
5175                 global_node_page_state(NR_SHMEM),
5176                 global_zone_page_state(NR_PAGETABLE),
5177                 global_zone_page_state(NR_BOUNCE),
5178                 global_zone_page_state(NR_FREE_PAGES),
5179                 free_pcp,
5180                 global_zone_page_state(NR_FREE_CMA_PAGES));
5181 
5182         for_each_online_pgdat(pgdat) {
5183                 if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
5184                         continue;
5185 
5186                 printk("Node %d"
5187                         " active_anon:%lukB"
5188                         " inactive_anon:%lukB"
5189                         " active_file:%lukB"
5190                         " inactive_file:%lukB"
5191                         " unevictable:%lukB"
5192                         " isolated(anon):%lukB"
5193                         " isolated(file):%lukB"
5194                         " mapped:%lukB"
5195                         " dirty:%lukB"
5196                         " writeback:%lukB"
5197                         " shmem:%lukB"
5198 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5199                         " shmem_thp: %lukB"
5200                         " shmem_pmdmapped: %lukB"
5201                         " anon_thp: %lukB"
5202 #endif
5203                         " writeback_tmp:%lukB"
5204                         " unstable:%lukB"
5205                         " all_unreclaimable? %s"
5206                         "\n",
5207                         pgdat->node_id,
5208                         K(node_page_state(pgdat, NR_ACTIVE_ANON)),
5209                         K(node_page_state(pgdat, NR_INACTIVE_ANON)),
5210                         K(node_page_state(pgdat, NR_ACTIVE_FILE)),
5211                         K(node_page_state(pgdat, NR_INACTIVE_FILE)),
5212                         K(node_page_state(pgdat, NR_UNEVICTABLE)),
5213                         K(node_page_state(pgdat, NR_ISOLATED_ANON)),
5214                         K(node_page_state(pgdat, NR_ISOLATED_FILE)),
5215                         K(node_page_state(pgdat, NR_FILE_MAPPED)),
5216                         K(node_page_state(pgdat, NR_FILE_DIRTY)),
5217                         K(node_page_state(pgdat, NR_WRITEBACK)),
5218                         K(node_page_state(pgdat, NR_SHMEM)),
5219 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5220                         K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
5221                         K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
5222                                         * HPAGE_PMD_NR),
5223                         K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
5224 #endif
5225                         K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
5226                         K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
5227                         pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
5228                                 "yes" : "no");
5229         }
5230 
5231         for_each_populated_zone(zone) {
5232                 int i;
5233 
5234                 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
5235                         continue;
5236 
5237                 free_pcp = 0;
5238                 for_each_online_cpu(cpu)
5239                         free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
5240 
5241                 show_node(zone);
5242                 printk(KERN_CONT
5243                         "%s"
5244                         " free:%lukB"
5245                         " min:%lukB"
5246                         " low:%lukB"
5247                         " high:%lukB"
5248                         " active_anon:%lukB"
5249                         " inactive_anon:%lukB"
5250                         " active_file:%lukB"
5251                         " inactive_file:%lukB"
5252                         " unevictable:%lukB"
5253                         " writepending:%lukB"
5254                         " present:%lukB"
5255                         " managed:%lukB"
5256                         " mlocked:%lukB"
5257                         " kernel_stack:%lukB"
5258                         " pagetables:%lukB"
5259                         " bounce:%lukB"
5260                         " free_pcp:%lukB"
5261                         " local_pcp:%ukB"
5262                         " free_cma:%lukB"
5263                         "\n",
5264                         zone->name,
5265                         K(zone_page_state(zone, NR_FREE_PAGES)),
5266                         K(min_wmark_pages(zone)),
5267                         K(low_wmark_pages(zone)),
5268                         K(high_wmark_pages(zone)),
5269                         K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
5270                         K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
5271                         K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
5272                         K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
5273                         K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
5274                         K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
5275                         K(zone->present_pages),
5276                         K(zone_managed_pages(zone)),
5277                         K(zone_page_state(zone, NR_MLOCK)),
5278                         zone_page_state(zone, NR_KERNEL_STACK_KB),
5279                         K(zone_page_state(zone, NR_PAGETABLE)),
5280                         K(zone_page_state(zone, NR_BOUNCE)),
5281                         K(free_pcp),
5282                         K(this_cpu_read(zone->pageset->pcp.count)),
5283                         K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
5284                 printk("lowmem_reserve[]:");
5285                 for (i = 0; i < MAX_NR_ZONES; i++)
5286                         printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
5287                 printk(KERN_CONT "\n");
5288         }
5289 
5290         for_each_populated_zone(zone) {
5291                 unsigned int order;
5292                 unsigned long nr[MAX_ORDER], flags, total = 0;
5293                 unsigned char types[MAX_ORDER];
5294 
5295                 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
5296                         continue;
5297                 show_node(zone);
5298                 printk(KERN_CONT "%s: ", zone->name);
5299 
5300                 spin_lock_irqsave(&zone->lock, flags);
5301                 for (order = 0; order < MAX_ORDER; order++) {
5302                         struct free_area *area = &zone->free_area[order];
5303                         int type;
5304 
5305                         nr[order] = area->nr_free;
5306                         total += nr[order] << order;
5307 
5308                         types[order] = 0;
5309                         for (type = 0; type < MIGRATE_TYPES; type++) {
5310                                 if (!free_area_empty(area, type))
5311                                         types[order] |= 1 << type;
5312                         }
5313                 }
5314                 spin_unlock_irqrestore(&zone->lock, flags);
5315                 for (order = 0; order < MAX_ORDER; order++) {
5316                         printk(KERN_CONT "%lu*%lukB ",
5317                                nr[order], K(1UL) << order);
5318                         if (nr[order])
5319                                 show_migration_types(types[order]);
5320                 }
5321                 printk(KERN_CONT "= %lukB\n", K(total));
5322         }
5323 
5324         hugetlb_show_meminfo();
5325 
5326         printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
5327 
5328         show_swap_cache_info();
5329 }
5330 
5331 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
5332 {
5333         zoneref->zone = zone;
5334         zoneref->zone_idx = zone_idx(zone);
5335 }
5336 
5337 /*
5338  * Builds allocation fallback zone lists.
5339  *
5340  * Add all populated zones of a node to the zonelist.
5341  */
5342 static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
5343 {
5344         struct zone *zone;
5345         enum zone_type zone_type = MAX_NR_ZONES;
5346         int nr_zones = 0;
5347 
5348         do {
5349                 zone_type--;
5350                 zone = pgdat->node_zones + zone_type;
5351                 if (managed_zone(zone)) {
5352                         zoneref_set_zone(zone, &zonerefs[nr_zones++]);
5353                         check_highest_zone(zone_type);
5354                 }
5355         } while (zone_type);
5356 
5357         return nr_zones;
5358 }
5359 
5360 #ifdef CONFIG_NUMA
5361 
5362 static int __parse_numa_zonelist_order(char *s)
5363 {
5364         /*
5365          * We used to support different zonelist modes but they turned
5366          * out to be just not useful. Let's keep the warning in place
5367          * if somebody still uses the cmd line parameter so that we do
5368          * not fail it silently
5369          */
5370         if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
5371                 pr_warn("Ignoring unsupported numa_zonelist_order value:  %s\n", s);
5372                 return -EINVAL;
5373         }
5374         return 0;
5375 }
5376 
5377 static __init int setup_numa_zonelist_order(char *s)
5378 {
5379         if (!s)
5380                 return 0;
5381 
5382         return __parse_numa_zonelist_order(s);
5383 }
5384 early_param("numa_zonelist_order", setup_numa_zonelist_order);
5385 
5386 char numa_zonelist_order[] = "Node";
5387 
5388 /*
5389  * sysctl handler for numa_zonelist_order
5390  */
5391 int numa_zonelist_order_handler(struct ctl_table *table, int write,
5392                 void __user *buffer, size_t *length,
5393                 loff_t *ppos)
5394 {
5395         char *str;
5396         int ret;
5397 
5398         if (!write)
5399                 return proc_dostring(table, write, buffer, length, ppos);
5400         str = memdup_user_nul(buffer, 16);
5401         if (IS_ERR(str))
5402                 return PTR_ERR(str);
5403 
5404         ret = __parse_numa_zonelist_order(str);
5405         kfree(str);
5406         return ret;
5407 }
5408 
5409 
5410 #define MAX_NODE_LOAD (nr_online_nodes)
5411 static int node_load[MAX_NUMNODES];
5412 
5413 /**
5414  * find_next_best_node - find the next node that should appear in a given node's fallback list
5415  * @node: node whose fallback list we're appending
5416  * @used_node_mask: nodemask_t of already used nodes
5417  *
5418  * We use a number of factors to determine which is the next node that should
5419  * appear on a given node's fallback list.  The node should not have appeared
5420  * already in @node's fallback list, and it should be the next closest node
5421  * according to the distance array (which contains arbitrary distance values
5422  * from each node to each node in the system), and should also prefer nodes
5423  * with no CPUs, since presumably they'll have very little allocation pressure
5424  * on them otherwise.
5425  *
5426  * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
5427  */
5428 static int find_next_best_node(int node, nodemask_t *used_node_mask)
5429 {
5430         int n, val;
5431         int min_val = INT_MAX;
5432         int best_node = NUMA_NO_NODE;
5433         const struct cpumask *tmp = cpumask_of_node(0);
5434 
5435         /* Use the local node if we haven't already */
5436         if (!node_isset(node, *used_node_mask)) {
5437                 node_set(node, *used_node_mask);
5438                 return node;
5439         }
5440 
5441         for_each_node_state(n, N_MEMORY) {
5442 
5443                 /* Don't want a node to appear more than once */
5444                 if (node_isset(n, *used_node_mask))
5445                         continue;
5446 
5447                 /* Use the distance array to find the distance */
5448                 val = node_distance(node, n);
5449 
5450                 /* Penalize nodes under us ("prefer the next node") */
5451                 val += (n < node);
5452 
5453                 /* Give preference to headless and unused nodes */
5454                 tmp = cpumask_of_node(n);
5455                 if (!cpumask_empty(tmp))
5456                         val += PENALTY_FOR_NODE_WITH_CPUS;
5457 
5458                 /* Slight preference for less loaded node */
5459                 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
5460                 val += node_load[n];
5461 
5462                 if (val < min_val) {
5463                         min_val = val;
5464                         best_node = n;
5465                 }
5466         }
5467 
5468         if (best_node >= 0)
5469                 node_set(best_node, *used_node_mask);
5470 
5471         return best_node;
5472 }
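
/*
 * [Editor's worked example: not part of mm/page_alloc.c]
 * Suppose node 0 is building its fallback list, nodes 1 and 2 are both at
 * distance 20, node 1 has CPUs, node 2 is memory-only, node_load[] is still
 * zero and PENALTY_FOR_NODE_WITH_CPUS has its default value of 1.  Before
 * scaling, node 1 scores 20 + 0 + 1 = 21 while node 2 scores 20 + 0 + 0 = 20,
 * so the headless node 2 is picked first, matching the "prefer nodes with
 * no CPUs" behaviour documented above.
 */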
5473 
5474 
5475 /*
5476  * Build zonelists ordered by node and zones within node.
5477  * This results in maximum locality--normal zone overflows into local
5478  * DMA zone, if any--but risks exhausting DMA zone.
5479  */
5480 static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
5481                 unsigned nr_nodes)
5482 {
5483         struct zoneref *zonerefs;
5484         int i;
5485 
5486         zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5487 
5488         for (i = 0; i < nr_nodes; i++) {
5489                 int nr_zones;
5490 
5491                 pg_data_t *node = NODE_DATA(node_order[i]);
5492 
5493                 nr_zones = build_zonerefs_node(node, zonerefs);
5494                 zonerefs += nr_zones;
5495         }
5496         zonerefs->zone = NULL;
5497         zonerefs->zone_idx = 0;
5498 }
5499 
5500 /*
5501  * Build gfp_thisnode zonelists
5502  */
5503 static void build_thisnode_zonelists(pg_data_t *pgdat)
5504 {
5505         struct zoneref *zonerefs;
5506         int nr_zones;
5507 
5508         zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
5509         nr_zones = build_zonerefs_node(pgdat, zonerefs);
5510         zonerefs += nr_zones;
5511         zonerefs->zone = NULL;
5512         zonerefs->zone_idx = 0;
5513 }
5514 
5515 /*
5516  * Build zonelists ordered by zone and nodes within zones.
5517  * This results in conserving DMA zone[s] until all Normal memory is
5518  * exhausted, but results in overflowing to remote node while memory
5519  * may still exist in local DMA zone.
5520  */
5521 
5522 static void build_zonelists(pg_data_t *pgdat)
5523 {
5524         static int node_order[MAX_NUMNODES];
5525         int node, load, nr_nodes = 0;
5526         nodemask_t used_mask;
5527         int local_node, prev_node;
5528 
5529         /* NUMA-aware ordering of nodes */
5530         local_node = pgdat->node_id;
5531         load = nr_online_nodes;
5532         prev_node = local_node;
5533         nodes_clear(used_mask);
5534 
5535         memset(node_order, 0, sizeof(node_order));
5536         while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
5537                 /*
5538                  * We don't want to pressure a particular node.
5539                  * So add a penalty to the first node in the same
5540                  * distance group to make it round-robin.
5541                  */
5542                 if (node_distance(local_node, node) !=
5543                     node_distance(local_node, prev_node))
5544                         node_load[node] = load;
5545 
5546                 node_order[nr_nodes++] = node;
5547                 prev_node = node;
5548                 load--;
5549         }
5550 
5551         build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
5552         build_thisnode_zonelists(pgdat);
5553 }
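
/*
 * [Editor's illustrative sketch: not part of mm/page_alloc.c]
 * After build_zonelists() the fallback list can be walked with the standard
 * iterator; zones come out highest first within each node, local node first.
 * Hypothetical debugging helper (name made up):
 */
static void dump_fallback_zonelist_example(pg_data_t *pgdat)
{
        struct zonelist *zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
        struct zoneref *z;
        struct zone *zone;

        for_each_zone_zonelist(zone, z, zonelist, MAX_NR_ZONES - 1)
                pr_info("node %d: zone %s\n", zone_to_nid(zone), zone->name);
}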
5554 
5555 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
5556 /*
5557  * Return node id of node used for "local" allocations.
5558  * I.e., first node id of first zone in arg node's generic zonelist.
5559  * Used for initializing percpu 'numa_mem', which is used primarily
5560  * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
5561  */
5562 int local_memory_node(int node)
5563 {
5564         struct zoneref *z;
5565 
5566         z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
5567                                    gfp_zone(GFP_KERNEL),
5568                                    NULL);
5569         return zone_to_nid(z->zone);
5570 }
5571 #endif
5572 
5573 static void setup_min_unmapped_ratio(void);
5574 static void setup_min_slab_ratio(void);
5575 #else   /* CONFIG_NUMA */
5576 
5577 static void build_zonelists(pg_data_t *pgdat)
5578 {
5579         int node, local_node;
5580         struct zoneref *zonerefs;
5581         int nr_zones;
5582 
5583         local_node = pgdat->node_id;
5584 
5585         zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5586         nr_zones = build_zonerefs_node(pgdat, zonerefs);
5587         zonerefs += nr_zones;
5588 
5589         /*
5590          * Now we build the zonelist so that it contains the zones
5591          * of all the other nodes.
5592          * We don't want to pressure a particular node, so when
5593          * building the zones for node N, we make sure that the
5594          * zones coming right after the local ones are those from
5595          * node N+1 (modulo N)
5596          */
5597         for (node = local_node + 1; node < MAX_NUMNODES; node++) {
5598                 if (!node_online(node))
5599                         continue;
5600                 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5601                 zonerefs += nr_zones;
5602         }
5603         for (node = 0; node < local_node; node++) {
5604                 if (!node_online(node))
5605                         continue;
5606                 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5607                 zonerefs += nr_zones;
5608         }
5609 
5610         zonerefs->zone = NULL;
5611         zonerefs->zone_idx = 0;
5612 }
5613 
5614 #endif  /* CONFIG_NUMA */
5615 
5616 /*
5617  * Boot pageset table. One per cpu which is going to be used for all
5618  * zones and all nodes. The parameters will be set in such a way
5619  * that an item put on a list will immediately be handed over to
5620  * the buddy list. This is safe since pageset manipulation is done
5621  * with interrupts disabled.
5622  *
5623  * The boot_pagesets must be kept even after bootup is complete for
5624  * unused processors and/or zones. They do play a role for bootstrapping
5625  * hotplugged processors.
5626  *
5627  * zoneinfo_show() and maybe other functions do
5628  * not check if the processor is online before following the pageset pointer.
5629  * Other parts of the kernel may not check if the zone is available.
5630  */
5631 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
5632 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
5633 static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
5634 
5635 static void __build_all_zonelists(void *data)
5636 {
5637         int nid;
5638         int __maybe_unused cpu;
5639         pg_data_t *self = data;
5640         static DEFINE_SPINLOCK(lock);
5641 
5642         spin_lock(&lock);
5643 
5644 #ifdef CONFIG_NUMA
5645         memset(node_load, 0, sizeof(node_load));
5646 #endif
5647 
5648         /*
5649          * This node is hot-added and no memory is yet present. So just
5650          * building zonelists is fine - no need to touch other nodes.
5651          */
5652         if (self && !node_online(self->node_id)) {
5653                 build_zonelists(self);
5654         } else {
5655                 for_each_online_node(nid) {
5656                         pg_data_t *pgdat = NODE_DATA(nid);
5657 
5658                         build_zonelists(pgdat);
5659                 }
5660 
5661 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
5662                 /*
5663                  * We now know the "local memory node" for each node--
5664                  * i.e., the node of the first zone in the generic zonelist.
5665                  * Set up numa_mem percpu variable for on-line cpus.  During
5666                  * boot, only the boot cpu should be on-line;  we'll init the
5667                  * secondary cpus' numa_mem as they come on-line.  During
5668                  * node/memory hotplug, we'll fixup all on-line cpus.
5669                  */
5670                 for_each_online_cpu(cpu)
5671                         set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
5672 #endif
5673         }
5674 
5675         spin_unlock(&lock);
5676 }
5677 
5678 static noinline void __init
5679 build_all_zonelists_init(void)
5680 {
5681         int cpu;
5682 
5683         __build_all_zonelists(NULL);
5684 
5685         /*
5686          * Initialize the boot_pagesets that are going to be used
5687          * for bootstrapping processors. The real pagesets for
5688          * each zone will be allocated later when the per cpu
5689          * allocator is available.
5690          *
5691          * boot_pagesets are also used for bootstrapping offline
5692          * cpus if the system is already booted because the pagesets
5693          * are needed to initialize allocators on a specific cpu too.
5694          * E.g. the percpu allocator needs the page allocator, which
5695          * needs the percpu allocator in order to allocate its pagesets
5696          * (a chicken-egg dilemma).
5697          */
5698         for_each_possible_cpu(cpu)
5699                 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
5700 
5701         mminit_verify_zonelist();
5702         cpuset_init_current_mems_allowed();
5703 }
5704 
5705 /*
5706  * Called at boot (system_state == SYSTEM_BOOTING) and again on memory hotplug.
5707  *
5708  * __ref due to call of __init annotated helper build_all_zonelists_init
5709  * [protected by SYSTEM_BOOTING].
5710  */
5711 void __ref build_all_zonelists(pg_data_t *pgdat)
5712 {
5713         if (system_state == SYSTEM_BOOTING) {
5714                 build_all_zonelists_init();
5715         } else {
5716                 __build_all_zonelists(pgdat);
5717                 /* cpuset refresh routine should be here */
5718         }
5719         vm_total_pages = nr_free_pagecache_pages();
5720         /*
5721          * Disable grouping by mobility if the number of pages in the
5722          * system is too low to allow the mechanism to work. It would be
5723          * more accurate, but expensive to check per-zone. This check is
5724          * made on memory-hotadd so a system can start with mobility
5725          * disabled and enable it later.
5726          */
5727         if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
5728                 page_group_by_mobility_disabled = 1;
5729         else
5730                 page_group_by_mobility_disabled = 0;
5731 
5732         pr_info("Built %u zonelists, mobility grouping %s.  Total pages: %ld\n",
5733                 nr_online_nodes,
5734                 page_group_by_mobility_disabled ? "off" : "on",
5735                 vm_total_pages);
5736 #ifdef CONFIG_NUMA
5737         pr_info("Policy zone: %s\n", zone_names[policy_zone]);
5738 #endif
5739 }
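/*
 * Illustrative user-space sketch (not part of page_alloc.c): back-of-the-
 * envelope check of the mobility-grouping cutoff above.  The constants are
 * assumed example values: 4K pages, 512-page (2MB) pageblocks and six
 * migrate types (a configuration with CMA and memory isolation enabled).
 */
#include <stdio.h>

int main(void)
{
        const unsigned long example_pageblock_nr_pages = 512;  /* 2MB / 4K */
        const unsigned long example_migrate_types = 6;
        const unsigned long example_page_size = 4096;
        unsigned long threshold_pages =
                example_pageblock_nr_pages * example_migrate_types;

        /* systems below roughly this much memory skip mobility grouping */
        printf("grouping disabled below %lu pages (%lu MB)\n",
               threshold_pages,
               threshold_pages * example_page_size / (1024 * 1024));
        return 0;
}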
5740 
5741 /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
5742 static bool __meminit
5743 overlap_memmap_init(unsigned long zone, unsigned long *pfn)
5744 {
5745 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5746         static struct memblock_region *r;
5747 
5748         if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
5749                 if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
5750                         for_each_memblock(memory, r) {
5751                                 if (*pfn < memblock_region_memory_end_pfn(r))
5752                                         break;
5753                         }
5754                 }
5755                 if (*pfn >= memblock_region_memory_base_pfn(r) &&
5756                     memblock_is_mirror(r)) {
5757                         *pfn = memblock_region_memory_end_pfn(r);
5758                         return true;
5759                 }
5760         }
5761 #endif
5762         return false;
5763 }
5764 
5765 /*
5766  * Initially all pages are reserved - free ones are freed
5767  * up by memblock_free_all() once the early boot process is
5768  * done. Non-atomic initialization, single-pass.
5769  */
5770 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5771                 unsigned long start_pfn, enum memmap_context context,
5772                 struct vmem_altmap *altmap)
5773 {
5774         unsigned long pfn, end_pfn = start_pfn + size;
5775         struct page *page;
5776 
5777         if (highest_memmap_pfn < end_pfn - 1)
5778                 highest_memmap_pfn = end_pfn - 1;
5779 
5780 #ifdef CONFIG_ZONE_DEVICE
5781         /*
5782          * Honor reservation requested by the driver for this ZONE_DEVICE
5783          * memory. We limit the total number of pages to initialize to just
5784          * those that might contain the memory mapping. We will defer the
5785          * ZONE_DEVICE page initialization until after we have released
5786          * the hotplug lock.
5787          */
5788         if (zone == ZONE_DEVICE) {
5789                 if (!altmap)
5790                         return;
5791 
5792                 if (start_pfn == altmap->base_pfn)
5793                         start_pfn += altmap->reserve;
5794                 end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
5795         }
5796 #endif
5797 
5798         for (pfn = start_pfn; pfn < end_pfn; pfn++) {
5799                 /*
5800                  * There can be holes in boot-time mem_map[]s handed to this
5801                  * function.  They do not exist on hotplugged memory.
5802                  */
5803                 if (context == MEMMAP_EARLY) {
5804                         if (!early_pfn_valid(pfn))
5805                                 continue;
5806                         if (!early_pfn_in_nid(pfn, nid))
5807                                 continue;
5808                         if (overlap_memmap_init(zone, &pfn))
5809                                 continue;
5810                         if (defer_init(nid, pfn, end_pfn))
5811                                 break;
5812                 }
5813 
5814                 page = pfn_to_page(pfn);
5815                 __init_single_page(page, pfn, zone, nid);
5816                 if (context == MEMMAP_HOTPLUG)
5817                         __SetPageReserved(page);
5818 
5819                 /*
5820                  * Mark the block movable so that blocks are reserved for
5821                  * movable at startup. This will force kernel allocations
5822                  * to reserve their blocks rather than leaking throughout
5823                  * the address space during boot when many long-lived
5824                  * kernel allocations are made.
5825                  *
5826                  * The bitmap is created for the zone's valid pfn range, but the
5827                  * memmap can be created for invalid pages (for alignment), so
5828                  * check here that we do not call set_pageblock_migratetype() on a
5829                  * pfn outside the zone.
5830                  */
5831                 if (!(pfn & (pageblock_nr_pages - 1))) {
5832                         set_pageblock_migratetype(page, MIGRATE_MOVABLE);
5833                         cond_resched();
5834                 }
5835         }
5836 }
5837 
5838 #ifdef CONFIG_ZONE_DEVICE
5839 void __ref memmap_init_zone_device(struct zone *zone,
5840                                    unsigned long start_pfn,
5841                                    unsigned long size,
5842                                    struct dev_pagemap *pgmap)
5843 {
5844         unsigned long pfn, end_pfn = start_pfn + size;
5845         struct pglist_data *pgdat = zone->zone_pgdat;
5846         unsigned long zone_idx = zone_idx(zone);
5847         unsigned long start = jiffies;
5848         int nid = pgdat->node_id;
5849 
5850         if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
5851                 return;
5852 
5853         /*
5854          * The call to memmap_init_zone should have already taken care
5855          * of the pages reserved for the memmap, so we can just jump to
5856          * the end of that region and start processing the device pages.
5857          */
5858         if (pgmap->altmap_valid) {
5859                 struct vmem_altmap *altmap = &pgmap->altmap;
5860 
5861                 start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
5862                 size = end_pfn - start_pfn;
5863         }
5864 
5865         for (pfn = start_pfn; pfn < end_pfn; pfn++) {
5866                 struct page *page = pfn_to_page(pfn);
5867 
5868                 __init_single_page(page, pfn, zone_idx, nid);
5869 
5870                 /*
5871                  * Mark the page reserved, as it will need to wait for the
5872                  * onlining phase before it is fully associated with a zone.
5873                  *
5874                  * We can use the non-atomic __set_bit operation for setting
5875                  * the flag as we are still initializing the pages.
5876                  */
5877                 __SetPageReserved(page);
5878 
5879                 /*
5880                  * ZONE_DEVICE pages union ->lru with a ->pgmap back
5881                  * pointer and hmm_data.  It is a bug if a ZONE_DEVICE
5882                  * page is ever freed or placed on a driver-private list.
5883                  */
5884                 page->pgmap = pgmap;
5885                 page->hmm_data = 0;
5886 
5887                 /*
5888                  * Mark the block movable so that blocks are reserved for
5889                  * movable at startup. This will force kernel allocations
5890                  * to reserve their blocks rather than leaking throughout
5891                  * the address space during boot when many long-lived
5892                  * kernel allocations are made.
5893                  *
5894                  * The bitmap is created for the zone's valid pfn range, but the
5895                  * memmap can be created for invalid pages (for alignment), so
5896                  * check here that we do not call set_pageblock_migratetype() on a
5897                  * pfn outside the zone.
5898                  *
5899                  * Please note that the MEMMAP_HOTPLUG path doesn't clear the
5900                  * memmap because this is done early in sparse_add_one_section().
5901                  */
5902                 if (!(pfn & (pageblock_nr_pages - 1))) {
5903                         set_pageblock_migratetype(page, MIGRATE_MOVABLE);
5904                         cond_resched();
5905                 }
5906         }
5907 
5908         pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
5909                 size, jiffies_to_msecs(jiffies - start));
5910 }
5911 
5912 #endif
5913 static void __meminit zone_init_free_lists(struct zone *zone)
5914 {
5915         unsigned int order, t;
5916         for_each_migratetype_order(order, t) {
5917                 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
5918                 zone->free_area[order].nr_free = 0;
5919         }
5920 }
5921 
5922 void __meminit __weak memmap_init(unsigned long size, int nid,
5923                                   unsigned long zone, unsigned long start_pfn)
5924 {
5925         memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL);
5926 }
5927 
5928 static int zone_batchsize(struct zone *zone)
5929 {
5930 #ifdef CONFIG_MMU
5931         int batch;
5932 
5933         /*
5934          * The per-cpu-pages pools are set to around 1/1000th of the
5935          * size of the zone.
5936          */
5937         batch = zone_managed_pages(zone) / 1024;
5938         /* But no more than a meg. */
5939         if (batch * PAGE_SIZE > 1024 * 1024)
5940                 batch = (1024 * 1024) / PAGE_SIZE;
5941         batch /= 4;             /* We effectively *= 4 below */
5942         if (batch < 1)
5943                 batch = 1;
5944 
5945         /*
5946          * Clamp the batch to a 2^n - 1 value. Having a power
5947          * of 2 value was found to be more likely to have
5948          * suboptimal cache aliasing properties in some cases.
5949          *
5950          * For example if 2 tasks are alternately allocating
5951          * batches of pages, one task can end up with a lot
5952          * of pages of one half of the possible page colors
5953          * and the other with pages of the other colors.
5954          */
5955         batch = rounddown_pow_of_two(batch + batch/2) - 1;
5956 
5957         return batch;
5958 
5959 #else
5960         /* The deferral and batching of frees should be suppressed under NOMMU
5961          * conditions.
5962          *
5963          * The problem is that NOMMU needs to be able to allocate large chunks
5964          * of contiguous memory as there's no hardware page translation to
5965          * assemble apparent contiguous memory from discontiguous pages.
5966          *
5967          * Queueing large contiguous runs of pages for batching, however,
5968          * causes the pages to actually be freed in smaller chunks.  As there
5969          * can be a significant delay between the individual batches being
5970          * recycled, this leads to the once large chunks of space being
5971          * fragmented and becoming unavailable for high-order allocations.
5972          */
5973         return 0;
5974 #endif
5975 }
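/*
 * Illustrative user-space sketch (not part of page_alloc.c): the batch
 * sizing above for a hypothetical zone, i.e. roughly 1/1024th of the
 * managed pages, capped at 1MB worth of pages, quartered, then clamped
 * to a 2^n - 1 value.  PAGE_SIZE and the page counts are assumed values.
 */
#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 4096UL

/* behaves like rounddown_pow_of_two() for v >= 1 */
static unsigned long example_rounddown_pow_of_two(unsigned long v)
{
        unsigned long p = 1;

        while (p * 2 <= v)
                p *= 2;
        return p;
}

static int example_zone_batchsize(unsigned long managed_pages)
{
        unsigned long batch = managed_pages / 1024;

        if (batch * EXAMPLE_PAGE_SIZE > 1024 * 1024)
                batch = (1024 * 1024) / EXAMPLE_PAGE_SIZE;
        batch /= 4;
        if (batch < 1)
                batch = 1;
        return (int)(example_rounddown_pow_of_two(batch + batch / 2) - 1);
}

int main(void)
{
        /* e.g. a 4GB zone of 4K pages manages 1048576 pages */
        printf("batch for 1048576 pages: %d\n",
               example_zone_batchsize(1048576UL));      /* prints 63 */
        printf("batch for 16384 pages:   %d\n",
               example_zone_batchsize(16384UL));        /* prints 3 */
        return 0;
}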
5976 
5977 /*
5978  * pcp->high and pcp->batch values are related and dependent on one another:
5979  * ->batch must never be higher than ->high.
5980  * The following function updates them in a safe manner without read side
5981  * locking.
5982  *
5983  * Any new users of pcp->batch and pcp->high should ensure they can cope with
5984  * those fields changing asynchronously (according to the above rule).
5985  *
5986  * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
5987  * outside of boot time (or some other assurance that no concurrent updaters
5988  * exist).
5989  */
5990 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
5991                 unsigned long batch)
5992 {
5993         /* start with a fail-safe value for batch */
5994         pcp->batch = 1;
5995         smp_wmb();
5996 
5997         /* Update high, then batch, in order */
5998         pcp->high = high;
5999         smp_wmb();
6000 
6001         pcp->batch = batch;
6002 }
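/*
 * Illustrative user-space sketch (not part of page_alloc.c): single-threaded
 * walk through the update ordering above, checking that batch <= high holds
 * at every intermediate step.  The smp_wmb() calls in the real function are
 * what make this ordering visible to concurrent readers; the values here are
 * made-up examples.
 */
#include <assert.h>
#include <stdio.h>

struct example_pcp {
        unsigned long high;
        unsigned long batch;
};

static void example_check(const struct example_pcp *p)
{
        assert(p->batch <= p->high);
}

static void example_pageset_update(struct example_pcp *p,
                                   unsigned long high, unsigned long batch)
{
        p->batch = 1;           /* fail-safe value first */
        example_check(p);
        p->high = high;         /* then the new high watermark */
        example_check(p);
        p->batch = batch;       /* finally the real batch */
        example_check(p);
}

int main(void)
{
        struct example_pcp pcp = { .high = 378, .batch = 63 };

        example_pageset_update(&pcp, 186, 31);  /* shrink */
        example_pageset_update(&pcp, 762, 127); /* grow */
        printf("final high=%lu batch=%lu\n", pcp.high, pcp.batch);
        return 0;
}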
6003 
6004 /* a companion to pageset_set_high() */
6005 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
6006 {
6007         pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
6008 }
6009 
6010 static void pageset_init(struct per_cpu_pageset *p)
6011 {
6012         struct per_cpu_pages *pcp;
6013         int migratetype;
6014 
6015         memset(p, 0, sizeof(*p));
6016 
6017         pcp = &p->pcp;
6018         for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
6019                 INIT_LIST_HEAD(&pcp->lists[migratetype]);
6020 }
6021 
6022 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
6023 {
6024         pageset_init(p);
6025         pageset_set_batch(p, batch);
6026 }
6027 
6028 /*
6029  * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
6030  * to the value high for the pageset p.
6031  */
6032 static void pageset_set_high(struct per_cpu_pageset *p,
6033                                 unsigned long high)
6034 {
6035         unsigned long batch = max(1UL, high / 4);
6036         if ((high / 4) > (PAGE_SHIFT * 8))
6037                 batch = PAGE_SHIFT * 8;
6038 
6039         pageset_update(&p->pcp, high, batch);
6040 }
6041 
6042 static void pageset_set_high_and_batch(struct zone *zone,
6043                                        struct per_cpu_pageset *pcp)
6044 {
6045         if (percpu_pagelist_fraction)
6046                 pageset_set_high(pcp,
6047                         (zone_managed_pages(zone) /
6048                                 percpu_pagelist_fraction));
6049         else
6050                 pageset_set_batch(pcp, zone_batchsize(zone));
6051 }
6052 
6053 static void __meminit zone_pageset_init(struct zone *zone, int cpu)
6054 {
6055         struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
6056 
6057         pageset_init(pcp);
6058         pageset_set_high_and_batch(zone, pcp);
6059 }
6060 
6061 void __meminit setup_zone_pageset(struct zone *zone)
6062 {
6063         int cpu;
6064         zone->pageset = alloc_percpu(struct per_cpu_pageset);
6065         for_each_possible_cpu(cpu)
6066                 zone_pageset_init(zone, cpu);
6067 }
6068 
6069 /*
6070  * Allocate per cpu pagesets and initialize them.
6071  * Before this call only boot pagesets were available.
6072  */
6073 void __init setup_per_cpu_pageset(void)
6074 {
6075         struct pglist_data *pgdat;
6076         struct zone *zone;
6077 
6078         for_each_populated_zone(zone)
6079                 setup_zone_pageset(zone);
6080 
6081         for_each_online_pgdat(pgdat)
6082                 pgdat->per_cpu_nodestats =
6083                         alloc_percpu(struct per_cpu_nodestat);
6084 }
6085 
6086 static __meminit void zone_pcp_init(struct zone *zone)
6087 {
6088         /*
6089          * per cpu subsystem is not up at this point. The following code
6090          * relies on the ability of the linker to provide the
6091          * offset of a (static) per cpu variable into the per cpu area.
6092          */
6093         zone->pageset = &boot_pageset;
6094 
6095         if (populated_zone(zone))
6096                 printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%u\n",
6097                         zone->name, zone->present_pages,
6098                                          zone_batchsize(zone));
6099 }
6100 
6101 void __meminit init_currently_empty_zone(struct zone *zone,
6102                                         unsigned long zone_start_pfn,
6103                                         unsigned long size)
6104 {
6105         struct pglist_data *pgdat = zone->zone_pgdat;
6106         int zone_idx = zone_idx(zone) + 1;
6107 
6108         if (zone_idx > pgdat->nr_zones)
6109                 pgdat->nr_zones = zone_idx;
6110 
6111         zone->zone_start_pfn = zone_start_pfn;
6112 
6113         mminit_dprintk(MMINIT_TRACE, "memmap_init",
6114                         "Initialising map node %d zone %lu pfns %lu -> %lu\n",
6115                         pgdat->node_id,
6116                         (unsigned long)zone_idx(zone),
6117                         zone_start_pfn, (zone_start_pfn + size));
6118 
6119         zone_init_free_lists(zone);
6120         zone->initialized = 1;
6121 }
6122 
6123 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6124 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
6125 
6126 /*
6127  * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
6128  */
6129 int __meminit __early_pfn_to_nid(unsigned long pfn,
6130                                         struct mminit_pfnnid_cache *state)
6131 {
6132         unsigned long start_pfn, end_pfn;
6133         int nid;
6134 
6135         if (state->last_start <= pfn && pfn < state->last_end)
6136                 return state->last_nid;
6137 
6138         nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
6139         if (nid != NUMA_NO_NODE) {
6140                 state->last_start = start_pfn;
6141                 state->last_end = end_pfn;
6142                 state->last_nid = nid;
6143         }
6144 
6145         return nid;
6146 }
6147 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
6148 
6149 /**
6150  * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
6151  * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
6152  * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
6153  *
6154  * If an architecture guarantees that all ranges registered contain no holes
6155  * and may be freed, this function may be used instead of calling
6156  * memblock_free_early_nid() manually.
6157  */
6158 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
6159 {
6160         unsigned long start_pfn, end_pfn;
6161         int i, this_nid;
6162 
6163         for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
6164                 start_pfn = min(start_pfn, max_low_pfn);
6165                 end_pfn = min(end_pfn, max_low_pfn);
6166 
6167                 if (start_pfn < end_pfn)
6168                         memblock_free_early_nid(PFN_PHYS(start_pfn),
6169                                         (end_pfn - start_pfn) << PAGE_SHIFT,
6170                                         this_nid);
6171         }
6172 }
6173 
6174 /**
6175  * sparse_memory_present_with_active_regions - Call memory_present for each active range
6176  * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
6177  *
6178  * If an architecture guarantees that all ranges registered contain no holes and may
6179  * be freed, this function may be used instead of calling memory_present() manually.
6180  */
6181 void __init sparse_memory_present_with_active_regions(int nid)
6182 {
6183         unsigned long start_pfn, end_pfn;
6184         int i, this_nid;
6185 
6186         for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
6187                 memory_present(this_nid, start_pfn, end_pfn);
6188 }
6189 
6190 /**
6191  * get_pfn_range_for_nid - Return the start and end page frames for a node
6192  * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
6193  * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
6194  * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
6195  *
6196  * It returns the start and end page frame of a node based on information
6197  * provided by memblock_set_node(). If called for a node
6198  * with no available memory, a warning is printed and the start and end
6199  * PFNs will be 0.
6200  */
6201 void __init get_pfn_range_for_nid(unsigned int nid,
6202                         unsigned long *start_pfn, unsigned long *end_pfn)
6203 {
6204         unsigned long this_start_pfn, this_end_pfn;
6205         int i;
6206 
6207         *start_pfn = -1UL;
6208         *end_pfn = 0;
6209 
6210         for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
6211                 *start_pfn = min(*start_pfn, this_start_pfn);
6212                 *end_pfn = max(*end_pfn, this_end_pfn);
6213         }
6214 
6215         if (*start_pfn == -1UL)
6216                 *start_pfn = 0;
6217 }
6218 
6219 /*
6220  * This finds a zone that can be used for ZONE_MOVABLE pages. The
6221  * assumption is made that zones within a node are ordered in monotonically
6222  * increasing memory addresses so that the "highest" populated zone is used.
6223  */
6224 static void __init find_usable_zone_for_movable(void)
6225 {
6226         int zone_index;
6227         for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
6228                 if (zone_index == ZONE_MOVABLE)
6229                         continue;
6230 
6231                 if (arch_zone_highest_possible_pfn[zone_index] >
6232                                 arch_zone_lowest_possible_pfn[zone_index])
6233                         break;
6234         }
6235 
6236         VM_BUG_ON(zone_index == -1);
6237         movable_zone = zone_index;
6238 }
6239 
6240 /*
6241  * The zone ranges provided by the architecture do not include ZONE_MOVABLE
6242  * because it is sized independently of the architecture. Unlike the other zones,
6243  * the starting point for ZONE_MOVABLE is not fixed. It may be different
6244  * in each node depending on the size of each node and how evenly kernelcore
6245  * is distributed. This helper function adjusts the zone ranges
6246  * provided by the architecture for a given node by using the end of the
6247  * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
6248  * zones within a node are in order of monotonically increasing memory addresses.
6249  */
6250 static void __init adjust_zone_range_for_zone_movable(int nid,
6251                                         unsigned long zone_type,
6252                                         unsigned long node_start_pfn,
6253                                         unsigned long node_end_pfn,
6254                                         unsigned long *zone_start_pfn,
6255                                         unsigned long *zone_end_pfn)
6256 {
6257         /* Only adjust if ZONE_MOVABLE is on this node */
6258         if (zone_movable_pfn[nid]) {
6259                 /* Size ZONE_MOVABLE */
6260                 if (zone_type == ZONE_MOVABLE) {
6261                         *zone_start_pfn = zone_movable_pfn[nid];
6262                         *zone_end_pfn = min(node_end_pfn,
6263                                 arch_zone_highest_possible_pfn[movable_zone]);
6264 
6265                 /* Adjust for ZONE_MOVABLE starting within this range */
6266                 } else if (!mirrored_kernelcore &&
6267                         *zone_start_pfn < zone_movable_pfn[nid] &&
6268                         *zone_end_pfn > zone_movable_pfn[nid]) {
6269                         *zone_end_pfn = zone_movable_pfn[nid];
6270 
6271                 /* Check if this whole range is within ZONE_MOVABLE */
6272                 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
6273                         *zone_start_pfn = *zone_end_pfn;
6274         }
6275 }
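/*
 * Illustrative user-space sketch (not part of page_alloc.c): the three main
 * cases handled above, using made-up PFNs and ignoring mirrored kernelcore
 * and the arch zone limits.  A hypothetical node spans PFNs [0, 1000) and
 * ZONE_MOVABLE starts at PFN 700.
 */
#include <stdio.h>

static void example_adjust(const char *name, int is_movable,
                           unsigned long movable_start_pfn,
                           unsigned long node_end_pfn,
                           unsigned long *zs, unsigned long *ze)
{
        if (is_movable) {
                /* ZONE_MOVABLE itself runs from the cutoff to the node end */
                *zs = movable_start_pfn;
                *ze = node_end_pfn;
        } else if (*zs < movable_start_pfn && *ze > movable_start_pfn) {
                /* a kernel zone straddling the cutoff is truncated */
                *ze = movable_start_pfn;
        } else if (*zs >= movable_start_pfn) {
                /* a kernel zone entirely above the cutoff becomes empty */
                *zs = *ze;
        }
        printf("%-12s -> [%4lu, %4lu) %lu pages\n", name, *zs, *ze, *ze - *zs);
}

int main(void)
{
        unsigned long normal_s = 256, normal_e = 1000;
        unsigned long high_s = 800, high_e = 1000;
        unsigned long movable_s = 0, movable_e = 0;

        example_adjust("ZONE_NORMAL", 0, 700, 1000, &normal_s, &normal_e);
        example_adjust("ZONE_HIGHMEM", 0, 700, 1000, &high_s, &high_e);
        example_adjust("ZONE_MOVABLE", 1, 700, 1000, &movable_s, &movable_e);
        return 0;
}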
6276 
6277 /*
6278  * Return the number of pages a zone spans in a node, including holes
6279  * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
6280  */
6281 static unsigned long __init zone_spanned_pages_in_node(int nid,
6282                                         unsigned long zone_type,
6283                                         unsigned long node_start_pfn,
6284                                         unsigned long node_end_pfn,
6285                                         unsigned long *zone_start_pfn,
6286                                         unsigned long *zone_end_pfn,
6287                                         unsigned long *ignored)
6288 {
6289         unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
6290         unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
6291         /* When hot-adding a new node from cpu_up(), the node should be empty */
6292         if (!node_start_pfn && !node_end_pfn)
6293                 return 0;
6294 
6295         /* Get the start and end of the zone */
6296         *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
6297         *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
6298         adjust_zone_range_for_zone_movable(nid, zone_type,
6299                                 node_start_pfn, node_end_pfn,
6300                                 zone_start_pfn, zone_end_pfn);
6301 
6302         /* Check that this node has pages within the zone's required range */
6303         if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
6304                 return 0;
6305 
6306         /* Move the zone boundaries inside the node if necessary */
6307         *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
6308         *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
6309 
6310         /* Return the spanned pages */
6311         return *zone_end_pfn - *zone_start_pfn;
6312 }
6313 
6314 /*
6315  * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
6316  * then all holes in the requested range will be accounted for.
6317  */
6318 unsigned long __init __absent_pages_in_range(int nid,
6319                                 unsigned long range_start_pfn,
6320                                 unsigned long range_end_pfn)
6321 {
6322         unsigned long nr_absent = range_end_pfn - range_start_pfn;
6323         unsigned long start_pfn, end_pfn;
6324         int i;
6325 
6326         for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
6327                 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
6328                 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
6329                 nr_absent -= end_pfn - start_pfn;
6330         }
6331         return nr_absent;
6332 }
6333 
6334 /**
6335  * absent_pages_in_range - Return number of page frames in holes within a range
6336  * @start_pfn: The start PFN to start searching for holes
6337  * @end_pfn: The end PFN to stop searching for holes
6338  *
6339  * Return: the number of page frames in memory holes within a range.
6340  */
6341 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
6342                                                         unsigned long end_pfn)
6343 {
6344         return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
6345 }
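/*
 * Illustrative user-space sketch (not part of page_alloc.c): the hole
 * accounting above starts from the full range length and subtracts every
 * registered memory range clamped to that range.  The two memory ranges
 * below are made-up examples.
 */
#include <stdio.h>

struct example_range {
        unsigned long start_pfn;
        unsigned long end_pfn;
};

static unsigned long example_clamp(unsigned long v, unsigned long lo,
                                   unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

static unsigned long example_absent_pages(const struct example_range *mem,
                                          int nr, unsigned long range_start,
                                          unsigned long range_end)
{
        unsigned long nr_absent = range_end - range_start;
        int i;

        for (i = 0; i < nr; i++) {
                unsigned long s = example_clamp(mem[i].start_pfn,
                                                range_start, range_end);
                unsigned long e = example_clamp(mem[i].end_pfn,
                                                range_start, range_end);
                nr_absent -= e - s;
        }
        return nr_absent;
}

int main(void)
{
        /* memory at PFNs [0, 100) and [150, 400): a 50-page hole */
        static const struct example_range mem[] = {
                { 0, 100 }, { 150, 400 },
        };

        printf("holes in [0, 400): %lu pages\n",
               example_absent_pages(mem, 2, 0, 400));   /* prints 50 */
        return 0;
}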
6346 
6347 /* Return the number of page frames in holes in a zone on a node */
6348 static unsigned long __init zone_absent_pages_in_node(int nid,
6349                                         unsigned long zone_type,
6350                                         unsigned long node_start_pfn,
6351                                         unsigned long node_end_pfn,
6352                                         unsigned long *ignored)
6353 {
6354         unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
6355         unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
6356         unsigned long zone_start_pfn, zone_end_pfn;
6357         unsigned long nr_absent;
6358 
6359         /* When hot-adding a new node from cpu_up(), the node should be empty */
6360         if (!node_start_pfn && !node_end_pfn)
6361                 return 0;
6362 
6363         zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
6364         zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
6365 
6366         adjust_zone_range_for_zone_movable(nid, zone_type,
6367                         node_start_pfn, node_end_pfn,
6368                         &zone_start_pfn, &zone_end_pfn);
6369         nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
6370 
6371         /*
6372          * ZONE_MOVABLE handling.
6373          * Treat pages that will end up in ZONE_MOVABLE but are spanned by
6374          * ZONE_NORMAL as absent pages, and vice versa.
6375          */
6376         if (mirrored_kernelcore && zone_movable_pfn[nid]) {
6377                 unsigned long start_pfn, end_pfn;
6378                 struct memblock_region *r;
6379 
6380                 for_each_memblock(memory, r) {
6381                         start_pfn = clamp(memblock_region_memory_base_pfn(r),
6382                                           zone_start_pfn, zone_end_pfn);
6383                         end_pfn = clamp(memblock_region_memory_end_pfn(r),
6384                                         zone_start_pfn, zone_end_pfn);
6385 
6386                         if (zone_type == ZONE_MOVABLE &&
6387                             memblock_is_mirror(r))
6388                                 nr_absent += end_pfn - start_pfn;
6389 
6390                         if (zone_type == ZONE_NORMAL &&
6391                             !memblock_is_mirror(r))
6392                                 nr_absent += end_pfn - start_pfn;
6393                 }
6394         }
6395 
6396         return nr_absent;
6397 }
6398 
6399 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6400 static inline unsigned long __init zone_spanned_pages_in_node(int nid,
6401                                         unsigned long zone_type,
6402                                         unsigned long node_start_pfn,
6403                                         unsigned long node_end_pfn,
6404                                         unsigned long *zone_start_pfn,
6405                                         unsigned long *zone_end_pfn,
6406                                         unsigned long *zones_size)
6407 {
6408         unsigned int zone;
6409 
6410         *zone_start_pfn = node_start_pfn;
6411         for (zone = 0; zone < zone_type; zone++)
6412                 *zone_start_pfn += zones_size[zone];
6413 
6414         *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
6415 
6416         return zones_size[zone_type];
6417 }
6418 
6419 static inline unsigned long __init zone_absent_pages_in_node(int nid,
6420                                                 unsigned long zone_type,
6421                                                 unsigned long node_start_pfn,
6422                                                 unsigned long node_end_pfn,
6423                                                 unsigned long *zholes_size)
6424 {
6425         if (!zholes_size)
6426                 return 0;
6427 
6428         return zholes_size[zone_type];
6429 }
6430 
6431 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6432 
6433 static void __init calculate_node_totalpages(struct pglist_data *pgdat,
6434                                                 unsigned long node_start_pfn,
6435                                                 unsigned long node_end_pfn,
6436                                                 unsigned long *zones_size,
6437                                                 unsigned long *zholes_size)
6438 {
6439         unsigned long realtotalpages = 0, totalpages = 0;
6440         enum zone_type i;
6441 
6442         for (i = 0; i < MAX_NR_ZONES; i++) {
6443                 struct zone *zone = pgdat->node_zones + i;
6444                 unsigned long zone_start_pfn, zone_end_pfn;
6445                 unsigned long size, real_size;
6446 
6447                 size = zone_spanned_pages_in_node(pgdat->node_id, i,
6448                                                   node_start_pfn,
6449                                                   node_end_pfn,
6450                                                   &zone_start_pfn,
6451                                                   &zone_end_pfn,
6452                                                   zones_size);
6453                 real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
6454                                                   node_start_pfn, node_end_pfn,
6455                                                   zholes_size);
6456                 if (size)
6457                         zone->zone_start_pfn = zone_start_pfn;
6458                 else
6459                         zone->zone_start_pfn = 0;
6460                 zone->spanned_pages = size;
6461                 zone->present_pages = real_size;
6462 
6463                 totalpages += size;
6464                 realtotalpages += real_size;
6465         }
6466 
6467         pgdat->node_spanned_pages = totalpages;
6468         pgdat->node_present_pages = realtotalpages;
6469         printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
6470                                                         realtotalpages);
6471 }
6472 
6473 #ifndef CONFIG_SPARSEMEM
6474 /*
6475  * Calculate the size of the zone->blockflags rounded to an unsigned long
6476  * Start by making sure zonesize is a multiple of pageblock_nr_pages by
6477  * rounding up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock,
6478  * round what is now in bits up to the nearest long in bits, and then
6479  * return it in bytes.
6480  */
6481 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
6482 {
6483         unsigned long usemapsize;
6484 
6485         zonesize += zone_start_pfn & (pageblock_nr_pages-1);
6486         usemapsize = roundup(zonesize, pageblock_nr_pages);
6487         usemapsize = usemapsize >> pageblock_order;
6488         usemapsize *= NR_PAGEBLOCK_BITS;
6489         usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
6490 
6491         return usemapsize / 8;
6492 }
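/*
 * Illustrative user-space sketch (not part of page_alloc.c): worked example
 * of the usemap sizing above.  The pageblock size, NR_PAGEBLOCK_BITS and the
 * zone size are assumed values (512-page pageblocks, 4 bits per pageblock,
 * a 1GB zone of 4K pages); other configurations use different constants.
 */
#include <stdio.h>

#define EXAMPLE_PAGEBLOCK_NR_PAGES      512UL
#define EXAMPLE_NR_PAGEBLOCK_BITS       4UL

static unsigned long example_roundup(unsigned long x, unsigned long to)
{
        return ((x + to - 1) / to) * to;
}

static unsigned long example_usemap_size(unsigned long zone_start_pfn,
                                         unsigned long zonesize)
{
        unsigned long usemapsize;

        zonesize += zone_start_pfn & (EXAMPLE_PAGEBLOCK_NR_PAGES - 1);
        usemapsize = example_roundup(zonesize, EXAMPLE_PAGEBLOCK_NR_PAGES);
        usemapsize /= EXAMPLE_PAGEBLOCK_NR_PAGES;       /* -> pageblocks */
        usemapsize *= EXAMPLE_NR_PAGEBLOCK_BITS;        /* -> bits */
        usemapsize = example_roundup(usemapsize, 8 * sizeof(unsigned long));

        return usemapsize / 8;                          /* -> bytes */
}

int main(void)
{
        /* 262144 pages = 1GB of 4K pages, zone starting at PFN 0 */
        printf("usemap bytes for a 1GB zone: %lu\n",
               example_usemap_size(0, 262144UL));       /* prints 256 */
        return 0;
}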
6493 
6494 static void __ref setup_usemap(struct pglist_data *pgdat,
6495                                 struct zone *zone,
6496                                 unsigned long zone_start_pfn,
6497                                 unsigned long zonesize)
6498 {
6499         unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
6500         zone->pageblock_flags = NULL;
6501         if (usemapsize) {
6502                 zone->pageblock_flags =
6503                         memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
6504                                             pgdat->node_id);
6505                 if (!zone->pageblock_flags)
6506                         panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
6507                               usemapsize, zone->name, pgdat->node_id);
6508         }
6509 }
6510 #else
6511 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
6512                                 unsigned long zone_start_pfn, unsigned long zonesize) {}
6513 #endif /* CONFIG_SPARSEMEM */
6514 
6515 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
6516 
6517 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
6518 void __init set_pageblock_order(void)
6519 {
6520         unsigned int order;
6521 
6522         /* Check that pageblock_nr_pages has not already been setup */
6523         if (pageblock_order)
6524                 return;
6525 
6526         if (HPAGE_SHIFT > PAGE_SHIFT)
6527                 order = HUGETLB_PAGE_ORDER;
6528         else
6529                 order = MAX_ORDER - 1;
6530 
6531         /*
6532          * Assume the largest contiguous order of interest is a huge page.
6533          * This value may be variable depending on boot parameters on IA64 and
6534          * powerpc.
6535          */
6536         pageblock_order = order;
6537 }
6538 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
6539 
6540 /*
6541  * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
6542  * is unused as pageblock_order is set at compile-time. See
6543  * include/linux/pageblock-flags.h for the values of pageblock_order based on
6544  * the kernel config.
6545  */
6546 void __init set_pageblock_order(void)
6547 {
6548 }
6549 
6550 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
6551 
6552 static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
6553                                                 unsigned long present_pages)
6554 {
6555         unsigned long pages = spanned_pages;
6556 
6557         /*
6558          * Provide a more accurate estimation if there are holes within
6559          * the zone and SPARSEMEM is in use. If there are holes within the
6560          * zone, each populated memory region may cost us one or two extra
6561          * memmap pages due to alignment because memmap pages for each
6562          * populated regions may not be naturally aligned on page boundary.
6563          * So the (present_pages >> 4) heuristic is a tradeoff for that.
6564          */
6565         if (spanned_pages > present_pages + (present_pages >> 4) &&
6566             IS_ENABLED(CONFIG_SPARSEMEM))
6567                 pages = present_pages;
6568 
6569         return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
6570 }
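/*
 * Illustrative user-space sketch (not part of page_alloc.c): worked example
 * of the memmap size estimate above.  sizeof(struct page) and the page size
 * are assumed to be 64 bytes and 4K; the spanned/present counts are made-up
 * examples of a zone where roughly half of the span is a hole.
 */
#include <stdio.h>

#define EXAMPLE_STRUCT_PAGE_SIZE        64UL
#define EXAMPLE_PAGE_SIZE               4096UL

static unsigned long example_calc_memmap_size(unsigned long spanned,
                                              unsigned long present,
                                              int sparsemem)
{
        unsigned long pages = spanned;

        /* with SPARSEMEM, large holes get no memmap, so count present pages */
        if (sparsemem && spanned > present + (present >> 4))
                pages = present;

        return (pages * EXAMPLE_STRUCT_PAGE_SIZE + EXAMPLE_PAGE_SIZE - 1) /
               EXAMPLE_PAGE_SIZE;
}

int main(void)
{
        /* a zone spanning 1M PFNs of which only 512K are present */
        printf("memmap pages (SPARSEMEM): %lu\n",
               example_calc_memmap_size(1048576UL, 524288UL, 1)); /* 8192 */
        printf("memmap pages (FLATMEM):   %lu\n",
               example_calc_memmap_size(1048576UL, 524288UL, 0)); /* 16384 */
        return 0;
}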
6571 
6572 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6573 static void pgdat_init_split_queue(struct pglist_data *pgdat)
6574 {
6575         spin_lock_init(&pgdat->split_queue_lock);
6576         INIT_LIST_HEAD(&pgdat->split_queue);
6577         pgdat->split_queue_len = 0;
6578 }
6579 #else
6580 static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
6581 #endif
6582 
6583 #ifdef CONFIG_COMPACTION
6584 static void pgdat_init_kcompactd(struct pglist_data *pgdat)
6585 {
6586         init_waitqueue_head(&pgdat->kcompactd_wait);
6587 }
6588 #else
6589 static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
6590 #endif
6591 
6592 static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
6593 {
6594         pgdat_resize_init(pgdat);
6595 
6596         pgdat_init_split_queue(pgdat);
6597         pgdat_init_kcompactd(pgdat);
6598 
6599         init_waitqueue_head(&pgdat->kswapd_wait);
6600         init_waitqueue_head(&pgdat->pfmemalloc_wait);
6601 
6602         pgdat_page_ext_init(pgdat);
6603         spin_lock_init(&pgdat->lru_lock);
6604         lruvec_init(node_lruvec(pgdat));
6605 }
6606 
6607 static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
6608                                                         unsigned long remaining_pages)
6609 {
6610         atomic_long_set(&zone->managed_pages, remaining_pages);
6611         zone_set_nid(zone, nid);
6612         zone->name = zone_names[idx];
6613         zone->zone_pgdat = NODE_DATA(nid);
6614         spin_lock_init(&zone->lock);
6615         zone_seqlock_init(zone);
6616         zone_pcp_init(zone);
6617 }
6618 
6619 /*
6620  * Set up the zone data structures
6621  * - init pgdat internals
6622  * - init all zones belonging to this node
6623  *
6624  * NOTE: this function is only called during memory hotplug
6625  */
6626 #ifdef CONFIG_MEMORY_HOTPLUG
6627 void __ref free_area_init_core_hotplug(int nid)
6628 {
6629         enum zone_type z;
6630         pg_data_t *pgdat = NODE_DATA(nid);
6631 
6632         pgdat_init_internals(pgdat);
6633         for (z = 0; z < MAX_NR_ZONES; z++)
6634                 zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
6635 }
6636 #endif
6637 
6638 /*
6639  * Set up the zone data structures:
6640  *   - mark all pages reserved
6641  *   - mark all memory queues empty
6642  *   - clear the memory bitmaps
6643  *
6644  * NOTE: pgdat should get zeroed by caller.
6645  * NOTE: this function is only called during early init.
6646  */
6647 static void __init free_area_init_core(struct pglist_data *pgdat)
6648 {
6649         enum zone_type j;
6650         int nid = pgdat->node_id;
6651 
6652         pgdat_init_internals(pgdat);
6653         pgdat->per_cpu_nodestats = &boot_nodestats;
6654 
6655         for (j = 0; j < MAX_NR_ZONES; j++) {
6656                 struct zone *zone = pgdat->node_zones + j;
6657                 unsigned long size, freesize, memmap_pages;
6658                 unsigned long zone_start_pfn = zone->zone_start_pfn;
6659 
6660                 size = zone->spanned_pages;
6661                 freesize = zone->present_pages;
6662 
6663                 /*
6664                  * Adjust freesize so that it accounts for how much memory
6665                  * is used by this zone for memmap. This affects the watermark
6666                  * and per-cpu initialisations
6667                  */
6668                 memmap_pages = calc_memmap_size(size, freesize);
6669                 if (!is_highmem_idx(j)) {
6670                         if (freesize >= memmap_pages) {
6671                                 freesize -= memmap_pages;
6672                                 if (memmap_pages)
6673                                         printk(KERN_DEBUG
6674                                                "  %s zone: %lu pages used for memmap\n",
6675                                                zone_names[j], memmap_pages);
6676                         } else
6677                                 pr_warn("  %s zone: %lu pages exceeds freesize %lu\n",
6678                                         zone_names[j], memmap_pages, freesize);
6679                 }
6680 
6681                 /* Account for reserved pages */
6682                 if (j == 0 && freesize > dma_reserve) {
6683                         freesize -= dma_reserve;
6684                         printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
6685                                         zone_names[0], dma_reserve);
6686                 }
6687 
6688                 if (!is_highmem_idx(j))
6689                         nr_kernel_pages += freesize;
6690                 /* Charge for highmem memmap if there are enough kernel pages */
6691                 else if (nr_kernel_pages > memmap_pages * 2)
6692                         nr_kernel_pages -= memmap_pages;
6693                 nr_all_pages += freesize;
6694 
6695                 /*
6696                  * Set an approximate value for lowmem here; it will be adjusted
6697                  * when the bootmem allocator frees pages into the buddy system.
6698                  * All highmem pages will be managed by the buddy system.
6699                  */
6700                 zone_init_internals(zone, j, nid, freesize);
6701 
6702                 if (!size)
6703                         continue;
6704 
6705                 set_pageblock_order();
6706                 setup_usemap(pgdat, zone, zone_start_pfn, size);
6707                 init_currently_empty_zone(zone, zone_start_pfn, size);
6708                 memmap_init(size, nid, j, zone_start_pfn);
6709         }
6710 }
6711 
6712 #ifdef CONFIG_FLAT_NODE_MEM_MAP
6713 static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6714 {
6715         unsigned long __maybe_unused start = 0;
6716         unsigned long __maybe_unused offset = 0;
6717 
6718         /* Skip empty nodes */
6719         if (!pgdat->node_spanned_pages)
6720                 return;
6721 
6722         start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
6723         offset = pgdat->node_start_pfn - start;
6724         /* ia64 gets its own node_mem_map, before this, without bootmem */
6725         if (!pgdat->node_mem_map) {
6726                 unsigned long size, end;
6727                 struct page *map;
6728 
6729                 /*
6730                  * The zone's endpoints aren't required to be MAX_ORDER
6731                  * aligned, but the node_mem_map endpoints must be, in order
6732                  * for the buddy allocator to function correctly.
6733                  */
6734                 end = pgdat_end_pfn(pgdat);
6735                 end = ALIGN(end, MAX_ORDER_NR_PAGES);
6736                 size = (end - start) * sizeof(struct page);
6737                 map = memblock_alloc_node(size, SMP_CACHE_BYTES,
6738                                           pgdat->node_id);
6739                 if (!map)
6740                         panic("Failed to allocate %ld bytes for node %d memory map\n",
6741                               size, pgdat->node_id);
6742                 pgdat->node_mem_map = map + offset;
6743         }
6744         pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
6745                                 __func__, pgdat->node_id, (unsigned long)pgdat,
6746                                 (unsigned long)pgdat->node_mem_map);
6747 #ifndef CONFIG_NEED_MULTIPLE_NODES
6748         /*
6749          * With no DISCONTIG, the global mem_map is just set as node 0's
6750          */
6751         if (pgdat == NODE_DATA(0)) {
6752                 mem_map = NODE_DATA(0)->node_mem_map;
6753 #if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
6754                 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
6755                         mem_map -= offset;
6756 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6757         }
6758 #endif
6759 }
6760 #else
6761 static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
6762 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
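/*
 * Illustrative user-space sketch (not part of page_alloc.c): worked example
 * of the start/offset rounding in alloc_node_mem_map() above, with assumed
 * values MAX_ORDER_NR_PAGES = 1024 and a node spanning PFNs [1200, 5000).
 */
#include <stdio.h>

int main(void)
{
        const unsigned long example_max_order_nr_pages = 1024;
        unsigned long node_start_pfn = 1200;
        unsigned long node_end_pfn = 5000;
        unsigned long start, offset, end, nr_map_entries;

        start = node_start_pfn & ~(example_max_order_nr_pages - 1); /* 1024 */
        offset = node_start_pfn - start;                            /* 176 */
        end = (node_end_pfn + example_max_order_nr_pages - 1) &
              ~(example_max_order_nr_pages - 1);                    /* 5120 */
        nr_map_entries = end - start;                               /* 4096 */

        printf("map covers PFNs [%lu, %lu): %lu struct pages, offset %lu\n",
               start, end, nr_map_entries, offset);
        return 0;
}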
6763 
6764 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
6765 static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
6766 {
6767         pgdat->first_deferred_pfn = ULONG_MAX;
6768 }
6769 #else
6770 static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
6771 #endif
6772 
6773 void __init free_area_init_node(int nid, unsigned long *zones_size,
6774                                    unsigned long node_start_pfn,
6775                                    unsigned long *zholes_size)
6776 {
6777         pg_data_t *pgdat = NODE_DATA(nid);
6778         unsigned long start_pfn = 0;
6779         unsigned long end_pfn = 0;
6780 
6781         /* pg_data_t should be reset to zero when it's allocated */
6782         WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
6783 
6784         pgdat->node_id = nid;
6785         pgdat->node_start_pfn = node_start_pfn;
6786         pgdat->per_cpu_nodestats = NULL;
6787 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6788         get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
6789         pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
6790                 (u64)start_pfn << PAGE_SHIFT,
6791                 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
6792 #else
6793         start_pfn = node_start_pfn;
6794 #endif
6795         calculate_node_totalpages(pgdat, start_pfn, end_pfn,
6796                                   zones_size, zholes_size);
6797 
6798         alloc_node_mem_map(pgdat);
6799         pgdat_set_deferred_range(pgdat);
6800 
6801         free_area_init_core(pgdat);
6802 }
6803 
6804 #if !defined(CONFIG_FLAT_NODE_MEM_MAP)
6805 /*
6806  * Zero all valid struct pages in range [spfn, epfn), return number of struct
6807  * pages zeroed
6808  */
6809 static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
6810 {
6811         unsigned long pfn;
6812         u64 pgcnt = 0;
6813 
6814         for (pfn = spfn; pfn < epfn; pfn++) {
6815                 if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
6816                         pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
6817                                 + pageblock_nr_pages - 1;
6818                         continue;
6819                 }
6820                 mm_zero_struct_page(pfn_to_page(pfn));
6821                 pgcnt++;
6822         }
6823 
6824         return pgcnt;
6825 }
6826 
6827 /*
6828  * Only struct pages that are backed by physical memory are zeroed and
6829  * initialized by going through __init_single_page(). But, there are some
6830  * struct pages which are reserved in the memblock allocator and whose fields
6831  * may be accessed (for example, page_to_pfn() on some configurations accesses
6832  * page->flags). We must explicitly zero those struct pages.
6833  *
6834  * This function also addresses a similar issue where struct pages are left
6835  * uninitialized because the physical address range is not covered by
6836  * memblock.memory or memblock.reserved. That could happen when memblock
6837  * layout is manually configured via memmap=.
6838  */
6839 void __init zero_resv_unavail(void)
6840 {
6841         phys_addr_t start, end;
6842         u64 i, pgcnt;
6843         phys_addr_t next = 0;
6844 
6845         /*
6846          * Loop through unavailable ranges not covered by memblock.memory.
6847          */
6848         pgcnt = 0;
6849         for_each_mem_range(i, &memblock.memory, NULL,
6850                         NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
6851                 if (next < start)
6852                         pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
6853                 next = end;
6854         }
6855         pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn);
6856 
6857         /*
6858          * Struct pages that do not have backing memory. This could be because
6859          * firmware is using some of this memory, or for some other reasons.
6860          */
6861         if (pgcnt)
6862                 pr_info("Zeroed struct page in unavailable ranges: %lld pages\n", pgcnt);
6863 }
6864 #endif /* !CONFIG_FLAT_NODE_MEM_MAP */
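/*
 * Illustrative user-space sketch (not part of page_alloc.c): the gap walk in
 * zero_resv_unavail() above remembers the end of the previous memory range
 * and treats everything before the next range's start (and the tail up to
 * max_pfn) as a gap.  The ranges and max_pfn are made-up examples, and the
 * sketch works directly in PFNs rather than physical addresses.
 */
#include <stdio.h>

struct example_range {
        unsigned long start_pfn;
        unsigned long end_pfn;
};

int main(void)
{
        /* memory at PFNs [16, 256) and [512, 1000); max_pfn = 1024 */
        static const struct example_range memory[] = {
                { 16, 256 }, { 512, 1000 },
        };
        unsigned long next = 0, gap_pages = 0, max_pfn = 1024;
        unsigned int i;

        for (i = 0; i < sizeof(memory) / sizeof(memory[0]); i++) {
                if (next < memory[i].start_pfn)
                        gap_pages += memory[i].start_pfn - next;
                next = memory[i].end_pfn;
        }
        gap_pages += max_pfn - next;    /* trailing gap up to max_pfn */

        printf("pfns outside memory ranges: %lu\n", gap_pages); /* 296 */
        return 0;
}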
6865 
6866 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6867 
6868 #if MAX_NUMNODES > 1
6869 /*
6870  * Figure out the number of possible node ids.
6871  */
6872 void __init setup_nr_node_ids(void)
6873 {
6874         unsigned int highest;
6875 
6876         highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
6877         nr_node_ids = highest + 1;
6878 }
6879 #endif
6880 
6881 /**
6882  * node_map_pfn_alignment - determine the maximum internode alignment
6883  *
6884  * This function should be called after node map is populated and sorted.
6885  * It calculates the maximum power of two alignment which can distinguish
6886  * all the nodes.
6887  *
6888  * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
6889  * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
6890  * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is
6891  * shifted, 1GiB is enough and this function will indicate so.
6892  *
6893  * This is used to test whether pfn -> nid mapping of the chosen memory
6894  * model has fine enough granularity to avoid incorrect mapping for the
6895  * populated node map.
6896  *
6897  * Return: the determined alignment in PFNs.  0 if there is no alignment
6898  * requirement (single node).
6899  */
6900 unsigned long __init node_map_pfn_alignment(void)
6901 {
6902         unsigned long accl_mask = 0, last_end = 0;
6903         unsigned long start, end, mask;
6904         int last_nid = NUMA_NO_NODE;
6905         int i, nid;
6906 
6907         for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
6908                 if (!start || last_nid < 0 || last_nid == nid) {
6909                         last_nid = nid;
6910                         last_end = end;
6911                         continue;
6912                 }
6913 
6914                 /*
6915                  * Start with a mask granular enough to pin-point to the
6916                  * start pfn and tick off bits one-by-one until it becomes
6917                  * too coarse to separate the current node from the last.
6918                  */
6919                 mask = ~((1 << __ffs(start)) - 1);
6920                 while (mask && last_end <= (start & (mask << 1)))
6921                         mask <<= 1;
6922 
6923                 /* accumulate all internode masks */
6924                 accl_mask |= mask;
6925         }
6926 
6927         /* convert mask to number of pages */
6928         return ~accl_mask + 1;
6929 }
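/*
 * Illustrative user-space sketch (not part of page_alloc.c): stand-alone
 * version of the alignment computation above, fed two made-up node layouts.
 * With 4K pages, 1GiB is 262144 PFNs and 256MiB is 65536 PFNs.
 */
#include <stdio.h>

struct example_range {
        unsigned long start;
        unsigned long end;
        int nid;
};

static unsigned long example_pfn_alignment(const struct example_range *r,
                                           int nr)
{
        unsigned long accl_mask = 0, last_end = 0, mask;
        int last_nid = -1, i;

        for (i = 0; i < nr; i++) {
                unsigned long start = r[i].start;

                if (!start || last_nid < 0 || last_nid == r[i].nid) {
                        last_nid = r[i].nid;
                        last_end = r[i].end;
                        continue;
                }

                /*
                 * Finest mask that still pin-points the start pfn, coarsened
                 * until it can no longer separate this node from the last.
                 */
                mask = ~((start & ~(start - 1)) - 1);
                while (mask && last_end <= (start & (mask << 1)))
                        mask <<= 1;

                accl_mask |= mask;
        }

        return ~accl_mask + 1;
}

int main(void)
{
        /* two 1GiB nodes, back to back and 1GiB aligned */
        static const struct example_range aligned[] = {
                { 0, 262144, 0 }, { 262144, 524288, 1 },
        };
        /* the same two nodes, both shifted up by 256MiB */
        static const struct example_range shifted[] = {
                { 65536, 327680, 0 }, { 327680, 589824, 1 },
        };

        printf("aligned layout: %lu pfns\n",
               example_pfn_alignment(aligned, 2));      /* 262144 = 1GiB */
        printf("shifted layout: %lu pfns\n",
               example_pfn_alignment(shifted, 2));      /* 65536 = 256MiB */
        return 0;
}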
6930 
6931 /* Find the lowest pfn for a node */
6932 static unsigned long __init find_min_pfn_for_node(int nid)
6933 {
6934         unsigned long min_pfn = ULONG_MAX;
6935         unsigned long start_pfn;
6936         int i;
6937 
6938         for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
6939                 min_pfn = min(min_pfn, start_pfn);
6940 
6941         if (min_pfn == ULONG_MAX) {
6942                 pr_warn("Could not find start_pfn for node %d\n", nid);
6943                 return 0;
6944         }
6945 
6946         return min_pfn;
6947 }
6948 
6949 /**
6950  * find_min_pfn_with_active_regions - Find the minimum PFN registered
6951  *
6952  * Return: the minimum PFN based on information provided via
6953  * memblock_set_node().
6954  */
6955 unsigned long __init find_min_pfn_with_active_regions(void)
6956 {
6957         return find_min_pfn_for_node(MAX_NUMNODES);
6958 }
6959 
6960 /*
6961  * early_calculate_totalpages()
6962  * Sum pages in active regions for movable zone.
6963  * Populate N_MEMORY for calculating usable_nodes.
6964  */
6965 static unsigned long __init early_calculate_totalpages(void)
6966 {
6967         unsigned long totalpages = 0;
6968         unsigned long start_pfn, end_pfn;
6969         int i, nid;
6970 
6971         for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
6972                 unsigned long pages = end_pfn - start_pfn;
6973 
6974                 totalpages += pages;
6975                 if (pages)
6976                         node_set_state(nid, N_MEMORY);
6977         }
6978         return totalpages;
6979 }
6980 
6981 /*
6982  * Find the PFN at which the Movable zone begins in each node. Kernel
6983  * memory is spread evenly between nodes as long as the nodes have enough
6984  * memory. When they don't, some nodes will have more kernelcore than
6985  * others.
6986  */
6987 static void