/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>

#include <asm/tlbflush.h>

#include "internal.h"

/*
 * online_page_callback contains a pointer to the current page onlining
 * function. Initially it is generic_online_page(). If required, it can be
 * changed by calling set_online_page_callback() for callback registration
 * and restore_online_page_callback() to restore the generic callback.
 */

static void generic_online_page(struct page *page);

static online_page_callback_t online_page_callback = generic_online_page;
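
/*
 * Illustrative sketch of the callback mechanism described above: a
 * ballooning driver that wants to intercept newly onlined pages registers
 * its handler and later restores the default one (hv_online_page() is only
 * a stand-in name for whatever handler the driver provides):
 *
 *	set_online_page_callback(&hv_online_page);
 *	...
 *	restore_online_page_callback(&hv_online_page);
 *
 * While the handler is registered, online_pages_range() hands every onlined
 * page to it instead of generic_online_page() freeing the page into the
 * buddy allocator.
 */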

DEFINE_MUTEX(mem_hotplug_mutex);

void lock_memory_hotplug(void)
{
	mutex_lock(&mem_hotplug_mutex);

	/* for exclusive hibernation if CONFIG_HIBERNATION=y */
	lock_system_sleep();
}

void unlock_memory_hotplug(void)
{
	unlock_system_sleep();
	mutex_unlock(&mem_hotplug_mutex);
}


/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;
	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	BUG_ON(!res);

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;
	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	if (request_resource(&iomem_resource, res) < 0) {
		printk("System RAM resource %pR cannot be added\n", res);
		kfree(res);
		res = NULL;
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
	return;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
void get_page_bootmem(unsigned long info, struct page *page,
		      unsigned long type)
{
	page->lru.next = (struct list_head *) type;
	SetPagePrivate(page);
	set_page_private(page, info);
	atomic_inc(&page->_count);
}

/* reference to __meminit __free_pages_bootmem is valid
 * so use __ref to tell modpost not to generate a warning */
void __ref put_page_bootmem(struct page *page)
{
	unsigned long type;
	static DEFINE_MUTEX(ppb_lock);

	type = (unsigned long) page->lru.next;
	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

	if (atomic_dec_return(&page->_count) == 1) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);

		/*
		 * Please refer to comment for __free_pages_bootmem()
		 * for why we serialize here.
		 */
		mutex_lock(&ppb_lock);
		__free_pages_bootmem(page, 0);
		mutex_unlock(&ppb_lock);
		totalram_pages++;
	}

}

#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	if (!pfn_valid(start_pfn))
		return;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;
	struct zone *zone;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	zone = &pgdat->node_zones[0];
	for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
		if (zone->wait_table) {
			nr_pages = zone->wait_table_hash_nr_entries
				* sizeof(wait_queue_head_t);
			nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
			page = virt_to_page(zone->wait_table);

			for (i = 0; i < nr_pages; i++, page++)
				get_page_bootmem(node, page, NODE_INFO);
		}
	}

	pfn = pgdat->node_start_pfn;
	end_pfn = pgdat_end_pfn(pgdat);

	/* register_section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes - on
		 * node0 as well as nodeN. To avoid registering a pfn against
		 * multiple nodes we check that this pfn does not already
		 * reside in some other node.
		 */
		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
	}
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
			   unsigned long end_pfn)
{
	unsigned long old_zone_end_pfn;

	zone_span_writelock(zone);

	old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
				zone->zone_start_pfn;

	zone_span_writeunlock(zone);
}

static void resize_zone(struct zone *zone, unsigned long start_pfn,
		unsigned long end_pfn)
{
	zone_span_writelock(zone);

	if (end_pfn - start_pfn) {
		zone->zone_start_pfn = start_pfn;
		zone->spanned_pages = end_pfn - start_pfn;
	} else {
		/*
		 * make it consistent with free_area_init_core():
		 * if spanned_pages == 0, then keep start_pfn == 0
		 */
		zone->zone_start_pfn = 0;
		zone->spanned_pages = 0;
	}

	zone_span_writeunlock(zone);
}

static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
		unsigned long end_pfn)
{
	enum zone_type zid = zone_idx(zone);
	int nid = zone->zone_pgdat->node_id;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++)
		set_page_links(pfn_to_page(pfn), zid, nid, pfn);
}

/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
 * alloc_bootmem_node_nopanic() */
static int __ref ensure_zone_is_initialized(struct zone *zone,
			unsigned long start_pfn, unsigned long num_pages)
{
	if (!zone_is_initialized(zone))
		return init_currently_empty_zone(zone, start_pfn, num_pages,
						 MEMMAP_HOTPLUG);
	return 0;
}

static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z1_start_pfn;

	ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are higher than @z2 */
	if (end_pfn > zone_end_pfn(z2))
		goto out_fail;
	/* the moved-out part must be at the leftmost of @z2 */
	if (start_pfn > z2->zone_start_pfn)
		goto out_fail;
	/* must include/overlap */
	if (end_pfn <= z2->zone_start_pfn)
		goto out_fail;

	/* use start_pfn for z1's start_pfn if z1 is empty */
	if (z1->spanned_pages)
		z1_start_pfn = z1->zone_start_pfn;
	else
		z1_start_pfn = start_pfn;

	resize_zone(z1, z1_start_pfn, end_pfn);
	resize_zone(z2, end_pfn, zone_end_pfn(z2));

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z1, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z2_end_pfn;

	ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are lower than @z1 */
	if (z1->zone_start_pfn > start_pfn)
		goto out_fail;
	/* the moved-out part must be at the rightmost of @z1 */
	if (zone_end_pfn(z1) > end_pfn)
		goto out_fail;
	/* must include/overlap */
	if (start_pfn >= zone_end_pfn(z1))
		goto out_fail;

	/* use end_pfn for z2's end_pfn if z2 is empty */
	if (z2->spanned_pages)
		z2_end_pfn = zone_end_pfn(z2);
	else
		z2_end_pfn = end_pfn;

	resize_zone(z1, z1->zone_start_pfn, start_pfn);
	resize_zone(z2, start_pfn, z2_end_pfn);

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z2, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
			    unsigned long end_pfn)
{
	unsigned long old_pgdat_end_pfn =
		pgdat->node_start_pfn + pgdat->node_spanned_pages;

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
					pgdat->node_start_pfn;
}

static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int nid = pgdat->node_id;
	int zone_type;
	unsigned long flags;
	int ret;

	zone_type = zone - pgdat->node_zones;
	ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
	if (ret)
		return ret;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
			phys_start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
	memmap_init_zone(nr_pages, nid, zone_type,
			 phys_start_pfn, MEMMAP_HOTPLUG);
	return 0;
}

static int __meminit __add_section(int nid, struct zone *zone,
					unsigned long phys_start_pfn)
{
	int nr_pages = PAGES_PER_SECTION;
	int ret;

	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);

	if (ret < 0)
		return ret;

	ret = __add_zone(zone, phys_start_pfn);

	if (ret < 0)
		return ret;

	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}

/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static int find_smallest_section_pfn(int nid, struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long end_pfn)
{
	struct mem_section *ms;

	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(start_pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(start_pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
			continue;

		return start_pfn;
	}

	return 0;
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
static int find_biggest_section_pfn(int nid, struct zone *zone,
				    unsigned long start_pfn,
				    unsigned long end_pfn)
{
	struct mem_section *ms;
	unsigned long pfn;

	/* pfn is the end pfn of a memory section. */
	pfn = end_pfn - 1;
	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(pfn)))
			continue;

		return pfn;
	}

	return 0;
}

static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	unsigned long zone_start_pfn = zone->zone_start_pfn;
	unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = zone_to_nid(zone);

	zone_span_writelock(zone);
	if (zone_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the zone, we need
		 * to shrink zone->zone_start_pfn and zone->spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
						zone_end_pfn);
		if (pfn) {
			zone->zone_start_pfn = pfn;
			zone->spanned_pages = zone_end_pfn - pfn;
		}
	} else if (zone_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the zone, we need
		 * to shrink zone->spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
					       start_pfn);
		if (pfn)
			zone->spanned_pages = pfn - zone_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the zone, it only creates a hole in the zone, so we need not
	 * change the zone. But perhaps the zone now spans only holes, so
	 * check whether the zone still has any valid section.
	 */
	pfn = zone_start_pfn;
	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (page_zone(pfn_to_page(pfn)) != zone)
			continue;

		/* If the section is the current section, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* If we find a valid section, we have nothing to do */
		zone_span_writeunlock(zone);
		return;
	}

	/* The zone has no valid section */
	zone->zone_start_pfn = 0;
	zone->spanned_pages = 0;
	zone_span_writeunlock(zone);
}

static void shrink_pgdat_span(struct pglist_data *pgdat,
			      unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
	unsigned long pgdat_end_pfn =
		pgdat->node_start_pfn + pgdat->node_spanned_pages;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = pgdat->node_id;

	if (pgdat_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the pgdat, we need
		 * to shrink pgdat->node_start_pfn and
		 * pgdat->node_spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the pgdat.
		 */
		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
						pgdat_end_pfn);
		if (pfn) {
			pgdat->node_start_pfn = pfn;
			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
		}
	} else if (pgdat_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the pgdat, we need
		 * to shrink pgdat->node_spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the pgdat.
		 */
		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
					       start_pfn);
		if (pfn)
			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the pgdat, it only creates a hole in the pgdat, so we need not
	 * change the pgdat. But perhaps the pgdat now spans only holes, so
	 * check whether the pgdat still has any valid section.
	 */
	pfn = pgdat_start_pfn;
	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/* If the section is the current section, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* If we find a valid section, we have nothing to do */
		return;
	}

	/* The pgdat has no valid section */
	pgdat->node_start_pfn = 0;
	pgdat->node_spanned_pages = 0;
}

static void __remove_zone(struct zone *zone, unsigned long start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int zone_type;
	unsigned long flags;

	zone_type = zone - pgdat->node_zones;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
}

static int __remove_section(struct zone *zone, struct mem_section *ms)
{
	unsigned long start_pfn;
	int scn_nr;
	int ret = -EINVAL;

	if (!valid_section(ms))
		return ret;

	ret = unregister_memory_section(ms);
	if (ret)
		return ret;

	scn_nr = __section_nr(ms);
	start_pfn = section_nr_to_pfn(scn_nr);
	__remove_zone(zone, start_pfn);

	sparse_remove_one_section(zone, ms);
	return 0;
}

/*
 * Reasonably generic function for adding memory.  It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
			unsigned long nr_pages)
{
	unsigned long i;
	int err = 0;
	int start_sec, end_sec;
	/* during mem_map initialization, align the hot-added range to sections */
	start_sec = pfn_to_section_nr(phys_start_pfn);
	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

	for (i = start_sec; i <= end_sec; i++) {
		err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);

		/*
		 * EEXIST is finally dealt with by the ioresource collision
		 * check; see add_memory() => register_memory_resource().
		 * A warning will be printed if there is a collision.
		 */
		if (err && (err != -EEXIST))
			break;
		err = 0;
	}

	return err;
}
EXPORT_SYMBOL_GPL(__add_pages);
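
/*
 * Illustrative sketch of the arch side mentioned above: a typical
 * arch_add_memory() implementation picks a zone and hands the
 * section-aligned range down to __add_pages(), roughly like this (the
 * ZONE_NORMAL choice is a simplification; real implementations usually
 * also extend the direct mapping first):
 *
 *	int arch_add_memory(int nid, u64 start, u64 size)
 *	{
 *		struct pglist_data *pgdat = NODE_DATA(nid);
 *		struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
 *		unsigned long start_pfn = start >> PAGE_SHIFT;
 *		unsigned long nr_pages = size >> PAGE_SHIFT;
 *
 *		return __add_pages(nid, zone, start_pfn, nr_pages);
 *	}
 */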

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
		 unsigned long nr_pages)
{
	unsigned long i, ret = 0;
	int sections_to_remove;

	/*
	 * We can only remove entire sections
	 */
	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
	BUG_ON(nr_pages % PAGES_PER_SECTION);

	release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);

	sections_to_remove = nr_pages / PAGES_PER_SECTION;
	for (i = 0; i < sections_to_remove; i++) {
		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
		ret = __remove_section(zone, __pfn_to_section(pfn));
		if (ret)
			break;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);

int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	lock_memory_hotplug();

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	unlock_memory_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	lock_memory_hotplug();

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	unlock_memory_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

void __online_page_set_limits(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);

	if (pfn >= num_physpages)
		num_physpages = pfn + 1;
}
EXPORT_SYMBOL_GPL(__online_page_set_limits);

void __online_page_increment_counters(struct page *page)
{
	totalram_pages++;

#ifdef CONFIG_HIGHMEM
	if (PageHighMem(page))
		totalhigh_pages++;
#endif
}
EXPORT_SYMBOL_GPL(__online_page_increment_counters);

void __online_page_free(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
}
EXPORT_SYMBOL_GPL(__online_page_free);

static void generic_online_page(struct page *page)
{
	__online_page_set_limits(page);
	__online_page_increment_counters(page);
	__online_page_free(page);
}

static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			void *arg)
{
	unsigned long i;
	unsigned long onlined_pages = *(unsigned long *)arg;
	struct page *page;
	if (PageReserved(pfn_to_page(start_pfn)))
		for (i = 0; i < nr_pages; i++) {
			page = pfn_to_page(start_pfn + i);
			(*online_page_callback)(page);
			onlined_pages++;
		}
	*(unsigned long *)arg = onlined_pages;
	return 0;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
 * normal memory.
 */
static bool can_online_high_movable(struct zone *zone)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure every online node has NORMAL memory */
static bool can_online_high_movable(struct zone *zone)
{
	return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which state of node_states will be changed when onlining memory */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);
	enum zone_type zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * if the memory to be onlined is in a zone of 0...zone_last, and
	 * the zones of 0...zone_last don't have memory before online, we will
	 * need to set the node to node_states[N_NORMAL_MEMORY] after
	 * the memory is online.
	 */
	if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * if the node doesn't have memory before online, we will need to
	 * set the node to node_states[N_MEMORY] after the memory
	 * is online.
	 */
	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	else
		arg->status_change_nid = -1;
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	node_set_state(node, N_MEMORY);
}


int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int nid;
	int ret;
	struct memory_notify arg;

	lock_memory_hotplug();
	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_mutex.
	 */
	zone = page_zone(pfn_to_page(pfn));

	if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
	    !can_online_high_movable(zone)) {
		unlock_memory_hotplug();
		return -1;
	}

	if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
		if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
			unlock_memory_hotplug();
			return -1;
		}
	}
	if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
		if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
			unlock_memory_hotplug();
			return -1;
		}
	}

	/* Previous code may have changed the zone of the pfn range */
	zone = page_zone(pfn_to_page(pfn));

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	nid = page_to_nid(pfn_to_page(pfn));

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret) {
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		unlock_memory_hotplug();
		return ret;
	}
	/*
	 * If this zone is not populated, then it is not in the zonelist.
	 * This means the page allocator ignores this zone.
	 * So, the zonelist must be updated after online.
	 */
	mutex_lock(&zonelists_mutex);
	if (!populated_zone(zone)) {
		need_zonelists_rebuild = 1;
		build_all_zonelists(NULL, zone);
	}

	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	if (ret) {
		if (need_zonelists_rebuild)
			zone_pcp_reset(zone);
		mutex_unlock(&zonelists_mutex);
		printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
		       (unsigned long long) pfn << PAGE_SHIFT,
		       (((unsigned long long) pfn + nr_pages)
			    << PAGE_SHIFT) - 1);
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		unlock_memory_hotplug();
		return ret;
	}

	zone->managed_pages += onlined_pages;
	zone->present_pages += onlined_pages;
	zone->zone_pgdat->node_present_pages += onlined_pages;
	if (onlined_pages) {
		node_states_set_node(zone_to_nid(zone), &arg);
		if (need_zonelists_rebuild)
			build_all_zonelists(NULL, NULL);
		else
			zone_pcp_update(zone);
	}

	mutex_unlock(&zonelists_mutex);

	init_per_zone_wmark_min();

	if (onlined_pages)
		kswapd_run(zone_to_nid(zone));

	vm_total_pages = nr_free_pagecache_pages();

	writeback_set_ratelimit();

	if (onlined_pages)
		memory_notify(MEM_ONLINE, &arg);
	unlock_memory_hotplug();

	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long start_pfn = start >> PAGE_SHIFT;

	pgdat = NODE_DATA(nid);
	if (!pgdat) {
		pgdat = arch_alloc_nodedata(nid);
		if (!pgdat)
			return NULL;

		arch_refresh_nodedata(nid, pgdat);
	}

	/* we can use NODE_DATA(nid) from here */

	/* init node's zones as empty zones, we don't have any present pages.*/
	free_area_init_node(nid, zones_size, start_pfn, zholes_size);

	/*
	 * The node we allocated has no zone fallback lists. To avoid
	 * accessing a not-initialized zonelist, build one here.
	 */
	mutex_lock(&zonelists_mutex);
	build_all_zonelists(pgdat, NULL);
	mutex_unlock(&zonelists_mutex);

	return pgdat;
}

static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	arch_free_nodedata(pgdat);
	return;
}


/*
 * called by cpu_up() to online a node without onlined memory.
 */
int mem_online_node(int nid)
{
	pg_data_t *pgdat;
	int ret;

	lock_memory_hotplug();
	pgdat = hotadd_new_pgdat(nid, 0);
	if (!pgdat) {
		ret = -ENOMEM;
		goto out;
	}
	node_set_online(nid);
	ret = register_one_node(nid);
	BUG_ON(ret);

out:
	unlock_memory_hotplug();
	return ret;
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory(int nid, u64 start, u64 size)
{
	pg_data_t *pgdat = NULL;
	bool new_pgdat;
	bool new_node;
	struct resource *res;
	int ret;

	lock_memory_hotplug();

	res = register_memory_resource(start, size);
	ret = -EEXIST;
	if (!res)
		goto out;

	{	/* Stupid hack to suppress address-never-null warning */
		void *p = NODE_DATA(nid);
		new_pgdat = !p;
	}
	new_node = !node_online(nid);
	if (new_node) {
		pgdat = hotadd_new_pgdat(nid, start);
		ret = -ENOMEM;
		if (!pgdat)
			goto error;
	}

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size);

	if (ret < 0)
		goto error;

	/* we online the node here. we can't roll back from here. */
	node_set_online(nid);

	if (new_node) {
		ret = register_one_node(nid);
		/*
		 * If the sysfs file of the new node can't be created, CPUs
		 * on the node can't be hot-added. There is no rollback way
		 * now. So, check by BUG_ON() to catch it reluctantly..
		 */
		BUG_ON(ret);
	}

	/* create new memmap entry */
	firmware_map_add_hotplug(start, start + size, "System RAM");

	goto out;

error:
	/* rollback pgdat allocation and others */
	if (new_pgdat)
		rollback_node_hotadd(nid, pgdat);
	release_memory_resource(res);

out:
	unlock_memory_hotplug();
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
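
/*
 * Usage sketch: add_memory() is the entry point a hotplug driver (for
 * example the ACPI memory hotplug driver) is expected to use once it has
 * probed a new range; the nid lookup shown here is only one plausible way
 * to obtain the node id:
 *
 *	int nid = memory_add_physaddr_to_nid(start);
 *	int ret = add_memory(nid, start, size);
 *
 *	if (ret)
 *		return ret;
 *
 * The new pages become usable only after they are onlined, e.g. from user
 * space via the memory block "state" sysfs attribute, which ends up in
 * online_pages() above.
 */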

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order(). Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock
 */
static inline int pageblock_free(struct page *page)
{
	return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
	/* Ensure the starting page is pageblock-aligned */
	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));

	/* If the entire pageblock is free, move to the end of free page */
	if (pageblock_free(page)) {
		int order;
		/* be careful. we don't have locks, page_order can be changed.*/
		order = page_order(page);
		if ((order < MAX_ORDER) && (order >= pageblock_order))
			return page + (1 << order);
	}

	return page + pageblock_nr_pages;
}

/* Checks if this range of memory is likely to be hot-removable. */
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
	struct page *page = pfn_to_page(start_pfn);
	struct page *end_page = page + nr_pages;

	/* Check the starting page of each pageblock within the range */
	for (; page < end_page; page = next_active_pageblock(page)) {
		if (!is_pageblock_removable_nolock(page))
			return 0;
		cond_resched();
	}

	/* All pageblocks in the memory block are likely to be hot-removable */
	return 1;
}

/*
 * Confirm that all pages in the range [start, end) belong to the same zone.
 */
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct zone *zone = NULL;
	struct page *page;
	int i;
	for (pfn = start_pfn;
	     pfn < end_pfn;
	     pfn += MAX_ORDER_NR_PAGES) {
		i = 0;
		/* This is just a CONFIG_HOLES_IN_ZONE check. */
		while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
			i++;
		if (i == MAX_ORDER_NR_PAGES)
			continue;
		page = pfn_to_page(pfn + i);
		if (zone && page_zone(page) != zone)
			return 0;
		zone = page_zone(page);
	}
	return 1;
}

/*
 * Scanning pfns is much easier than scanning the lru list.
 * Scan pfns from start to end and find the first LRU page.
 */
static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;
	struct page *page;
	for (pfn = start; pfn < end; pfn++) {
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageLRU(page))
				return pfn;
		}
	}
	return 0;
}

#define NR_OFFLINE_AT_ONCE_PAGES	(256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
	int not_managed = 0;
	int ret = 0;
	LIST_HEAD(source);

	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		if (!get_page_unless_zero(page))
			continue;
		/*
		 * We can skip free pages. And we can only deal with pages on
		 * LRU.
		 */
		ret = isolate_lru_page(page);
		if (!ret) { /* Success */
			put_page(page);
			list_add_tail(&page->lru, &source);
			move_pages--;
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));

		} else {
#ifdef CONFIG_DEBUG_VM
			printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
			       pfn);
			dump_page(page);
#endif
			put_page(page);
			/* Because we don't have the big zone->lock, we should
			   check this again here. */
			if (page_count(page)) {
				not_managed++;
				ret = -EBUSY;
				break;
			}
		}
	}
	if (!list_empty(&source)) {
		if (not_managed) {
			putback_lru_pages(&source);
			goto out;
		}

		/*
		 * alloc_migrate_target should be improooooved!!
		 * migrate_pages returns # of failed pages.
		 */
		ret = migrate_pages(&source, alloc_migrate_target, 0,
					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
		if (ret)
			putback_lru_pages(&source);
	}
out:
	return ret;
}

/*
 * remove from free_area[] and mark all as Reserved.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			void *data)
{
	__offline_isolated_pages(start, start + nr_pages);
	return 0;
}

static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
				offline_isolated_pages_cb);
}

/*
 * Check that all pages in the range, recorded as a memory resource, are
 * isolated.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
			void *data)
{
	int ret;
	long offlined = *(long *)data;
	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
	offlined = nr_pages;
	if (!ret)
		*(long *)data += offlined;
	return ret;
}

static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
	long offlined = 0;
	int ret;

	ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
			check_pages_isolated_cb);
	if (ret < 0)
		offlined = (long)ret;
	return offlined;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
 * normal memory.
 */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure the node has NORMAL memory if it is still online */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt;

	for (zt = 0; zt <= ZONE_NORMAL; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	if (present_pages > nr_pages)
		return true;

	present_pages = 0;
	for (; zt <= ZONE_MOVABLE; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	/*
	 * we can't offline the last normal memory until all
	 * higher memory is offlined.
	 */
	return present_pages == 0;
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which state of node_states will be changed when offlining memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
		struct zone *zone, struct memory_notify *arg)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt, zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * check whether node_states[N_NORMAL_MEMORY] will be changed.
	 * If the memory to be offlined is in a zone of 0...zone_last,
	 * and it is the last present memory, 0...zone_last will
	 * become empty after offline, thus we can determine that we will
	 * need to clear the node from node_states[N_NORMAL_MEMORY].
	 */
	for (zt = 0; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_normal = zone_to_nid(zone);
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_high = zone_to_nid(zone);
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
	 */
	zone_last = ZONE_MOVABLE;

	/*
	 * check whether node_states[N_HIGH_MEMORY] will be changed.
	 * If we try to offline the last present @nr_pages from the node,
	 * we can determine that we will need to clear the node from
	 * node_states[N_HIGH_MEMORY].
	 */
	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (nr_pages >= present_pages)
		arg->status_change_nid = zone_to_nid(zone);
	else
		arg->status_change_nid = -1;
}

static void node_states_clear_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_clear_state(node, N_NORMAL_MEMORY);

	if ((N_MEMORY != N_NORMAL_MEMORY) &&
	    (arg->status_change_nid_high >= 0))
		node_clear_state(node, N_HIGH_MEMORY);

	if ((N_MEMORY != N_HIGH_MEMORY) &&
	    (arg->status_change_nid >= 0))
		node_clear_state(node, N_MEMORY);
}

static int __ref __offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn, unsigned long timeout)
{
	unsigned long pfn, nr_pages, expire;
	long offlined_pages;
	int ret, drain, retry_max, node;
	struct zone *zone;
	struct memory_notify arg;

	BUG_ON(start_pfn >= end_pfn);
	/* at least, alignment against pageblock is necessary */
	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
		return -EINVAL;
	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
		return -EINVAL;
	/* This makes hotplug much easier...and readable.
	   we assume this for now. */
	if (!test_pages_in_a_zone(start_pfn, end_pfn))
		return -EINVAL;

	lock_memory_hotplug();

	zone = page_zone(pfn_to_page(start_pfn));
	node = zone_to_nid(zone);
	nr_pages = end_pfn - start_pfn;

	ret = -EINVAL;
	if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
		goto out;

	/* set above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE, true);
	if (ret)
		goto out;

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_offline(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_removal;

	pfn = start_pfn;
	expire = jiffies + timeout;
	drain = 0;
	retry_max = 5;
repeat:
	/* start memory hot removal */
	ret = -EAGAIN;
	if (time_after(jiffies, expire))
		goto failed_removal;
	ret = -EINTR;
	if (signal_pending(current))
		goto failed_removal;
	ret = 0;
	if (drain) {
		lru_add_drain_all();
		cond_resched();
		drain_all_pages();
	}

	pfn = scan_lru_pages(start_pfn, end_pfn);
	if (pfn) { /* We have a page on LRU */
		ret = do_migrate_range(pfn, end_pfn);
		if (!ret) {
			drain = 1;
			goto repeat;
		} else {
			if (ret < 0)
				if (--retry_max == 0)
					goto failed_removal;
			yield();
			drain = 1;
			goto repeat;
		}
	}
	/* drain all zone's lru pagevec, this is asynchronous... */
	lru_add_drain_all();
	yield();
	/* drain pcp pages, this is synchronous. */
	drain_all_pages();
	/* check again */
	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	if (offlined_pages < 0) {
		ret = -EBUSY;
		goto failed_removal;
	}
	printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
	/* Ok, all of our target is isolated.
	   We cannot do rollback at this point. */
	offline_isolated_pages(start_pfn, end_pfn);
	/* reset pagetype flags and make the migrate type MOVABLE */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	/* removal success */
	zone->managed_pages -= offlined_pages;
	zone->present_pages -= offlined_pages;
	zone->zone_pgdat->node_present_pages -= offlined_pages;
	totalram_pages -= offlined_pages;

	init_per_zone_wmark_min();

	if (!populated_zone(zone)) {
		zone_pcp_reset(zone);
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL, NULL);
		mutex_unlock(&zonelists_mutex);
	} else
		zone_pcp_update(zone);

	node_states_clear_node(node, &arg);
	if (arg.status_change_nid >= 0)
		kswapd_stop(node);

	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();

	memory_notify(MEM_OFFLINE, &arg);
	unlock_memory_hotplug();
	return 0;

failed_removal:
	printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
	       (unsigned long long) start_pfn << PAGE_SHIFT,
	       ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
	/* pushback to free area */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);

out:
	unlock_memory_hotplug();
	return ret;
}

int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
}
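
/*
 * Usage sketch: offline_pages() is what the memory block "state" sysfs
 * attribute ends up calling, one memory block at a time, roughly
 * (sections_per_block belongs to drivers/base/memory.c and is shown here
 * only for illustration):
 *
 *	start_pfn = section_nr_to_pfn(mem->start_section_nr);
 *	ret = offline_pages(start_pfn,
 *			    sections_per_block * PAGES_PER_SECTION);
 *
 * The 120 * HZ timeout above bounds how long __offline_pages() keeps
 * retrying migration before giving up with -EAGAIN.
 */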

/**
 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
 * @start_pfn: start pfn of the memory range
 * @end_pfn: end pfn of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present mem sections in the range
 * [start_pfn, end_pfn) and calls func on each mem section.
 *
 * Returns the return value of func.
 */
static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
		void *arg, int (*func)(struct memory_block *, void *))
{
	struct memory_block *mem = NULL;
	struct mem_section *section;
	unsigned long pfn, section_nr;
	int ret;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		section_nr = pfn_to_section_nr(pfn);
		if (!present_section_nr(section_nr))
			continue;

		section = __nr_to_section(section_nr);
		/* same memblock? */
		if (mem)
			if ((section_nr >= mem->start_section_nr) &&
			    (section_nr <= mem->end_section_nr))
				continue;

		mem = find_memory_block_hinted(section, mem);
		if (!mem)
			continue;

		ret = func(mem, arg);
		if (ret) {
			kobject_put(&mem->dev.kobj);
			return ret;
		}
	}

	if (mem)
		kobject_put(&mem->dev.kobj);

	return 0;
}

/**
 * offline_memory_block_cb - callback function for offlining memory block
 * @mem: the memory block to be offlined
 * @arg: buffer to hold error msg
 *
 * Always return 0, and put the error msg in arg if any.
 */
static int offline_memory_block_cb(struct memory_block *mem, void *arg)
{
	int *ret = arg;
	int error = offline_memory_block(mem);

	if (error != 0 && *ret == 0)
		*ret = error;

	return 0;
}

static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
	int ret = !is_memblock_offlined(mem);

	if (unlikely(ret))
		pr_warn("removing memory fails, because memory "
			"[%#010llx-%#010llx] is onlined\n",
			PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)),
			PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1);

	return ret;
}

static int check_cpu_on_node(void *data)
{
	struct pglist_data *pgdat = data;
	int cpu;

	for_each_present_cpu(cpu) {
		if (cpu_to_node(cpu) == pgdat->node_id)
			/*
			 * the cpu on this node isn't removed, and we can't
			 * offline this node.
			 */
			return -EBUSY;
	}

	return 0;
}

static void unmap_cpu_on_node(void *data)
{
#ifdef CONFIG_ACPI_NUMA
	struct pglist_data *pgdat = data;
	int cpu;

	for_each_possible_cpu(cpu)
		if (cpu_to_node(cpu) == pgdat->node_id)
			numa_clear_node(cpu);
#endif
}

static int check_and_unmap_cpu_on_node(void *data)
{
	int ret = check_cpu_on_node(data);

	if (ret)
		return ret;

	/*
	 * the node will be offlined when we come here, so we can clear
	 * the cpu_to_node() now.
	 */

	unmap_cpu_on_node(data);
	return 0;
}

/* offline the node if all memory sections of this node are removed */
void try_offline_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	unsigned long start_pfn = pgdat->node_start_pfn;
	unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
	unsigned long pfn;
	struct page *pgdat_page = virt_to_page(pgdat);
	int i;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);

		if (!present_section_nr(section_nr))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/*
		 * some memory sections of this node are not removed, and we
		 * can't offline the node now.
		 */
		return;
	}

	if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL))
		return;

	/*
	 * all memory/cpu of this node are removed, we can offline this
	 * node now.
	 */
	node_set_offline(nid);
	unregister_one_node(nid);

	if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
		/* node data is allocated from boot memory */
		return;

	/* free waittable in each zone */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zone *zone = pgdat->node_zones + i;

		/*
		 * wait_table may be allocated from boot memory,
		 * here only free it if it's allocated by vmalloc.
		 */
		if (is_vmalloc_addr(zone->wait_table))
			vfree(zone->wait_table);
	}

	/*
	 * Since there is no way to guarantee the address of pgdat/zone is not
	 * on the stack of any kernel threads or used by other kernel objects
	 * without reference counting or other synchronizing method, do not
	 * reset node_data and free pgdat here. Just reset it to 0 and reuse
	 * the memory when the node is online again.
	 */
	memset(pgdat, 0, sizeof(*pgdat));
}
EXPORT_SYMBOL(try_offline_node);

int __ref remove_memory(int nid, u64 start, u64 size)
{
	unsigned long start_pfn, end_pfn;
	int ret = 0;
	int retry = 1;

	start_pfn = PFN_DOWN(start);
	end_pfn = PFN_UP(start + size - 1);

	/*
	 * When CONFIG_MEMCG is on, one memory block may be used by other
	 * blocks to store page cgroup when onlining pages. But we don't know
	 * in what order pages are onlined. So we iterate twice to offline
	 * memory:
	 * 1st iterate: offline every non-primary memory block.
	 * 2nd iterate: offline the primary (i.e. first added) memory block.
	 */
repeat:
	walk_memory_range(start_pfn, end_pfn, &ret,
			  offline_memory_block_cb);
	if (ret) {
		if (!retry)
			return ret;

		retry = 0;
		ret = 0;
		goto repeat;
	}

	lock_memory_hotplug();

	/*
	 * we have offlined all memory blocks like this:
	 *   1. lock memory hotplug
	 *   2. offline a memory block
	 *   3. unlock memory hotplug
	 *
	 * repeat steps 1-3 to offline each memory block. All memory blocks
	 * must be offlined before removing memory. But we don't hold the
	 * lock over the whole operation. So we should check whether all
	 * memory blocks are offlined.
	 */

	ret = walk_memory_range(start_pfn, end_pfn, NULL,
				is_memblock_offlined_cb);
	if (ret) {
		unlock_memory_hotplug();
		return ret;
	}

	/* remove memmap entry */
	firmware_map_remove(start, start + size, "System RAM");

	arch_remove_memory(start, size);

	try_offline_node(nid);

	unlock_memory_hotplug();

	return 0;
}
#else
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	return -EINVAL;
}
int remove_memory(int nid, u64 start, u64 size)
{
	return -EINVAL;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
EXPORT_SYMBOL_GPL(remove_memory);