/*
 * PowerPC64 port by Mike Corrigan and Dave Engebretsen
 *   {mikejc|engebret}@us.ibm.com
 *
 *    Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com>
 *
 * SMP scalability work:
 *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 *    Module name: htab.c
 *
 *    Description:
 *      PowerPC Hashed Page Table functions
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#undef DEBUG
#undef DEBUG_LOW

#include <linux/spinlock.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/sysctl.h>
#include <linux/ctype.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/lmb.h>

#include <asm/processor.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/page.h>
#include <asm/types.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/machdep.h>
#include <asm/prom.h>
#include <asm/abs_addr.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
#include <asm/eeh.h>
#include <asm/tlb.h>
#include <asm/cacheflush.h>
#include <asm/cputable.h>
#include <asm/sections.h>
#include <asm/spu.h>
#include <asm/udbg.h>

#ifdef DEBUG
#define DBG(fmt...) udbg_printf(fmt)
#else
#define DBG(fmt...)
#endif

#ifdef DEBUG_LOW
#define DBG_LOW(fmt...) udbg_printf(fmt)
#else
#define DBG_LOW(fmt...)
#endif

#define KB (1024)
#define MB (1024*KB)
#define GB (1024L*MB)

/*
 * Note:  pte   --> Linux PTE
 *        HPTE  --> PowerPC Hashed Page Table Entry
 *
 * Execution context:
 *   htab_initialize is called with the MMU off (of course), but
 *   the kernel has been copied down to zero so it can directly
 *   reference global data.  At this point it is very difficult
 *   to print debug info.
 *
 */

#ifdef CONFIG_U3_DART
extern unsigned long dart_tablebase;
#endif /* CONFIG_U3_DART */

static unsigned long _SDR1;
struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];

struct hash_pte *htab_address;
unsigned long htab_size_bytes;
unsigned long htab_hash_mask;
int mmu_linear_psize = MMU_PAGE_4K;
int mmu_virtual_psize = MMU_PAGE_4K;
int mmu_vmalloc_psize = MMU_PAGE_4K;
#ifdef CONFIG_SPARSEMEM_VMEMMAP
int mmu_vmemmap_psize = MMU_PAGE_4K;
#endif
int mmu_io_psize = MMU_PAGE_4K;
int mmu_kernel_ssize = MMU_SEGSIZE_256M;
int mmu_highuser_ssize = MMU_SEGSIZE_256M;
u16 mmu_slb_size = 64;
#ifdef CONFIG_HUGETLB_PAGE
unsigned int HPAGE_SHIFT;
#endif
#ifdef CONFIG_PPC_64K_PAGES
int mmu_ci_restrictions;
#endif
#ifdef CONFIG_DEBUG_PAGEALLOC
static u8 *linear_map_hash_slots;
static unsigned long linear_map_hash_count;
static DEFINE_SPINLOCK(linear_map_hash_lock);
#endif /* CONFIG_DEBUG_PAGEALLOC */

/* These are the page size array definitions to be used when none
 * are provided by the firmware.
 */

/* Pre-POWER4 CPUs (4k pages only)
 */
static struct mmu_psize_def mmu_psize_defaults_old[] = {
        [MMU_PAGE_4K] = {
                .shift  = 12,
                .sllp   = 0,
                .penc   = 0,
                .avpnm  = 0,
                .tlbiel = 0,
        },
};

/* POWER4, GPUL, POWER5
 *
 * Support for 16Mb large pages
 */
static struct mmu_psize_def mmu_psize_defaults_gp[] = {
        [MMU_PAGE_4K] = {
                .shift  = 12,
                .sllp   = 0,
                .penc   = 0,
                .avpnm  = 0,
                .tlbiel = 1,
        },
        [MMU_PAGE_16M] = {
                .shift  = 24,
                .sllp   = SLB_VSID_L,
                .penc   = 0,
                .avpnm  = 0x1UL,
                .tlbiel = 0,
        },
};

static unsigned long htab_convert_pte_flags(unsigned long pteflags)
{
        unsigned long rflags = pteflags & 0x1fa;

        /* _PAGE_EXEC -> NOEXEC */
        if ((pteflags & _PAGE_EXEC) == 0)
                rflags |= HPTE_R_N;

        /* PP bits. PAGE_USER is already PP bit 0x2, so we only
         * need to add in 0x1 if it's a read-only user page
         */
        if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) &&
                                         (pteflags & _PAGE_DIRTY)))
                rflags |= 1;

        /* Always add C */
        return rflags | HPTE_R_C;
}

int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
                      unsigned long pstart, unsigned long prot,
                      int psize, int ssize)
{
        unsigned long vaddr, paddr;
        unsigned int step, shift;
        int ret = 0;

        shift = mmu_psize_defs[psize].shift;
        step = 1 << shift;

        prot = htab_convert_pte_flags(prot);

        DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n",
            vstart, vend, pstart, prot, psize, ssize);

        for (vaddr = vstart, paddr = pstart; vaddr < vend;
             vaddr += step, paddr += step) {
                unsigned long hash, hpteg;
                unsigned long vsid = get_kernel_vsid(vaddr, ssize);
                unsigned long va = hpt_va(vaddr, vsid, ssize);
                unsigned long tprot = prot;

                /* Make kernel text executable */
                if (overlaps_kernel_text(vaddr, vaddr + step))
                        tprot &= ~HPTE_R_N;

                hash = hpt_hash(va, shift, ssize);
                hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);

                BUG_ON(!ppc_md.hpte_insert);
                ret = ppc_md.hpte_insert(hpteg, va, paddr, tprot,
                                         HPTE_V_BOLTED, psize, ssize);

                if (ret < 0)
                        break;
#ifdef CONFIG_DEBUG_PAGEALLOC
                if ((paddr >> PAGE_SHIFT) < linear_map_hash_count)
                        linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80;
#endif /* CONFIG_DEBUG_PAGEALLOC */
        }
        return ret < 0 ? ret : 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
static int htab_remove_mapping(unsigned long vstart, unsigned long vend,
                               int psize, int ssize)
{
        unsigned long vaddr;
        unsigned int step, shift;

        shift = mmu_psize_defs[psize].shift;
        step = 1 << shift;

        if (!ppc_md.hpte_removebolted) {
                printk(KERN_WARNING "Platform doesn't implement "
                       "hpte_removebolted\n");
                return -EINVAL;
        }

        for (vaddr = vstart; vaddr < vend; vaddr += step)
                ppc_md.hpte_removebolted(vaddr, psize, ssize);

        return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */

static int __init htab_dt_scan_seg_sizes(unsigned long node,
                                         const char *uname, int depth,
                                         void *data)
{
        char *type = of_get_flat_dt_prop(node, "device_type", NULL);
        u32 *prop;
        unsigned long size = 0;

        /* We are scanning "cpu" nodes only */
        if (type == NULL || strcmp(type, "cpu") != 0)
                return 0;

        prop = (u32 *)of_get_flat_dt_prop(node, "ibm,processor-segment-sizes",
                                          &size);
        if (prop == NULL)
                return 0;
        for (; size >= 4; size -= 4, ++prop) {
                if (prop[0] == 40) {
                        DBG("1T segment support detected\n");
                        cur_cpu_spec->cpu_features |= CPU_FTR_1T_SEGMENT;
                        return 1;
                }
        }
        cur_cpu_spec->cpu_features &= ~CPU_FTR_NO_SLBIE_B;
        return 0;
}

static void __init htab_init_seg_sizes(void)
{
        of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL);
}

static int __init htab_dt_scan_page_sizes(unsigned long node,
                                          const char *uname, int depth,
                                          void *data)
{
        char *type = of_get_flat_dt_prop(node, "device_type", NULL);
        u32 *prop;
        unsigned long size = 0;

        /* We are scanning "cpu" nodes only */
        if (type == NULL || strcmp(type, "cpu") != 0)
                return 0;

        prop = (u32 *)of_get_flat_dt_prop(node,
                                          "ibm,segment-page-sizes", &size);
        if (prop != NULL) {
                DBG("Page sizes from device-tree:\n");
                size /= 4;
                cur_cpu_spec->cpu_features &= ~(CPU_FTR_16M_PAGE);
                while(size > 0) {
                        unsigned int shift = prop[0];
                        unsigned int slbenc = prop[1];
                        unsigned int lpnum = prop[2];
                        unsigned int lpenc = 0;
                        struct mmu_psize_def *def;
                        int idx = -1;

                        size -= 3; prop += 3;
                        while(size > 0 && lpnum) {
                                if (prop[0] == shift)
                                        lpenc = prop[1];
                                prop += 2; size -= 2;
                                lpnum--;
                        }
                        switch(shift) {
                        case 0xc:
                                idx = MMU_PAGE_4K;
                                break;
                        case 0x10:
                                idx = MMU_PAGE_64K;
                                break;
                        case 0x14:
                                idx = MMU_PAGE_1M;
                                break;
                        case 0x18:
                                idx = MMU_PAGE_16M;
                                cur_cpu_spec->cpu_features |= CPU_FTR_16M_PAGE;
                                break;
                        case 0x22:
                                idx = MMU_PAGE_16G;
                                break;
                        }
                        if (idx < 0)
                                continue;
                        def = &mmu_psize_defs[idx];
                        def->shift = shift;
                        if (shift <= 23)
                                def->avpnm = 0;
                        else
                                def->avpnm = (1 << (shift - 23)) - 1;
                        def->sllp = slbenc;
                        def->penc = lpenc;
                        /* We don't know for sure what's up with tlbiel, so
                         * for now we only set it for 4K and 64K pages
                         */
                        if (idx == MMU_PAGE_4K || idx == MMU_PAGE_64K)
                                def->tlbiel = 1;
                        else
                                def->tlbiel = 0;

                        DBG(" %d: shift=%02x, sllp=%04x, avpnm=%08x, "
                            "tlbiel=%d, penc=%d\n",
                            idx, shift, def->sllp, def->avpnm, def->tlbiel,
                            def->penc);
                }
                return 1;
        }
        return 0;
}

#ifdef CONFIG_HUGETLB_PAGE
/* Scan for 16G memory blocks that have been set aside for huge pages
 * and reserve those blocks for 16G huge pages.
 */
static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
                                        const char *uname, int depth,
                                        void *data) {
        char *type = of_get_flat_dt_prop(node, "device_type", NULL);
        unsigned long *addr_prop;
        u32 *page_count_prop;
        unsigned int expected_pages;
        long unsigned int phys_addr;
        long unsigned int block_size;

        /* We are scanning "memory" nodes only */
        if (type == NULL || strcmp(type, "memory") != 0)
                return 0;

        /* This property is the log base 2 of the number of virtual pages that
         * will represent this memory block. */
        page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
        if (page_count_prop == NULL)
                return 0;
        expected_pages = (1 << page_count_prop[0]);
        addr_prop = of_get_flat_dt_prop(node, "reg", NULL);
        if (addr_prop == NULL)
                return 0;
        phys_addr = addr_prop[0];
        block_size = addr_prop[1];
        if (block_size != (16 * GB))
                return 0;
        printk(KERN_INFO "Huge page(16GB) memory: "
                         "addr = 0x%lX size = 0x%lX pages = %d\n",
                         phys_addr, block_size, expected_pages);
        if (phys_addr + (16 * GB) <= lmb_end_of_DRAM()) {
                lmb_reserve(phys_addr, block_size * expected_pages);
                add_gpage(phys_addr, block_size, expected_pages);
        }
        return 0;
}
#endif /* CONFIG_HUGETLB_PAGE */

static void __init htab_init_page_sizes(void)
{
        int rc;

        /* Default to 4K pages only */
        memcpy(mmu_psize_defs, mmu_psize_defaults_old,
               sizeof(mmu_psize_defaults_old));

        /*
         * Try to find the available page sizes in the device-tree
         */
        rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL);
        if (rc != 0)  /* Found */
                goto found;

        /*
         * Not in the device-tree, let's fallback on known size
         * list for 16M capable GP & GR
         */
        if (cpu_has_feature(CPU_FTR_16M_PAGE))
                memcpy(mmu_psize_defs, mmu_psize_defaults_gp,
                       sizeof(mmu_psize_defaults_gp));
 found:
#ifndef CONFIG_DEBUG_PAGEALLOC
        /*
         * Pick a size for the linear mapping. Currently, we only support
         * 16M, 1M and 4K which is the default
         */
        if (mmu_psize_defs[MMU_PAGE_16M].shift)
                mmu_linear_psize = MMU_PAGE_16M;
        else if (mmu_psize_defs[MMU_PAGE_1M].shift)
                mmu_linear_psize = MMU_PAGE_1M;
#endif /* CONFIG_DEBUG_PAGEALLOC */

#ifdef CONFIG_PPC_64K_PAGES
        /*
         * Pick a size for the ordinary pages. Default is 4K, we support
         * 64K for user mappings and vmalloc if supported by the processor.
         * We only use 64k for ioremap if the processor
         * (and firmware) support cache-inhibited large pages.
         * If not, we use 4k and set mmu_ci_restrictions so that
         * hash_page knows to switch processes that use cache-inhibited
         * mappings to 4k pages.
         */
        if (mmu_psize_defs[MMU_PAGE_64K].shift) {
                mmu_virtual_psize = MMU_PAGE_64K;
                mmu_vmalloc_psize = MMU_PAGE_64K;
                if (mmu_linear_psize == MMU_PAGE_4K)
                        mmu_linear_psize = MMU_PAGE_64K;
                if (cpu_has_feature(CPU_FTR_CI_LARGE_PAGE)) {
                        /*
                         * Don't use 64k pages for ioremap on pSeries, since
                         * that would stop us accessing the HEA ethernet.
                         */
                        if (!machine_is(pseries))
                                mmu_io_psize = MMU_PAGE_64K;
                } else
                        mmu_ci_restrictions = 1;
        }
#endif /* CONFIG_PPC_64K_PAGES */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
        /* We try to use 16M pages for vmemmap if that is supported
         * and we have at least 1G of RAM at boot
         */
        if (mmu_psize_defs[MMU_PAGE_16M].shift &&
            lmb_phys_mem_size() >= 0x40000000)
                mmu_vmemmap_psize = MMU_PAGE_16M;
        else if (mmu_psize_defs[MMU_PAGE_64K].shift)
                mmu_vmemmap_psize = MMU_PAGE_64K;
        else
                mmu_vmemmap_psize = MMU_PAGE_4K;
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

        printk(KERN_DEBUG "Page orders: linear mapping = %d, "
               "virtual = %d, io = %d"
#ifdef CONFIG_SPARSEMEM_VMEMMAP
               ", vmemmap = %d"
#endif
               "\n",
               mmu_psize_defs[mmu_linear_psize].shift,
               mmu_psize_defs[mmu_virtual_psize].shift,
               mmu_psize_defs[mmu_io_psize].shift
#ifdef CONFIG_SPARSEMEM_VMEMMAP
               ,mmu_psize_defs[mmu_vmemmap_psize].shift
#endif
               );

#ifdef CONFIG_HUGETLB_PAGE
        /* Reserve 16G huge page memory sections for huge pages */
        of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);

        /* Set default large page size. Currently, we pick 16M or 1M depending
         * on what is available
         */
        if (mmu_psize_defs[MMU_PAGE_16M].shift)
                HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
        /* With 4k/4level pagetables, we can't (for now) cope with a
         * huge page size < PMD_SIZE */
        else if (mmu_psize_defs[MMU_PAGE_1M].shift)
                HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
#endif /* CONFIG_HUGETLB_PAGE */
}

static int __init htab_dt_scan_pftsize(unsigned long node,
                                       const char *uname, int depth,
                                       void *data)
{
        char *type = of_get_flat_dt_prop(node, "device_type", NULL);
        u32 *prop;

        /* We are scanning "cpu" nodes only */
        if (type == NULL || strcmp(type, "cpu") != 0)
                return 0;

        prop = (u32 *)of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
        if (prop != NULL) {
                /* pft_size[0] is the NUMA CEC cookie */
                ppc64_pft_size = prop[1];
                return 1;
        }
        return 0;
}

static unsigned long __init htab_get_table_size(void)
{
        unsigned long mem_size, rnd_mem_size, pteg_count, psize;

        /* If the hash size isn't already provided by the platform, we try to
         * retrieve it from the device-tree.  If it's not there either, we
         * calculate it now based on the total RAM size
         */
        if (ppc64_pft_size == 0)
                of_scan_flat_dt(htab_dt_scan_pftsize, NULL);
        if (ppc64_pft_size)
                return 1UL << ppc64_pft_size;

        /* round mem_size up to next power of 2 */
        mem_size = lmb_phys_mem_size();
        rnd_mem_size = 1UL << __ilog2(mem_size);
        if (rnd_mem_size < mem_size)
                rnd_mem_size <<= 1;

        /* # pages / 2 */
        psize = mmu_psize_defs[mmu_virtual_psize].shift;
        pteg_count = max(rnd_mem_size >> (psize + 1), 1UL << 11);

        return pteg_count << 7;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void create_section_mapping(unsigned long start, unsigned long end)
{
        BUG_ON(htab_bolt_mapping(start, end, __pa(start),
                                 pgprot_val(PAGE_KERNEL), mmu_linear_psize,
                                 mmu_kernel_ssize));
}

int remove_section_mapping(unsigned long start, unsigned long end)
{
        return htab_remove_mapping(start, end, mmu_linear_psize,
                                   mmu_kernel_ssize);
}
#endif /* CONFIG_MEMORY_HOTPLUG */

/* Patch the instruction at *insn_addr to be a "bl" (branch and link) to the
 * function whose descriptor is passed in func, then flush the icache for the
 * patched word.  Used to wire the low-level hash assembly call sites to the
 * platform's hpte_* operations.
 */
static inline void make_bl(unsigned int *insn_addr, void *func)
{
        unsigned long funcp = *((unsigned long *)func);
        int offset = funcp - (unsigned long)insn_addr;

        *insn_addr = (unsigned int)(0x48000001 | (offset & 0x03fffffc));
        flush_icache_range((unsigned long)insn_addr, 4+
                           (unsigned long)insn_addr);
}

static void __init htab_finish_init(void)
{
        extern unsigned int *htab_call_hpte_insert1;
        extern unsigned int *htab_call_hpte_insert2;
        extern unsigned int *htab_call_hpte_remove;
        extern unsigned int *htab_call_hpte_updatepp;

#ifdef CONFIG_PPC_HAS_HASH_64K
        extern unsigned int *ht64_call_hpte_insert1;
        extern unsigned int *ht64_call_hpte_insert2;
        extern unsigned int *ht64_call_hpte_remove;
        extern unsigned int *ht64_call_hpte_updatepp;

        make_bl(ht64_call_hpte_insert1, ppc_md.hpte_insert);
        make_bl(ht64_call_hpte_insert2, ppc_md.hpte_insert);
        make_bl(ht64_call_hpte_remove, ppc_md.hpte_remove);
        make_bl(ht64_call_hpte_updatepp, ppc_md.hpte_updatepp);
#endif /* CONFIG_PPC_HAS_HASH_64K */

        make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert);
        make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert);
        make_bl(htab_call_hpte_remove, ppc_md.hpte_remove);
        make_bl(htab_call_hpte_updatepp, ppc_md.hpte_updatepp);
}

static void __init htab_initialize(void)
{
        unsigned long table;
        unsigned long pteg_count;
        unsigned long prot;
        unsigned long base = 0, size = 0, limit;
        int i;

        DBG(" -> htab_initialize()\n");

        /* Initialize segment sizes */
        htab_init_seg_sizes();

        /* Initialize page sizes */
        htab_init_page_sizes();

        if (cpu_has_feature(CPU_FTR_1T_SEGMENT)) {
                mmu_kernel_ssize = MMU_SEGSIZE_1T;
                mmu_highuser_ssize = MMU_SEGSIZE_1T;
                printk(KERN_INFO "Using 1TB segments\n");
        }

        /*
         * Calculate the required size of the htab.  We want the number of
         * PTEGs to equal one half the number of real pages.
         */
        htab_size_bytes = htab_get_table_size();
        pteg_count = htab_size_bytes >> 7;

        htab_hash_mask = pteg_count - 1;

        if (firmware_has_feature(FW_FEATURE_LPAR)) {
                /* Using a hypervisor which owns the htab */
                htab_address = NULL;
                _SDR1 = 0;
        } else {
                /* Find storage for the HPT.  Must be contiguous in
                 * the absolute address space.  On cell we want it to be
                 * in the first 2 Gig so we can use it for IOMMU hacks.
                 */
                if (machine_is(cell))
                        limit = 0x80000000;
                else
                        limit = 0;

                table = lmb_alloc_base(htab_size_bytes, htab_size_bytes, limit);

                DBG("Hash table allocated at %lx, size: %lx\n", table,
                    htab_size_bytes);

                htab_address = abs_to_virt(table);

                /* htab absolute addr + encoded htabsize */
                _SDR1 = table + __ilog2(pteg_count) - 11;

                /* Initialize the HPT with no entries */
                memset((void *)table, 0, htab_size_bytes);

                /* Set SDR1 */
                mtspr(SPRN_SDR1, _SDR1);
        }

        prot = pgprot_val(PAGE_KERNEL);

#ifdef CONFIG_DEBUG_PAGEALLOC
        linear_map_hash_count = lmb_end_of_DRAM() >> PAGE_SHIFT;
        linear_map_hash_slots = __va(lmb_alloc_base(linear_map_hash_count,
                                                    1, lmb.rmo_size));
        memset(linear_map_hash_slots, 0, linear_map_hash_count);
#endif /* CONFIG_DEBUG_PAGEALLOC */

        /* On U3 based machines, we need to reserve the DART area and
         * _NOT_ map it to avoid cache paradoxes as it's remapped non
         * cacheable later on
         */

        /* create the bolted linear mapping in the hash table */
        for (i=0; i < lmb.memory.cnt; i++) {
                base = (unsigned long)__va(lmb.memory.region[i].base);
                size = lmb.memory.region[i].size;

                DBG("creating mapping for region: %lx..%lx (prot: %x)\n",
                    base, size, prot);

#ifdef CONFIG_U3_DART
                /* Do not map the DART space. Fortunately, it will be aligned
                 * in such a way that it will not cross two lmb regions and
                 * will fit within a single 16Mb page.
                 * The DART space is assumed to be a full 16Mb region even if
                 * we only use 2Mb of that space. We will use more of it later
                 * for AGP GART. We have to use a full 16Mb large page.
                 */
                DBG("DART base: %lx\n", dart_tablebase);

                if (dart_tablebase != 0 && dart_tablebase >= base
                    && dart_tablebase < (base + size)) {
                        unsigned long dart_table_end = dart_tablebase + 16 * MB;
                        if (base != dart_tablebase)
                                BUG_ON(htab_bolt_mapping(base, dart_tablebase,
                                                         __pa(base), prot,
                                                         mmu_linear_psize,
                                                         mmu_kernel_ssize));
                        if ((base + size) > dart_table_end)
                                BUG_ON(htab_bolt_mapping(dart_tablebase+16*MB,
                                                         base + size,
                                                         __pa(dart_table_end),
                                                         prot,
                                                         mmu_linear_psize,
                                                         mmu_kernel_ssize));
                        continue;
                }
#endif /* CONFIG_U3_DART */
                BUG_ON(htab_bolt_mapping(base, base + size, __pa(base),
                                         prot, mmu_linear_psize, mmu_kernel_ssize));
        }

        /*
         * If we have a memory_limit and we've allocated TCEs then we need to
         * explicitly map the TCE area at the top of RAM. We also cope with the
         * case that the TCEs start below memory_limit.
         * tce_alloc_start/end are 16MB aligned so the mapping should work
         * for either 4K or 16MB pages.
         */
        if (tce_alloc_start) {
                tce_alloc_start = (unsigned long)__va(tce_alloc_start);
                tce_alloc_end = (unsigned long)__va(tce_alloc_end);

                if (base + size >= tce_alloc_start)
                        tce_alloc_start = base + size + 1;

                BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end,
                                         __pa(tce_alloc_start), prot,
                                         mmu_linear_psize, mmu_kernel_ssize));
        }

        htab_finish_init();

        DBG(" <- htab_initialize()\n");
}
#undef KB
#undef MB

void __init early_init_mmu(void)
{
        /* Setup initial STAB address in the PACA */
        get_paca()->stab_real = __pa((u64)&initial_stab);
        get_paca()->stab_addr = (u64)&initial_stab;

        /* Initialize the MMU Hash table and create the linear mapping
         * of memory.  Has to be done before stab/slb initialization as
         * this is currently where the page size encoding is obtained
         */
        htab_initialize();

        /* Initialize stab / SLB management except on iSeries
         */
        if (cpu_has_feature(CPU_FTR_SLB))
                slb_initialize();
        else if (!firmware_has_feature(FW_FEATURE_ISERIES))
                stab_initialize(get_paca()->stab_real);
}

#ifdef CONFIG_SMP
void __cpuinit early_init_mmu_secondary(void)
{
        /* Initialize hash table for that CPU */
        if (!firmware_has_feature(FW_FEATURE_LPAR))
                mtspr(SPRN_SDR1, _SDR1);

        /* Initialize STAB/SLB. We use a virtual address as it works
         * in real mode on pSeries and we want a virtual address on
         * iSeries anyway
         */
        if (cpu_has_feature(CPU_FTR_SLB))
                slb_initialize();
        else
                stab_initialize(get_paca()->stab_addr);
}
#endif /* CONFIG_SMP */

/*
 * Called by asm hashtable.S for doing lazy icache flush
 */
unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
{
        struct page *page;

        if (!pfn_valid(pte_pfn(pte)))
                return pp;

        page = pte_page(pte);

        /* page is dirty */
        if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
                if (trap == 0x400) {
                        __flush_dcache_icache(page_address(page));
                        set_bit(PG_arch_1, &page->flags);
                } else
                        pp |= HPTE_R_N;
        }
        return pp;
}

#ifdef CONFIG_PPC_MM_SLICES
unsigned int get_paca_psize(unsigned long addr)
{
        unsigned long index, slices;

        if (addr < SLICE_LOW_TOP) {
                slices = get_paca()->context.low_slices_psize;
                index = GET_LOW_SLICE_INDEX(addr);
        } else {
                slices = get_paca()->context.high_slices_psize;
                index = GET_HIGH_SLICE_INDEX(addr);
        }
        return (slices >> (index * 4)) & 0xF;
}

#else
unsigned int get_paca_psize(unsigned long addr)
{
        return get_paca()->context.user_psize;
}
#endif

/*
 * Demote a segment to using 4k pages.
 * For now this makes the whole process use 4k pages.
 */
#ifdef CONFIG_PPC_64K_PAGES
void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
{
        if (get_slice_psize(mm, addr) == MMU_PAGE_4K)
                return;
        slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K);
#ifdef CONFIG_SPU_BASE
        spu_flush_all_slbs(mm);
#endif
        if (get_paca_psize(addr) != MMU_PAGE_4K) {
                get_paca()->context = mm->context;
                slb_flush_and_rebolt();
        }
}
#endif /* CONFIG_PPC_64K_PAGES */

#ifdef CONFIG_PPC_SUBPAGE_PROT
/*
 * This looks up a 2-bit protection code for a 4k subpage of a 64k page.
 * Userspace sets the subpage permissions using the subpage_prot system call.
 *
 * Result is 0: full permissions, _PAGE_RW: read-only,
 * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access.
 */
static int subpage_protection(pgd_t *pgdir, unsigned long ea)
{
        struct subpage_prot_table *spt = pgd_subpage_prot(pgdir);
        u32 spp = 0;
        u32 **sbpm, *sbpp;

        if (ea >= spt->maxaddr)
                return 0;
        if (ea < 0x100000000) {
                /* addresses below 4GB use spt->low_prot */
                sbpm = spt->low_prot;
        } else {
                sbpm = spt->protptrs[ea >> SBP_L3_SHIFT];
                if (!sbpm)
                        return 0;
        }
        sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
        if (!sbpp)
                return 0;
        spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)];

        /* extract 2-bit bitfield for this 4k subpage */
        spp >>= 30 - 2 * ((ea >> 12) & 0xf);

        /* turn 0,1,2,3 into combination of _PAGE_USER and _PAGE_RW */
        spp = ((spp & 2) ? _PAGE_USER : 0) | ((spp & 1) ? _PAGE_RW : 0);
        return spp;
}

#else /* CONFIG_PPC_SUBPAGE_PROT */
static inline int subpage_protection(pgd_t *pgdir, unsigned long ea)
{
        return 0;
}
#endif

/* Result code is:
 *  0 - handled
 *  1 - normal page fault
 * -1 - critical hash insertion error
 * -2 - access not permitted by subpage protection mechanism
 */
int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
{
        void *pgdir;
        unsigned long vsid;
        struct mm_struct *mm;
        pte_t *ptep;
        const struct cpumask *tmp;
        int rc, user_region = 0, local = 0;
        int psize, ssize;

        DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
                ea, access, trap);

        if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) {
                DBG_LOW(" out of pgtable range !\n");
                return 1;
        }

        /* Get region & vsid */
        switch (REGION_ID(ea)) {
        case USER_REGION_ID:
                user_region = 1;
                mm = current->mm;
                if (! mm) {
                        DBG_LOW(" user region with no mm !\n");
                        return 1;
                }
                psize = get_slice_psize(mm, ea);
                ssize = user_segment_size(ea);
                vsid = get_vsid(mm->context.id, ea, ssize);
                break;
        case VMALLOC_REGION_ID:
                mm = &init_mm;
                vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
                if (ea < VMALLOC_END)
                        psize = mmu_vmalloc_psize;
                else
                        psize = mmu_io_psize;
                ssize = mmu_kernel_ssize;
                break;
        default:
                /* Not a valid range
                 * Send the problem up to do_page_fault
                 */
                return 1;
        }
        DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);

        /* Get pgdir */
        pgdir = mm->pgd;
        if (pgdir == NULL)
                return 1;

        /* Check CPU locality */
        tmp = cpumask_of(smp_processor_id());
        if (user_region && cpumask_equal(mm_cpumask(mm), tmp))
                local = 1;

#ifdef CONFIG_HUGETLB_PAGE
        /* Handle hugepage regions */
        if (HPAGE_SHIFT && mmu_huge_psizes[psize]) {
                DBG_LOW(" -> huge page !\n");
                return hash_huge_page(mm, access, ea, vsid, local, trap);
        }
#endif /* CONFIG_HUGETLB_PAGE */

#ifndef CONFIG_PPC_64K_PAGES
        /* If we use 4K pages and our psize is not 4K, then we are hitting
         * a special driver mapping, we need to align the address before
         * we fetch the PTE
         */
        if (psize != MMU_PAGE_4K)
                ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
#endif /* CONFIG_PPC_64K_PAGES */

        /* Get PTE and page size from page tables */
        ptep = find_linux_pte(pgdir, ea);
        if (ptep == NULL || !pte_present(*ptep)) {
                DBG_LOW(" no PTE !\n");
                return 1;
        }

#ifndef CONFIG_PPC_64K_PAGES
        DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
#else
        DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep),
                pte_val(*(ptep + PTRS_PER_PTE)));
#endif
        /* Pre-check access permissions (will be re-checked atomically
         * in __hash_page_XX but this pre-check is a fast path)
         */
        if (access & ~pte_val(*ptep)) {
                DBG_LOW(" no access !\n");
                return 1;
        }

        /* Do actual hashing */
#ifdef CONFIG_PPC_64K_PAGES
        /* If _PAGE_4K_PFN is set, make sure this is a 4k segment */
        if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
                demote_segment_4k(mm, ea);
                psize = MMU_PAGE_4K;
        }

        /* If this PTE is non-cacheable and we have restrictions on
         * using non cacheable large pages, then we switch to 4k
         */
        if (mmu_ci_restrictions && psize == MMU_PAGE_64K &&
            (pte_val(*ptep) & _PAGE_NO_CACHE)) {
                if (user_region) {
                        demote_segment_4k(mm, ea);
                        psize = MMU_PAGE_4K;
                } else if (ea < VMALLOC_END) {
                        /*
                         * some driver did a non-cacheable mapping
                         * in vmalloc space, so switch vmalloc
                         * to 4k pages
                         */
                        printk(KERN_ALERT "Reducing vmalloc segment "
                               "to 4kB pages because of "
                               "non-cacheable mapping\n");
                        psize = mmu_vmalloc_psize = MMU_PAGE_4K;
#ifdef CONFIG_SPU_BASE
                        spu_flush_all_slbs(mm);
#endif
                }
        }
        if (user_region) {
                if (psize != get_paca_psize(ea)) {
                        get_paca()->context = mm->context;
                        slb_flush_and_rebolt();
                }
        } else if (get_paca()->vmalloc_sllp !=
                   mmu_psize_defs[mmu_vmalloc_psize].sllp) {
                get_paca()->vmalloc_sllp =
                        mmu_psize_defs[mmu_vmalloc_psize].sllp;
                slb_vmalloc_update();
        }
#endif /* CONFIG_PPC_64K_PAGES */

#ifdef CONFIG_PPC_HAS_HASH_64K
        if (psize == MMU_PAGE_64K)
                rc = __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize);
        else
#endif /* CONFIG_PPC_HAS_HASH_64K */
        {
                int spp = subpage_protection(pgdir, ea);
                if (access & spp)
                        rc = -2;
                else
                        rc = __hash_page_4K(ea, access, vsid, ptep, trap,
                                            local, ssize, spp);
        }

#ifndef CONFIG_PPC_64K_PAGES
        DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep));
#else
        DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep),
                pte_val(*(ptep + PTRS_PER_PTE)));
#endif
        DBG_LOW(" -> rc=%d\n", rc);
        return rc;
}
EXPORT_SYMBOL_GPL(hash_page);

void hash_preload(struct mm_struct *mm, unsigned long ea,
                  unsigned long access, unsigned long trap)
{
        unsigned long vsid;
        void *pgdir;
        pte_t *ptep;
        unsigned long flags;
        int local = 0;
        int ssize;

        BUG_ON(REGION_ID(ea) != USER_REGION_ID);

#ifdef CONFIG_PPC_MM_SLICES
        /* We only prefault standard pages for now */
        if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize))
                return;
#endif

        DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx,"
                " trap=%lx\n", mm, mm->pgd, ea, access, trap);

        /* Get Linux PTE if available */
        pgdir = mm->pgd;
        if (pgdir == NULL)
                return;
        ptep = find_linux_pte(pgdir, ea);
        if (!ptep)
                return;

#ifdef CONFIG_PPC_64K_PAGES
        /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on
         * a 64K kernel), then we don't preload, hash_page() will take
         * care of it once we actually try to access the page.
         * That way we don't have to duplicate all of the logic for segment
         * page size demotion here
         */
        if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
                return;
#endif /* CONFIG_PPC_64K_PAGES */

        /* Get VSID */
        ssize = user_segment_size(ea);
        vsid = get_vsid(mm->context.id, ea, ssize);

        /* Hash doesn't like irqs */
        local_irq_save(flags);

        /* Is that local to this CPU ? */
        if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
                local = 1;

        /* Hash it in */
#ifdef CONFIG_PPC_HAS_HASH_64K
        if (mm->context.user_psize == MMU_PAGE_64K)
                __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize);
        else
#endif /* CONFIG_PPC_HAS_HASH_64K */
                __hash_page_4K(ea, access, vsid, ptep, trap, local, ssize,
                               subpage_protection(pgdir, ea));

        local_irq_restore(flags);
}

/* WARNING: This is called from hash_low_64.S, if you change this prototype,
 *          do not forget to update the assembly call site !
 */
void flush_hash_page(unsigned long va, real_pte_t pte, int psize, int ssize,
                     int local)
{
        unsigned long hash, index, shift, hidx, slot;

        DBG_LOW("flush_hash_page(va=%016x)\n", va);
        pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
                hash = hpt_hash(va, shift, ssize);
                hidx = __rpte_to_hidx(pte, index);
                if (hidx & _PTEIDX_SECONDARY)
                        hash = ~hash;
                slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
                slot += hidx & _PTEIDX_GROUP_IX;
                DBG_LOW(" sub %d: hash=%x, hidx=%x\n", index, slot, hidx);
                ppc_md.hpte_invalidate(slot, va, psize, ssize, local);
        } pte_iterate_hashed_end();
}

void flush_hash_range(unsigned long number, int local)
{
        if (ppc_md.flush_hash_range)
                ppc_md.flush_hash_range(number, local);
        else {
                int i;
                struct ppc64_tlb_batch *batch =
                        &__get_cpu_var(ppc64_tlb_batch);

                for (i = 0; i < number; i++)
                        flush_hash_page(batch->vaddr[i], batch->pte[i],
                                        batch->psize, batch->ssize, local);
        }
}

/*
 * low_hash_fault is called when the low level hash code failed
 * to insert a PTE due to a hypervisor error
 */
void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
{
        if (user_mode(regs)) {
#ifdef CONFIG_PPC_SUBPAGE_PROT
                if (rc == -2)
                        _exception(SIGSEGV, regs, SEGV_ACCERR, address);
                else
#endif
                        _exception(SIGBUS, regs, BUS_ADRERR, address);
        } else
                bad_page_fault(regs, address, SIGBUS);
}

#ifdef CONFIG_DEBUG_PAGEALLOC
static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
{
        unsigned long hash, hpteg;
        unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
        unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize);
        unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL);
        int ret;

        hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize);
        hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);

        ret = ppc_md.hpte_insert(hpteg, va, __pa(vaddr),
                                 mode, HPTE_V_BOLTED,
                                 mmu_linear_psize, mmu_kernel_ssize);
        BUG_ON (ret < 0);
        spin_lock(&linear_map_hash_lock);
        BUG_ON(linear_map_hash_slots[lmi] & 0x80);
        linear_map_hash_slots[lmi] = ret | 0x80;
        spin_unlock(&linear_map_hash_lock);
}

static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
{
        unsigned long hash, hidx, slot;
        unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
        unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize);

        hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize);
        spin_lock(&linear_map_hash_lock);
        BUG_ON(!(linear_map_hash_slots[lmi] & 0x80));
        hidx = linear_map_hash_slots[lmi] & 0x7f;
        linear_map_hash_slots[lmi] = 0;
        spin_unlock(&linear_map_hash_lock);
        if (hidx & _PTEIDX_SECONDARY)
                hash = ~hash;
        slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
        slot += hidx & _PTEIDX_GROUP_IX;
        ppc_md.hpte_invalidate(slot, va, mmu_linear_psize, mmu_kernel_ssize, 0);
}

void kernel_map_pages(struct page *page, int numpages, int enable)
{
        unsigned long flags, vaddr, lmi;
        int i;

        local_irq_save(flags);
        for (i = 0; i < numpages; i++, page++) {
                vaddr = (unsigned long)page_address(page);
                lmi = __pa(vaddr) >> PAGE_SHIFT;
                if (lmi >= linear_map_hash_count)
                        continue;
                if (enable)
                        kernel_map_linear_page(vaddr, lmi);
                else
                        kernel_unmap_linear_page(vaddr, lmi);
        }
        local_irq_restore(flags);
}
#endif /* CONFIG_DEBUG_PAGEALLOC */