/*
 * Handle caching attributes in page tables (PAT)
 *
 * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
 *          Suresh B Siddha <suresh.b.siddha@intel.com>
 *
 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
 */

#include <linux/seq_file.h>
#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/ioport.h>
#include <linux/kernel.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rbtree.h>

#include <asm/cacheflush.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/x86_init.h>
#include <asm/pgtable.h>
#include <asm/fcntl.h>
#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/pat.h>
#include <asm/io.h>

#include "pat_internal.h"
#include "mm_internal.h"

#undef pr_fmt
#define pr_fmt(fmt) "" fmt

static bool __read_mostly boot_cpu_done;
static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT);
static bool __read_mostly pat_initialized;
static bool __read_mostly init_cm_done;

void pat_disable(const char *reason)
{
	if (pat_disabled)
		return;

	if (boot_cpu_done) {
		WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n");
		return;
	}

	pat_disabled = true;
	pr_info("x86/PAT: %s\n", reason);
}

static int __init nopat(char *str)
{
	pat_disable("PAT support disabled.");
	return 0;
}
early_param("nopat", nopat);

bool pat_enabled(void)
{
	return pat_initialized;
}
EXPORT_SYMBOL_GPL(pat_enabled);

int pat_debug_enable;

static int __init pat_debug_setup(char *str)
{
	pat_debug_enable = 1;
	return 0;
}
__setup("debugpat", pat_debug_setup);

#ifdef CONFIG_X86_PAT
/*
 * X86 PAT uses the page flags arch_1 and uncached together to keep track
 * of the memory type of pages that have a backing page struct.
 *
 * X86 PAT supports 4 different memory types:
 *  - _PAGE_CACHE_MODE_WB
 *  - _PAGE_CACHE_MODE_WC
 *  - _PAGE_CACHE_MODE_UC_MINUS
 *  - _PAGE_CACHE_MODE_WT
 *
 * _PAGE_CACHE_MODE_WB is the default type.
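 *
 * The tracked type is encoded in page->flags with the two flag bits used
 * by the _PGMT_* masks below:
 *
 *	PG_uncached  PG_arch_1	memory type
 *	     0            0	WB (default)
 *	     0            1	WC
 *	     1            0	UC-
 *	     1            1	WT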
 */

#define _PGMT_WB		0
#define _PGMT_WC		(1UL << PG_arch_1)
#define _PGMT_UC_MINUS		(1UL << PG_uncached)
#define _PGMT_WT		(1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_MASK		(1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_CLEAR_MASK	(~_PGMT_MASK)

static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
	unsigned long pg_flags = pg->flags & _PGMT_MASK;

	if (pg_flags == _PGMT_WB)
		return _PAGE_CACHE_MODE_WB;
	else if (pg_flags == _PGMT_WC)
		return _PAGE_CACHE_MODE_WC;
	else if (pg_flags == _PGMT_UC_MINUS)
		return _PAGE_CACHE_MODE_UC_MINUS;
	else
		return _PAGE_CACHE_MODE_WT;
}

static inline void set_page_memtype(struct page *pg,
				    enum page_cache_mode memtype)
{
	unsigned long memtype_flags;
	unsigned long old_flags;
	unsigned long new_flags;

	switch (memtype) {
	case _PAGE_CACHE_MODE_WC:
		memtype_flags = _PGMT_WC;
		break;
	case _PAGE_CACHE_MODE_UC_MINUS:
		memtype_flags = _PGMT_UC_MINUS;
		break;
	case _PAGE_CACHE_MODE_WT:
		memtype_flags = _PGMT_WT;
		break;
	case _PAGE_CACHE_MODE_WB:
	default:
		memtype_flags = _PGMT_WB;
		break;
	}

	do {
		old_flags = pg->flags;
		new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
	} while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
}
#else
static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
	return -1;
}
static inline void set_page_memtype(struct page *pg,
				    enum page_cache_mode memtype)
{
}
#endif

enum {
	PAT_UC = 0,		/* uncached */
	PAT_WC = 1,		/* Write combining */
	PAT_WT = 4,		/* Write Through */
	PAT_WP = 5,		/* Write Protected */
	PAT_WB = 6,		/* Write Back (default) */
	PAT_UC_MINUS = 7,	/* UC, but can be overridden by MTRR */
};

#define CM(c) (_PAGE_CACHE_MODE_ ## c)

static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
{
	enum page_cache_mode cache;
	char *cache_mode;

	switch (pat_val) {
	case PAT_UC:       cache = CM(UC);       cache_mode = "UC  "; break;
	case PAT_WC:       cache = CM(WC);       cache_mode = "WC  "; break;
	case PAT_WT:       cache = CM(WT);       cache_mode = "WT  "; break;
	case PAT_WP:       cache = CM(WP);       cache_mode = "WP  "; break;
	case PAT_WB:       cache = CM(WB);       cache_mode = "WB  "; break;
	case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
	default:           cache = CM(WB);       cache_mode = "WB  "; break;
	}

	memcpy(msg, cache_mode, 4);

	return cache;
}

#undef CM

/*
 * Update the cache mode to pgprot translation tables according to PAT
 * configuration.
 * Using lower indices is preferred, so we start with the highest index.
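 * (Iterating from the highest slot down means a lower-numbered slot is
 * written last and therefore takes precedence when more than one slot
 * encodes the same cache mode.)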
 */
static void __init_cache_modes(u64 pat)
{
	enum page_cache_mode cache;
	char pat_msg[33];
	int i;

	pat_msg[32] = 0;
	for (i = 7; i >= 0; i--) {
		cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
					   pat_msg + 4 * i);
		update_cache_mode_entry(i, cache);
	}
	pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);

	init_cm_done = true;
}

#define PAT(x, y)	((u64)PAT_ ## y << ((x)*8))

static void pat_bsp_init(u64 pat)
{
	u64 tmp_pat;

	if (!boot_cpu_has(X86_FEATURE_PAT)) {
		pat_disable("PAT not supported by CPU.");
		return;
	}

	rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
	if (!tmp_pat) {
		pat_disable("PAT MSR is 0, disabled.");
		return;
	}

	wrmsrl(MSR_IA32_CR_PAT, pat);
	pat_initialized = true;

	__init_cache_modes(pat);
}

static void pat_ap_init(u64 pat)
{
	if (!boot_cpu_has(X86_FEATURE_PAT)) {
		/*
		 * If this happens we are on a secondary CPU, but switched to
		 * PAT on the boot CPU. We have no way to undo PAT.
		 */
		panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
	}

	wrmsrl(MSR_IA32_CR_PAT, pat);
}

void init_cache_modes(void)
{
	u64 pat = 0;

	if (init_cm_done)
		return;

	if (boot_cpu_has(X86_FEATURE_PAT)) {
		/*
		 * CPU supports PAT. Set PAT table to be consistent with
		 * PAT MSR. This case supports the "nopat" boot option, and
		 * virtual machine environments which support PAT without
		 * MTRRs. In particular, Xen has a unique setup of the PAT MSR.
		 *
		 * If the PAT MSR returns 0, it is considered invalid and is
		 * emulated as no PAT.
		 */
		rdmsrl(MSR_IA32_CR_PAT, pat);
	}

	if (!pat) {
		/*
		 * No PAT. Emulate the PAT table that corresponds to the two
		 * cache bits, PWT (Write Through) and PCD (Cache Disable).
		 * This setup is also the same as the BIOS default setup.
		 *
		 * PTE encoding:
		 *
		 *       PCD
		 *       |PWT  PAT
		 *       ||    slot
		 *       00    0    WB : _PAGE_CACHE_MODE_WB
		 *       01    1    WT : _PAGE_CACHE_MODE_WT
		 *       10    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *       11    3    UC : _PAGE_CACHE_MODE_UC
		 *
		 * NOTE: When WC or WP is used, it is redirected to UC- per
		 * the default setup in __cachemode2pte_tbl[].
		 */
		pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
		      PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);
	}

	__init_cache_modes(pat);
}

/**
 * pat_init - Initialize PAT MSR and PAT table
 *
 * This function initializes the PAT MSR and PAT table with an OS-defined
 * value to enable the additional cache attributes WC, WT and WP.
 *
 * This function must be called on all CPUs using the specific sequence of
 * operations defined in the Intel SDM. mtrr_rendezvous_handler() provides
 * this procedure for PAT.
 */
void pat_init(void)
{
	u64 pat;
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (pat_disabled)
		return;

	if ((c->x86_vendor == X86_VENDOR_INTEL) &&
	    (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
	     ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
		/*
		 * PAT support with the lower four entries. Intel Pentium 2,
		 * 3, M, and 4 are affected by PAT errata, which makes the
		 * upper four entries unusable. To be on the safe side, we
		 * don't use those.
		 *
		 * PTE encoding:
		 *      PAT
		 *      |PCD
		 *      ||PWT  PAT
		 *      |||    slot
		 *      000    0    WB : _PAGE_CACHE_MODE_WB
		 *      001    1    WC : _PAGE_CACHE_MODE_WC
		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *      011    3    UC : _PAGE_CACHE_MODE_UC
		 * PAT bit unused
		 *
		 * NOTE: When WT or WP is used, it is redirected to UC- per
		 * the default setup in __cachemode2pte_tbl[].
		 */
		pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
		      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
	} else {
		/*
		 * Full PAT support. We put WT in slot 7 to improve
		 * robustness in the presence of errata that might cause
		 * the high PAT bit to be ignored. This way, a buggy slot 7
		 * access will hit slot 3, and slot 3 is UC, so at worst
		 * we lose performance without causing a correctness issue.
		 * Pentium 4 erratum N46 is an example of such an erratum,
		 * although we try not to use PAT at all on affected CPUs.
		 *
		 * PTE encoding:
		 *      PAT
		 *      |PCD
		 *      ||PWT  PAT
		 *      |||    slot
		 *      000    0    WB : _PAGE_CACHE_MODE_WB
		 *      001    1    WC : _PAGE_CACHE_MODE_WC
		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *      011    3    UC : _PAGE_CACHE_MODE_UC
		 *      100    4    WB : Reserved
		 *      101    5    WP : _PAGE_CACHE_MODE_WP
		 *      110    6    UC-: Reserved
		 *      111    7    WT : _PAGE_CACHE_MODE_WT
		 *
		 * The reserved slots are unused, but mapped to their
		 * corresponding types in the presence of PAT errata.
		 */
		pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
		      PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
	}

	if (!boot_cpu_done) {
		pat_bsp_init(pat);
		boot_cpu_done = true;
	} else {
		pat_ap_init(pat);
	}
}

#undef PAT

static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype accesses */

/*
 * Returns the intersection of the PAT memory type and the MTRR memory type,
 * expressed as a memory type as PAT understands it (the PAT and MTRR
 * encodings do not use the same values).
 * The intersection is based on the "Effective Memory Type" tables in
 * IA-32 SDM vol 3a.
 */
static unsigned long pat_x_mtrr_type(u64 start, u64 end,
				     enum page_cache_mode req_type)
{
	/*
	 * Look for the MTRR hint to get the effective type in case where the
	 * PAT request is for WB.
	 */
	if (req_type == _PAGE_CACHE_MODE_WB) {
		u8 mtrr_type, uniform;

		mtrr_type = mtrr_type_lookup(start, end, &uniform);
		if (mtrr_type != MTRR_TYPE_WRBACK)
			return _PAGE_CACHE_MODE_UC_MINUS;

		return _PAGE_CACHE_MODE_WB;
	}

	return req_type;
}

struct pagerange_state {
	unsigned long	cur_pfn;
	int		ram;
	int		not_ram;
};

static int
pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
{
	struct pagerange_state *state = arg;

	state->not_ram	|= initial_pfn > state->cur_pfn;
	state->ram	|= total_nr_pages > 0;
	state->cur_pfn	 = initial_pfn + total_nr_pages;

	return state->ram && state->not_ram;
}

static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
{
	int ret = 0;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	struct pagerange_state state = {start_pfn, 0, 0};

	/*
	 * For legacy reasons, the physical address range in the legacy ISA
	 * region is tracked as non-RAM. This allows users of /dev/mem to
	 * map portions of the legacy ISA region, even when some of those
	 * portions are listed (or not even listed) with different e820
	 * types (RAM/reserved/..).
	 */
	if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
		start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;

	if (start_pfn < end_pfn) {
		ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
				&state, pagerange_is_ram_callback);
	}

	return (ret > 0) ? -1 : (state.ram ? 1 : 0);
}

/*
 * For RAM pages, we use page flags to mark the pages with the appropriate
 * type. The page flags are limited to four types, WB (default), WC, WT and
 * UC-. A WP request fails with -EINVAL, and UC gets redirected to UC-.
 * Setting a new memory type is only allowed for a page mapped with the
 * default WB type.
 *
 * Here we do two passes:
 * - Find the memtype of all the pages in the range, look for any conflicts.
 * - In case of no conflicts, set the new memtype for pages in the range.
 */
static int reserve_ram_pages_type(u64 start, u64 end,
				  enum page_cache_mode req_type,
				  enum page_cache_mode *new_type)
{
	struct page *page;
	u64 pfn;

	if (req_type == _PAGE_CACHE_MODE_WP) {
		if (new_type)
			*new_type = _PAGE_CACHE_MODE_UC_MINUS;
		return -EINVAL;
	}

	if (req_type == _PAGE_CACHE_MODE_UC) {
		/* We do not support strong UC */
		WARN_ON_ONCE(1);
		req_type = _PAGE_CACHE_MODE_UC_MINUS;
	}

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		enum page_cache_mode type;

		page = pfn_to_page(pfn);
		type = get_page_memtype(page);
		if (type != _PAGE_CACHE_MODE_WB) {
			pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
				start, end - 1, type, req_type);
			if (new_type)
				*new_type = type;

			return -EBUSY;
		}
	}

	if (new_type)
		*new_type = req_type;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		set_page_memtype(page, req_type);
	}
	return 0;
}

static int free_ram_pages_type(u64 start, u64 end)
{
	struct page *page;
	u64 pfn;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		set_page_memtype(page, _PAGE_CACHE_MODE_WB);
	}
	return 0;
}

/*
 * req_type typically has one of the following types:
 * - _PAGE_CACHE_MODE_WB
 * - _PAGE_CACHE_MODE_WC
 * - _PAGE_CACHE_MODE_UC_MINUS
 * - _PAGE_CACHE_MODE_UC
 * - _PAGE_CACHE_MODE_WT
 *
 * If new_type is NULL, the function returns an error if it cannot reserve
 * the region with req_type. If new_type is non-NULL, the function returns
 * the available type in *new_type on success. On any error it returns a
 * negative value.
 */
int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
		    enum page_cache_mode *new_type)
{
	struct memtype *new;
	enum page_cache_mode actual_type;
	int is_range_ram;
	int err = 0;

	BUG_ON(start >= end); /* end is exclusive */

	if (!pat_enabled()) {
		/* This is identical to page table setting without PAT */
		if (new_type)
			*new_type = req_type;
		return 0;
	}

	/* Low ISA region is always mapped WB in page table. No need to track */
	if (x86_platform.is_untracked_pat_range(start, end)) {
		if (new_type)
			*new_type = _PAGE_CACHE_MODE_WB;
		return 0;
	}

	/*
	 * Call mtrr_lookup to get the type hint. This is an
	 * optimization for /dev/mem mmap'ers into WB memory (BIOS
	 * tools and ACPI tools). Use WB request for WB memory and use
	 * UC_MINUS otherwise.
	 */
	actual_type = pat_x_mtrr_type(start, end, req_type);

	if (new_type)
		*new_type = actual_type;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1) {

		err = reserve_ram_pages_type(start, end, req_type, new_type);

		return err;
	} else if (is_range_ram < 0) {
		return -EINVAL;
	}

	new = kzalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	new->start	= start;
	new->end	= end;
	new->type	= actual_type;

	spin_lock(&memtype_lock);

	err = rbt_memtype_check_insert(new, new_type);
	if (err) {
		pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
			start, end - 1,
			cattr_name(new->type), cattr_name(req_type));
		kfree(new);
		spin_unlock(&memtype_lock);

		return err;
	}

	spin_unlock(&memtype_lock);

	dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
		start, end - 1, cattr_name(new->type), cattr_name(req_type),
		new_type ? cattr_name(*new_type) : "-");

	return err;
}

int free_memtype(u64 start, u64 end)
{
	int err = -EINVAL;
	int is_range_ram;
	struct memtype *entry;

	if (!pat_enabled())
		return 0;

	/* Low ISA region is always mapped WB. No need to track */
	if (x86_platform.is_untracked_pat_range(start, end))
		return 0;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1) {

		err = free_ram_pages_type(start, end);

		return err;
	} else if (is_range_ram < 0) {
		return -EINVAL;
	}

	spin_lock(&memtype_lock);
	entry = rbt_memtype_erase(start, end);
	spin_unlock(&memtype_lock);

	if (IS_ERR(entry)) {
		pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
			current->comm, current->pid, start, end - 1);
		return -EINVAL;
	}

	kfree(entry);

	dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1);

	return 0;
}


/**
 * lookup_memtype - Looks up the memory type for a physical address
 * @paddr: physical address of which memory type needs to be looked up
 *
 * Only to be called when PAT is enabled
 *
 * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
 * or _PAGE_CACHE_MODE_WT.
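 *
 * A non-RAM address with no reservation in the memtype tree is reported
 * as _PAGE_CACHE_MODE_UC_MINUS.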
 */
static enum page_cache_mode lookup_memtype(u64 paddr)
{
	enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB;
	struct memtype *entry;

	if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
		return rettype;

	if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
		struct page *page;

		page = pfn_to_page(paddr >> PAGE_SHIFT);
		return get_page_memtype(page);
	}

	spin_lock(&memtype_lock);

	entry = rbt_memtype_lookup(paddr);
	if (entry != NULL)
		rettype = entry->type;
	else
		rettype = _PAGE_CACHE_MODE_UC_MINUS;

	spin_unlock(&memtype_lock);
	return rettype;
}

/**
 * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type
 * of @pfn cannot be overridden by UC MTRR memory type.
 *
 * Only to be called when PAT is enabled.
 *
 * Returns true if the PAT memory type of @pfn is UC, UC-, or WC.
 * Returns false in other cases.
 */
bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn)
{
	enum page_cache_mode cm = lookup_memtype(PFN_PHYS(pfn));

	return cm == _PAGE_CACHE_MODE_UC ||
	       cm == _PAGE_CACHE_MODE_UC_MINUS ||
	       cm == _PAGE_CACHE_MODE_WC;
}
EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr);

/**
 * io_reserve_memtype - Request a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 * @type: A pointer to the requested memtype. On success, the requested type
 * or any other compatible type that was available for the region is returned
 *
 * On success, returns 0
 * On failure, returns non-zero
 */
int io_reserve_memtype(resource_size_t start, resource_size_t end,
			enum page_cache_mode *type)
{
	resource_size_t size = end - start;
	enum page_cache_mode req_type = *type;
	enum page_cache_mode new_type;
	int ret;

	WARN_ON_ONCE(iomem_map_sanity_check(start, size));

	ret = reserve_memtype(start, end, req_type, &new_type);
	if (ret)
		goto out_err;

	if (!is_new_memtype_allowed(start, size, req_type, new_type))
		goto out_free;

	if (kernel_map_sync_memtype(start, size, new_type) < 0)
		goto out_free;

	*type = new_type;
	return 0;

out_free:
	free_memtype(start, end);
	ret = -EBUSY;
out_err:
	return ret;
}

/**
 * io_free_memtype - Release a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 */
void io_free_memtype(resource_size_t start, resource_size_t end)
{
	free_memtype(start, end);
}

int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size)
{
	enum page_cache_mode type = _PAGE_CACHE_MODE_WC;

	return io_reserve_memtype(start, start + size, &type);
}
EXPORT_SYMBOL(arch_io_reserve_memtype_wc);

void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
{
	io_free_memtype(start, start + size);
}
EXPORT_SYMBOL(arch_io_free_memtype_wc);

pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
				unsigned long size, pgprot_t vma_prot)
{
	if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size))
		vma_prot = pgprot_decrypted(vma_prot);

	return vma_prot;
}

#ifdef CONFIG_STRICT_DEVMEM
/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	return 1;
}
#else
/* This check is needed to avoid cache aliasing when PAT is enabled */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	u64 from = ((u64)pfn) << PAGE_SHIFT;
	u64 to = from + size;
	u64 cursor = from;

	if (!pat_enabled())
		return 1;

	while (cursor < to) {
		if (!devmem_is_allowed(pfn))
			return 0;
		cursor += PAGE_SIZE;
		pfn++;
	}
	return 1;
}
#endif /* CONFIG_STRICT_DEVMEM */

int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
				unsigned long size, pgprot_t *vma_prot)
{
	enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB;

	if (!range_is_allowed(pfn, size))
		return 0;

	if (file->f_flags & O_DSYNC)
		pcm = _PAGE_CACHE_MODE_UC_MINUS;

	*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
			     cachemode2protval(pcm));
	return 1;
}

/*
 * Change the memory type for the physical address range in kernel identity
 * mapping space if that range is a part of the identity map.
 */
int kernel_map_sync_memtype(u64 base, unsigned long size,
			    enum page_cache_mode pcm)
{
	unsigned long id_sz;

	if (base > __pa(high_memory-1))
		return 0;

	/*
	 * Some areas in the middle of the kernel identity range
	 * are not mapped, like the PCI space.
	 */
	if (!page_is_ram(base >> PAGE_SHIFT))
		return 0;

	id_sz = (__pa(high_memory-1) <= base + size) ?
				__pa(high_memory) - base :
				size;

	if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
		pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
			current->comm, current->pid,
			cattr_name(pcm),
			base, (unsigned long long)(base + size-1));
		return -EINVAL;
	}
	return 0;
}

/*
 * Internal interface to reserve a range of physical memory with prot.
 * Reserves non-RAM regions only, and after a successful reserve_memtype
 * this function also keeps the identity mapping (if any) in sync with the
 * new prot.
 */
static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
				int strict_prot)
{
	int is_ram = 0;
	int ret;
	enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot);
	enum page_cache_mode pcm = want_pcm;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);

	/*
	 * reserve_pfn_range() for RAM pages. We do not refcount to keep
	 * track of the number of mappings of RAM pages. We can assert that
	 * the type requested matches the type of the first page in the range.
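	 * If the tracked type of that page differs from the requested one,
	 * the tracked type wins and the caller's protection is adjusted
	 * to it below.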
	 */
	if (is_ram) {
		if (!pat_enabled())
			return 0;

		pcm = lookup_memtype(paddr);
		if (want_pcm != pcm) {
			pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
				current->comm, current->pid,
				cattr_name(want_pcm),
				(unsigned long long)paddr,
				(unsigned long long)(paddr + size - 1),
				cattr_name(pcm));
			*vma_prot = __pgprot((pgprot_val(*vma_prot) &
					     (~_PAGE_CACHE_MASK)) |
					     cachemode2protval(pcm));
		}
		return 0;
	}

	ret = reserve_memtype(paddr, paddr + size, want_pcm, &pcm);
	if (ret)
		return ret;

	if (pcm != want_pcm) {
		if (strict_prot ||
		    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
			free_memtype(paddr, paddr + size);
			pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
			       current->comm, current->pid,
			       cattr_name(want_pcm),
			       (unsigned long long)paddr,
			       (unsigned long long)(paddr + size - 1),
			       cattr_name(pcm));
			return -EINVAL;
		}
		/*
		 * We allow returning a different type than the one requested
		 * in the non-strict case.
		 */
		*vma_prot = __pgprot((pgprot_val(*vma_prot) &
				      (~_PAGE_CACHE_MASK)) |
				      cachemode2protval(pcm));
	}

	if (kernel_map_sync_memtype(paddr, size, pcm) < 0) {
		free_memtype(paddr, paddr + size);
		return -EINVAL;
	}
	return 0;
}

/*
 * Internal interface to free a range of physical memory.
 * Frees non-RAM regions only.
 */
static void free_pfn_range(u64 paddr, unsigned long size)
{
	int is_ram;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
	if (is_ram == 0)
		free_memtype(paddr, paddr + size);
}

/*
 * track_pfn_copy is called when a vma covering a pfnmap gets copied through
 * copy_page_range().
 *
 * If the vma has a linear pfn mapping for the entire range, we get the prot
 * from the pte and reserve the entire vma range with a single
 * reserve_pfn_range() call.
 */
int track_pfn_copy(struct vm_area_struct *vma)
{
	resource_size_t paddr;
	unsigned long prot;
	unsigned long vma_size = vma->vm_end - vma->vm_start;
	pgprot_t pgprot;

	if (vma->vm_flags & VM_PAT) {
		/*
		 * reserve the whole chunk covered by vma. We need the
		 * starting address and protection from pte.
		 */
		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
			WARN_ON_ONCE(1);
			return -EINVAL;
		}
		pgprot = __pgprot(prot);
		return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
	}

	return 0;
}

/*
 * prot is passed in as a parameter for the new mapping. If the vma has
 * a linear pfn mapping for the entire range, or no vma is provided,
 * reserve the entire pfn + size range with a single reserve_pfn_range()
 * call.
 */
int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
		    unsigned long pfn, unsigned long addr, unsigned long size)
{
	resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
	enum page_cache_mode pcm;

	/* reserve the whole chunk starting from paddr */
	if (!vma || (addr == vma->vm_start
				&& size == (vma->vm_end - vma->vm_start))) {
		int ret;

		ret = reserve_pfn_range(paddr, size, prot, 0);
		if (ret == 0 && vma)
			vma->vm_flags |= VM_PAT;
		return ret;
	}

	if (!pat_enabled())
		return 0;

	/*
	 * For anything smaller than the vma size we set prot based on the
	 * lookup.
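	 * The memtype of the first page is used, and every remaining page
	 * in the range must carry that same memtype.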
	 */
	pcm = lookup_memtype(paddr);

	/* Check memtype for the remaining pages */
	while (size > PAGE_SIZE) {
		size -= PAGE_SIZE;
		paddr += PAGE_SIZE;
		if (pcm != lookup_memtype(paddr))
			return -EINVAL;
	}

	*prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
			 cachemode2protval(pcm));

	return 0;
}

void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
{
	enum page_cache_mode pcm;

	if (!pat_enabled())
		return;

	/* Set prot based on lookup */
	pcm = lookup_memtype(pfn_t_to_phys(pfn));
	*prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
			 cachemode2protval(pcm));
}

/*
 * untrack_pfn is called while unmapping a pfnmap for a region.
 * untrack can be called for a specific region indicated by pfn and size or
 * can be for the entire vma (in which case pfn and size are zero).
 */
void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
		 unsigned long size)
{
	resource_size_t paddr;
	unsigned long prot;

	if (vma && !(vma->vm_flags & VM_PAT))
		return;

	/* free the chunk starting from pfn or the whole chunk */
	paddr = (resource_size_t)pfn << PAGE_SHIFT;
	if (!paddr && !size) {
		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
			WARN_ON_ONCE(1);
			return;
		}

		size = vma->vm_end - vma->vm_start;
	}
	free_pfn_range(paddr, size);
	if (vma)
		vma->vm_flags &= ~VM_PAT;
}

/*
 * untrack_pfn_moved is called, while mremapping a pfnmap for a new region,
 * with the old vma after its pfnmap page table has been removed.  The new
 * vma has a new pfnmap to the same pfn & cache type with VM_PAT set.
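 * Hence only the VM_PAT flag is cleared on the old vma; the underlying
 * memtype reservation is left in place for the new mapping.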
 */
void untrack_pfn_moved(struct vm_area_struct *vma)
{
	vma->vm_flags &= ~VM_PAT;
}

pgprot_t pgprot_writecombine(pgprot_t prot)
{
	return __pgprot(pgprot_val(prot) |
			cachemode2protval(_PAGE_CACHE_MODE_WC));
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);

pgprot_t pgprot_writethrough(pgprot_t prot)
{
	return __pgprot(pgprot_val(prot) |
			cachemode2protval(_PAGE_CACHE_MODE_WT));
}
EXPORT_SYMBOL_GPL(pgprot_writethrough);

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)

static struct memtype *memtype_get_idx(loff_t pos)
{
	struct memtype *print_entry;
	int ret;

	print_entry = kzalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!print_entry)
		return NULL;

	spin_lock(&memtype_lock);
	ret = rbt_memtype_copy_nth_element(print_entry, pos);
	spin_unlock(&memtype_lock);

	if (!ret) {
		return print_entry;
	} else {
		kfree(print_entry);
		return NULL;
	}
}

static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos == 0) {
		++*pos;
		seq_puts(seq, "PAT memtype list:\n");
	}

	return memtype_get_idx(*pos);
}

static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return memtype_get_idx(*pos);
}

static void memtype_seq_stop(struct seq_file *seq, void *v)
{
}

static int memtype_seq_show(struct seq_file *seq, void *v)
{
	struct memtype *print_entry = (struct memtype *)v;

	seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
			print_entry->start, print_entry->end);
	kfree(print_entry);

	return 0;
}

static const struct seq_operations memtype_seq_ops = {
	.start = memtype_seq_start,
	.next  = memtype_seq_next,
	.stop  = memtype_seq_stop,
	.show  = memtype_seq_show,
};

static int memtype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &memtype_seq_ops);
}

static const struct file_operations memtype_fops = {
	.open    = memtype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};

static int __init pat_memtype_list_init(void)
{
	if (pat_enabled()) {
		debugfs_create_file("pat_memtype_list", S_IRUSR,
				    arch_debugfs_dir, NULL, &memtype_fops);
	}
	return 0;
}

late_initcall(pat_memtype_list_init);

#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
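/*
 * Example (not part of this file): a minimal sketch of how a driver might
 * pair the exported arch_io_reserve_memtype_wc()/arch_io_free_memtype_wc()
 * helpers above with ioremap_wc() to get a write-combining mapping of a
 * device aperture (e.g. a framebuffer BAR). The function name below is
 * illustrative only.
 */
static __maybe_unused void __iomem *
example_ioremap_wc_reserved(resource_size_t start, resource_size_t size)
{
	void __iomem *vaddr;

	/* Ask PAT to track the range as WC before mapping it ... */
	if (arch_io_reserve_memtype_wc(start, size))
		return NULL;

	/* ... then create the actual write-combining mapping. */
	vaddr = ioremap_wc(start, size);
	if (!vaddr)
		arch_io_free_memtype_wc(start, size);

	return vaddr;
}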