1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University 4 * Author: Christoffer Dall <c.dall@virtualopensystems.com> 5 */ 6 7 #include <linux/mman.h> 8 #include <linux/kvm_host.h> 9 #include <linux/io.h> 10 #include <linux/hugetlb.h> 11 #include <linux/sched/signal.h> 12 #include <trace/events/kvm.h> 13 #include <asm/pgalloc.h> 14 #include <asm/cacheflush.h> 15 #include <asm/kvm_arm.h> 16 #include <asm/kvm_mmu.h> 17 #include <asm/kvm_mmio.h> 18 #include <asm/kvm_ras.h> 19 #include <asm/kvm_asm.h> 20 #include <asm/kvm_emulate.h> 21 #include <asm/virt.h> 22 23 #include "trace.h" 24 25 static pgd_t *boot_hyp_pgd; 26 static pgd_t *hyp_pgd; 27 static pgd_t *merged_hyp_pgd; 28 static DEFINE_MUTEX(kvm_hyp_pgd_mutex); 29 30 static unsigned long hyp_idmap_start; 31 static unsigned long hyp_idmap_end; 32 static phys_addr_t hyp_idmap_vector; 33 34 static unsigned long io_map_base; 35 36 #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) 37 38 #define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) 39 #define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1) 40 41 static bool memslot_is_logging(struct kvm_memory_slot *memslot) 42 { 43 return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); 44 } 45 46 /** 47 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8 48 * @kvm: pointer to kvm structure. 49 * 50 * Interface to HYP function to flush all VM TLB entries 51 */ 52 void kvm_flush_remote_tlbs(struct kvm *kvm) 53 { 54 kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); 55 } 56 57 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) 58 { 59 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); 60 } 61 62 /* 63 * D-Cache management functions. They take the page table entries by 64 * value, as they are flushing the cache using the kernel mapping (or 65 * kmap on 32bit). 66 */ 67 static void kvm_flush_dcache_pte(pte_t pte) 68 { 69 __kvm_flush_dcache_pte(pte); 70 } 71 72 static void kvm_flush_dcache_pmd(pmd_t pmd) 73 { 74 __kvm_flush_dcache_pmd(pmd); 75 } 76 77 static void kvm_flush_dcache_pud(pud_t pud) 78 { 79 __kvm_flush_dcache_pud(pud); 80 } 81 82 static bool kvm_is_device_pfn(unsigned long pfn) 83 { 84 return !pfn_valid(pfn); 85 } 86 87 /** 88 * stage2_dissolve_pmd() - clear and flush huge PMD entry 89 * @kvm: pointer to kvm structure. 90 * @addr: IPA 91 * @pmd: pmd pointer for IPA 92 * 93 * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. 94 */ 95 static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) 96 { 97 if (!pmd_thp_or_huge(*pmd)) 98 return; 99 100 pmd_clear(pmd); 101 kvm_tlb_flush_vmid_ipa(kvm, addr); 102 put_page(virt_to_page(pmd)); 103 } 104 105 /** 106 * stage2_dissolve_pud() - clear and flush huge PUD entry 107 * @kvm: pointer to kvm structure. 108 * @addr: IPA 109 * @pud: pud pointer for IPA 110 * 111 * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. 
112 */ 113 static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp) 114 { 115 if (!stage2_pud_huge(kvm, *pudp)) 116 return; 117 118 stage2_pud_clear(kvm, pudp); 119 kvm_tlb_flush_vmid_ipa(kvm, addr); 120 put_page(virt_to_page(pudp)); 121 } 122 123 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 124 int min, int max) 125 { 126 void *page; 127 128 BUG_ON(max > KVM_NR_MEM_OBJS); 129 if (cache->nobjs >= min) 130 return 0; 131 while (cache->nobjs < max) { 132 page = (void *)__get_free_page(GFP_PGTABLE_USER); 133 if (!page) 134 return -ENOMEM; 135 cache->objects[cache->nobjs++] = page; 136 } 137 return 0; 138 } 139 140 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) 141 { 142 while (mc->nobjs) 143 free_page((unsigned long)mc->objects[--mc->nobjs]); 144 } 145 146 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) 147 { 148 void *p; 149 150 BUG_ON(!mc || !mc->nobjs); 151 p = mc->objects[--mc->nobjs]; 152 return p; 153 } 154 155 static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) 156 { 157 pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL); 158 stage2_pgd_clear(kvm, pgd); 159 kvm_tlb_flush_vmid_ipa(kvm, addr); 160 stage2_pud_free(kvm, pud_table); 161 put_page(virt_to_page(pgd)); 162 } 163 164 static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) 165 { 166 pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0); 167 VM_BUG_ON(stage2_pud_huge(kvm, *pud)); 168 stage2_pud_clear(kvm, pud); 169 kvm_tlb_flush_vmid_ipa(kvm, addr); 170 stage2_pmd_free(kvm, pmd_table); 171 put_page(virt_to_page(pud)); 172 } 173 174 static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) 175 { 176 pte_t *pte_table = pte_offset_kernel(pmd, 0); 177 VM_BUG_ON(pmd_thp_or_huge(*pmd)); 178 pmd_clear(pmd); 179 kvm_tlb_flush_vmid_ipa(kvm, addr); 180 free_page((unsigned long)pte_table); 181 put_page(virt_to_page(pmd)); 182 } 183 184 static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte) 185 { 186 WRITE_ONCE(*ptep, new_pte); 187 dsb(ishst); 188 } 189 190 static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd) 191 { 192 WRITE_ONCE(*pmdp, new_pmd); 193 dsb(ishst); 194 } 195 196 static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep) 197 { 198 kvm_set_pmd(pmdp, kvm_mk_pmd(ptep)); 199 } 200 201 static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp) 202 { 203 WRITE_ONCE(*pudp, kvm_mk_pud(pmdp)); 204 dsb(ishst); 205 } 206 207 static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp) 208 { 209 WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp)); 210 dsb(ishst); 211 } 212 213 /* 214 * Unmapping vs dcache management: 215 * 216 * If a guest maps certain memory pages as uncached, all writes will 217 * bypass the data cache and go directly to RAM. However, the CPUs 218 * can still speculate reads (not writes) and fill cache lines with 219 * data. 220 * 221 * Those cache lines will be *clean* cache lines though, so a 222 * clean+invalidate operation is equivalent to an invalidate 223 * operation, because no cache lines are marked dirty. 224 * 225 * Those clean cache lines could be filled prior to an uncached write 226 * by the guest, and the cache coherent IO subsystem would therefore 227 * end up writing old data to disk. 228 * 229 * This is why right after unmapping a page/section and invalidating 230 * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure 231 * the IO subsystem will never hit in the cache. 
232 * 233 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as 234 * we then fully enforce cacheability of RAM, no matter what the guest 235 * does. 236 */ 237 static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd, 238 phys_addr_t addr, phys_addr_t end) 239 { 240 phys_addr_t start_addr = addr; 241 pte_t *pte, *start_pte; 242 243 start_pte = pte = pte_offset_kernel(pmd, addr); 244 do { 245 if (!pte_none(*pte)) { 246 pte_t old_pte = *pte; 247 248 kvm_set_pte(pte, __pte(0)); 249 kvm_tlb_flush_vmid_ipa(kvm, addr); 250 251 /* No need to invalidate the cache for device mappings */ 252 if (!kvm_is_device_pfn(pte_pfn(old_pte))) 253 kvm_flush_dcache_pte(old_pte); 254 255 put_page(virt_to_page(pte)); 256 } 257 } while (pte++, addr += PAGE_SIZE, addr != end); 258 259 if (stage2_pte_table_empty(kvm, start_pte)) 260 clear_stage2_pmd_entry(kvm, pmd, start_addr); 261 } 262 263 static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, 264 phys_addr_t addr, phys_addr_t end) 265 { 266 phys_addr_t next, start_addr = addr; 267 pmd_t *pmd, *start_pmd; 268 269 start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr); 270 do { 271 next = stage2_pmd_addr_end(kvm, addr, end); 272 if (!pmd_none(*pmd)) { 273 if (pmd_thp_or_huge(*pmd)) { 274 pmd_t old_pmd = *pmd; 275 276 pmd_clear(pmd); 277 kvm_tlb_flush_vmid_ipa(kvm, addr); 278 279 kvm_flush_dcache_pmd(old_pmd); 280 281 put_page(virt_to_page(pmd)); 282 } else { 283 unmap_stage2_ptes(kvm, pmd, addr, next); 284 } 285 } 286 } while (pmd++, addr = next, addr != end); 287 288 if (stage2_pmd_table_empty(kvm, start_pmd)) 289 clear_stage2_pud_entry(kvm, pud, start_addr); 290 } 291 292 static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, 293 phys_addr_t addr, phys_addr_t end) 294 { 295 phys_addr_t next, start_addr = addr; 296 pud_t *pud, *start_pud; 297 298 start_pud = pud = stage2_pud_offset(kvm, pgd, addr); 299 do { 300 next = stage2_pud_addr_end(kvm, addr, end); 301 if (!stage2_pud_none(kvm, *pud)) { 302 if (stage2_pud_huge(kvm, *pud)) { 303 pud_t old_pud = *pud; 304 305 stage2_pud_clear(kvm, pud); 306 kvm_tlb_flush_vmid_ipa(kvm, addr); 307 kvm_flush_dcache_pud(old_pud); 308 put_page(virt_to_page(pud)); 309 } else { 310 unmap_stage2_pmds(kvm, pud, addr, next); 311 } 312 } 313 } while (pud++, addr = next, addr != end); 314 315 if (stage2_pud_table_empty(kvm, start_pud)) 316 clear_stage2_pgd_entry(kvm, pgd, start_addr); 317 } 318 319 /** 320 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range 321 * @kvm: The VM pointer 322 * @start: The intermediate physical base address of the range to unmap 323 * @size: The size of the area to unmap 324 * 325 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must 326 * be called while holding mmu_lock (unless for freeing the stage2 pgd before 327 * destroying the VM), otherwise another faulting VCPU may come in and mess 328 * with things behind our backs. 329 */ 330 static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) 331 { 332 pgd_t *pgd; 333 phys_addr_t addr = start, end = start + size; 334 phys_addr_t next; 335 336 assert_spin_locked(&kvm->mmu_lock); 337 WARN_ON(size & ~PAGE_MASK); 338 339 pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); 340 do { 341 /* 342 * Make sure the page table is still active, as another thread 343 * could have possibly freed the page table, while we released 344 * the lock. 
345 */ 346 if (!READ_ONCE(kvm->arch.pgd)) 347 break; 348 next = stage2_pgd_addr_end(kvm, addr, end); 349 if (!stage2_pgd_none(kvm, *pgd)) 350 unmap_stage2_puds(kvm, pgd, addr, next); 351 /* 352 * If the range is too large, release the kvm->mmu_lock 353 * to prevent starvation and lockup detector warnings. 354 */ 355 if (next != end) 356 cond_resched_lock(&kvm->mmu_lock); 357 } while (pgd++, addr = next, addr != end); 358 } 359 360 static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd, 361 phys_addr_t addr, phys_addr_t end) 362 { 363 pte_t *pte; 364 365 pte = pte_offset_kernel(pmd, addr); 366 do { 367 if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte))) 368 kvm_flush_dcache_pte(*pte); 369 } while (pte++, addr += PAGE_SIZE, addr != end); 370 } 371 372 static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, 373 phys_addr_t addr, phys_addr_t end) 374 { 375 pmd_t *pmd; 376 phys_addr_t next; 377 378 pmd = stage2_pmd_offset(kvm, pud, addr); 379 do { 380 next = stage2_pmd_addr_end(kvm, addr, end); 381 if (!pmd_none(*pmd)) { 382 if (pmd_thp_or_huge(*pmd)) 383 kvm_flush_dcache_pmd(*pmd); 384 else 385 stage2_flush_ptes(kvm, pmd, addr, next); 386 } 387 } while (pmd++, addr = next, addr != end); 388 } 389 390 static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, 391 phys_addr_t addr, phys_addr_t end) 392 { 393 pud_t *pud; 394 phys_addr_t next; 395 396 pud = stage2_pud_offset(kvm, pgd, addr); 397 do { 398 next = stage2_pud_addr_end(kvm, addr, end); 399 if (!stage2_pud_none(kvm, *pud)) { 400 if (stage2_pud_huge(kvm, *pud)) 401 kvm_flush_dcache_pud(*pud); 402 else 403 stage2_flush_pmds(kvm, pud, addr, next); 404 } 405 } while (pud++, addr = next, addr != end); 406 } 407 408 static void stage2_flush_memslot(struct kvm *kvm, 409 struct kvm_memory_slot *memslot) 410 { 411 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 412 phys_addr_t end = addr + PAGE_SIZE * memslot->npages; 413 phys_addr_t next; 414 pgd_t *pgd; 415 416 pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); 417 do { 418 next = stage2_pgd_addr_end(kvm, addr, end); 419 if (!stage2_pgd_none(kvm, *pgd)) 420 stage2_flush_puds(kvm, pgd, addr, next); 421 } while (pgd++, addr = next, addr != end); 422 } 423 424 /** 425 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2 426 * @kvm: The struct kvm pointer 427 * 428 * Go through the stage 2 page tables and invalidate any cache lines 429 * backing memory already mapped to the VM. 
430 */ 431 static void stage2_flush_vm(struct kvm *kvm) 432 { 433 struct kvm_memslots *slots; 434 struct kvm_memory_slot *memslot; 435 int idx; 436 437 idx = srcu_read_lock(&kvm->srcu); 438 spin_lock(&kvm->mmu_lock); 439 440 slots = kvm_memslots(kvm); 441 kvm_for_each_memslot(memslot, slots) 442 stage2_flush_memslot(kvm, memslot); 443 444 spin_unlock(&kvm->mmu_lock); 445 srcu_read_unlock(&kvm->srcu, idx); 446 } 447 448 static void clear_hyp_pgd_entry(pgd_t *pgd) 449 { 450 pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL); 451 pgd_clear(pgd); 452 pud_free(NULL, pud_table); 453 put_page(virt_to_page(pgd)); 454 } 455 456 static void clear_hyp_pud_entry(pud_t *pud) 457 { 458 pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0); 459 VM_BUG_ON(pud_huge(*pud)); 460 pud_clear(pud); 461 pmd_free(NULL, pmd_table); 462 put_page(virt_to_page(pud)); 463 } 464 465 static void clear_hyp_pmd_entry(pmd_t *pmd) 466 { 467 pte_t *pte_table = pte_offset_kernel(pmd, 0); 468 VM_BUG_ON(pmd_thp_or_huge(*pmd)); 469 pmd_clear(pmd); 470 pte_free_kernel(NULL, pte_table); 471 put_page(virt_to_page(pmd)); 472 } 473 474 static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) 475 { 476 pte_t *pte, *start_pte; 477 478 start_pte = pte = pte_offset_kernel(pmd, addr); 479 do { 480 if (!pte_none(*pte)) { 481 kvm_set_pte(pte, __pte(0)); 482 put_page(virt_to_page(pte)); 483 } 484 } while (pte++, addr += PAGE_SIZE, addr != end); 485 486 if (hyp_pte_table_empty(start_pte)) 487 clear_hyp_pmd_entry(pmd); 488 } 489 490 static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) 491 { 492 phys_addr_t next; 493 pmd_t *pmd, *start_pmd; 494 495 start_pmd = pmd = pmd_offset(pud, addr); 496 do { 497 next = pmd_addr_end(addr, end); 498 /* Hyp doesn't use huge pmds */ 499 if (!pmd_none(*pmd)) 500 unmap_hyp_ptes(pmd, addr, next); 501 } while (pmd++, addr = next, addr != end); 502 503 if (hyp_pmd_table_empty(start_pmd)) 504 clear_hyp_pud_entry(pud); 505 } 506 507 static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) 508 { 509 phys_addr_t next; 510 pud_t *pud, *start_pud; 511 512 start_pud = pud = pud_offset(pgd, addr); 513 do { 514 next = pud_addr_end(addr, end); 515 /* Hyp doesn't use huge puds */ 516 if (!pud_none(*pud)) 517 unmap_hyp_pmds(pud, addr, next); 518 } while (pud++, addr = next, addr != end); 519 520 if (hyp_pud_table_empty(start_pud)) 521 clear_hyp_pgd_entry(pgd); 522 } 523 524 static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd) 525 { 526 return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1); 527 } 528 529 static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd, 530 phys_addr_t start, u64 size) 531 { 532 pgd_t *pgd; 533 phys_addr_t addr = start, end = start + size; 534 phys_addr_t next; 535 536 /* 537 * We don't unmap anything from HYP, except at the hyp tear down. 538 * Hence, we don't have to invalidate the TLBs here. 
539 */ 540 pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); 541 do { 542 next = pgd_addr_end(addr, end); 543 if (!pgd_none(*pgd)) 544 unmap_hyp_puds(pgd, addr, next); 545 } while (pgd++, addr = next, addr != end); 546 } 547 548 static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) 549 { 550 __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size); 551 } 552 553 static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size) 554 { 555 __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size); 556 } 557 558 /** 559 * free_hyp_pgds - free Hyp-mode page tables 560 * 561 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and 562 * therefore contains either mappings in the kernel memory area (above 563 * PAGE_OFFSET), or device mappings in the idmap range. 564 * 565 * boot_hyp_pgd should only map the idmap range, and is only used in 566 * the extended idmap case. 567 */ 568 void free_hyp_pgds(void) 569 { 570 pgd_t *id_pgd; 571 572 mutex_lock(&kvm_hyp_pgd_mutex); 573 574 id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd; 575 576 if (id_pgd) { 577 /* In case we never called hyp_mmu_init() */ 578 if (!io_map_base) 579 io_map_base = hyp_idmap_start; 580 unmap_hyp_idmap_range(id_pgd, io_map_base, 581 hyp_idmap_start + PAGE_SIZE - io_map_base); 582 } 583 584 if (boot_hyp_pgd) { 585 free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); 586 boot_hyp_pgd = NULL; 587 } 588 589 if (hyp_pgd) { 590 unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET), 591 (uintptr_t)high_memory - PAGE_OFFSET); 592 593 free_pages((unsigned long)hyp_pgd, hyp_pgd_order); 594 hyp_pgd = NULL; 595 } 596 if (merged_hyp_pgd) { 597 clear_page(merged_hyp_pgd); 598 free_page((unsigned long)merged_hyp_pgd); 599 merged_hyp_pgd = NULL; 600 } 601 602 mutex_unlock(&kvm_hyp_pgd_mutex); 603 } 604 605 static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, 606 unsigned long end, unsigned long pfn, 607 pgprot_t prot) 608 { 609 pte_t *pte; 610 unsigned long addr; 611 612 addr = start; 613 do { 614 pte = pte_offset_kernel(pmd, addr); 615 kvm_set_pte(pte, kvm_pfn_pte(pfn, prot)); 616 get_page(virt_to_page(pte)); 617 pfn++; 618 } while (addr += PAGE_SIZE, addr != end); 619 } 620 621 static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, 622 unsigned long end, unsigned long pfn, 623 pgprot_t prot) 624 { 625 pmd_t *pmd; 626 pte_t *pte; 627 unsigned long addr, next; 628 629 addr = start; 630 do { 631 pmd = pmd_offset(pud, addr); 632 633 BUG_ON(pmd_sect(*pmd)); 634 635 if (pmd_none(*pmd)) { 636 pte = pte_alloc_one_kernel(NULL); 637 if (!pte) { 638 kvm_err("Cannot allocate Hyp pte\n"); 639 return -ENOMEM; 640 } 641 kvm_pmd_populate(pmd, pte); 642 get_page(virt_to_page(pmd)); 643 } 644 645 next = pmd_addr_end(addr, end); 646 647 create_hyp_pte_mappings(pmd, addr, next, pfn, prot); 648 pfn += (next - addr) >> PAGE_SHIFT; 649 } while (addr = next, addr != end); 650 651 return 0; 652 } 653 654 static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, 655 unsigned long end, unsigned long pfn, 656 pgprot_t prot) 657 { 658 pud_t *pud; 659 pmd_t *pmd; 660 unsigned long addr, next; 661 int ret; 662 663 addr = start; 664 do { 665 pud = pud_offset(pgd, addr); 666 667 if (pud_none_or_clear_bad(pud)) { 668 pmd = pmd_alloc_one(NULL, addr); 669 if (!pmd) { 670 kvm_err("Cannot allocate Hyp pmd\n"); 671 return -ENOMEM; 672 } 673 kvm_pud_populate(pud, pmd); 674 get_page(virt_to_page(pud)); 675 } 676 677 next = pud_addr_end(addr, end); 678 ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot); 679 if 
(ret)
			return ret;
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);

	return 0;
}

static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
				 unsigned long start, unsigned long end,
				 unsigned long pfn, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	unsigned long addr, next;
	int err = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);
	addr = start & PAGE_MASK;
	end = PAGE_ALIGN(end);
	do {
		pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);

		if (pgd_none(*pgd)) {
			pud = pud_alloc_one(NULL, addr);
			if (!pud) {
				kvm_err("Cannot allocate Hyp pud\n");
				err = -ENOMEM;
				goto out;
			}
			kvm_pgd_populate(pgd, pud);
			get_page(virt_to_page(pgd));
		}

		next = pgd_addr_end(addr, end);
		err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
		if (err)
			goto out;
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);
out:
	mutex_unlock(&kvm_hyp_pgd_mutex);
	return err;
}

static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}

/**
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 * @from:	The virtual kernel start address of the range
 * @to:		The virtual kernel end address of the range (exclusive)
 * @prot:	The protection to be applied to this range
 *
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
 */
int create_hyp_mappings(void *from, void *to, pgprot_t prot)
{
	phys_addr_t phys_addr;
	unsigned long virt_addr;
	unsigned long start = kern_hyp_va((unsigned long)from);
	unsigned long end = kern_hyp_va((unsigned long)to);

	if (is_kernel_in_hyp_mode())
		return 0;

	start = start & PAGE_MASK;
	end = PAGE_ALIGN(end);

	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
		int err;

		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
		err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
					    virt_addr, virt_addr + PAGE_SIZE,
					    __phys_to_pfn(phys_addr),
					    prot);
		if (err)
			return err;
	}

	return 0;
}
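/*
 * Illustrative sketch, not part of the original file: a typical caller
 * maps a kernel object into HYP before EL2 code dereferences it, in the
 * same way the arm/arm64 KVM init code maps its text and data. The
 * function name below is made up for illustration; PAGE_HYP is assumed
 * to be the usual read/write HYP protection.
 */
static int __maybe_unused example_map_obj_into_hyp(void *obj, size_t len)
{
	/*
	 * create_hyp_mappings() works on whole pages of the kernel
	 * linear/vmalloc map, so passing [obj, obj + len) is enough;
	 * the helper page-aligns the range itself.
	 */
	return create_hyp_mappings(obj, obj + len, PAGE_HYP);
}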
static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
					unsigned long *haddr, pgprot_t prot)
{
	pgd_t *pgd = hyp_pgd;
	unsigned long base;
	int ret = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);

	/*
	 * This assumes that we have enough space below the idmap
	 * page to allocate our VAs. If not, the check below will
	 * kick. A potential alternative would be to detect that
	 * overflow and switch to an allocation above the idmap.
	 *
	 * The allocated size is always a multiple of PAGE_SIZE.
	 */
	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
	base = io_map_base - size;

	/*
	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
	 * allocating the new area, as it would indicate we've
	 * overflowed the idmap/IO address range.
	 */
	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
		ret = -ENOMEM;
	else
		io_map_base = base;

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (ret)
		goto out;

	if (__kvm_cpu_uses_extended_idmap())
		pgd = boot_hyp_pgd;

	ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
				    base, base + size,
				    __phys_to_pfn(phys_addr), prot);
	if (ret)
		goto out;

	*haddr = base + offset_in_page(phys_addr);

out:
	return ret;
}

/**
 * create_hyp_io_mappings - Map IO into both kernel and HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @kaddr:	Kernel VA for this mapping
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
			   void __iomem **kaddr,
			   void __iomem **haddr)
{
	unsigned long addr;
	int ret;

	*kaddr = ioremap(phys_addr, size);
	if (!*kaddr)
		return -ENOMEM;

	if (is_kernel_in_hyp_mode()) {
		*haddr = *kaddr;
		return 0;
	}

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_DEVICE);
	if (ret) {
		iounmap(*kaddr);
		*kaddr = NULL;
		*haddr = NULL;
		return ret;
	}

	*haddr = (void __iomem *)addr;
	return 0;
}

/**
 * create_hyp_exec_mappings - Map an executable range into HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
			     void **haddr)
{
	unsigned long addr;
	int ret;

	BUG_ON(is_kernel_in_hyp_mode());

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_EXEC);
	if (ret) {
		*haddr = NULL;
		return ret;
	}

	*haddr = (void *)addr;
	return 0;
}

/**
 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
 * @kvm:	The KVM struct pointer for the VM.
 *
 * Allocates only the stage-2 HW PGD level table(s) of size defined by
 * stage2_pgd_size(kvm).
 *
 * Note we don't need locking here as this is only called when the VM is
 * created, which can only be done once.
 */
int kvm_alloc_stage2_pgd(struct kvm *kvm)
{
	phys_addr_t pgd_phys;
	pgd_t *pgd;

	if (kvm->arch.pgd != NULL) {
		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	/* Allocate the HW PGD, making sure that each page gets its own refcount */
	pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
	if (!pgd)
		return -ENOMEM;

	pgd_phys = virt_to_phys(pgd);
	if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
		return -EINVAL;

	kvm->arch.pgd = pgd;
	kvm->arch.pgd_phys = pgd_phys;
	return 0;
}
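/*
 * Illustrative sketch, not part of the original file: kvm_alloc_stage2_pgd()
 * and kvm_free_stage2_pgd() (defined further down and declared in
 * asm/kvm_mmu.h) are expected to pair up around the VM lifetime, roughly
 * as the arch init/teardown code does. The function name is made up and
 * error handling is abbreviated.
 */
static int __maybe_unused example_vm_stage2_lifetime(struct kvm *kvm)
{
	int ret;

	ret = kvm_alloc_stage2_pgd(kvm);	/* at VM creation time */
	if (ret)
		return ret;

	/* ... run the guest, faulting in stage-2 mappings on demand ... */

	kvm_free_stage2_pgd(kvm);		/* at VM teardown time */
	return 0;
}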
static void stage2_unmap_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	hva_t hva = memslot->userspace_addr;
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = PAGE_SIZE * memslot->npages;
	hva_t reg_end = hva + size;

	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we should
	 * unmap any of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 * |               memory region                   |
	 * +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma = find_vma(current->mm, hva);
		hva_t vm_start, vm_end;

		if (!vma || vma->vm_start >= reg_end)
			break;

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (!(vma->vm_flags & VM_PFNMAP)) {
			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
			unmap_stage2_range(kvm, gpa, vm_end - vm_start);
		}
		hva = vm_end;
	} while (hva < reg_end);
}

/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
 * Go through the memregions and unmap any regular RAM
 * backing memory already mapped to the VM.
 */
void stage2_unmap_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	down_read(&current->mm->mmap_sem);
	spin_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots)
		stage2_unmap_memslot(kvm, memslot);

	spin_unlock(&kvm->mmu_lock);
	up_read(&current->mm->mmap_sem);
	srcu_read_unlock(&kvm->srcu, idx);
}
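/*
 * Illustrative sketch, not part of the original file: the hva -> gpa
 * translation used by stage2_unmap_memslot() above, pulled out for
 * clarity. A host VA inside the memslot maps to the IPA at the same
 * offset from the start of the slot. The helper name is made up.
 */
static gpa_t __maybe_unused example_memslot_hva_to_gpa(struct kvm_memory_slot *memslot,
						       hva_t hva)
{
	gpa_t base = memslot->base_gfn << PAGE_SHIFT;

	/* Same offset into the guest physical range as into the hva range */
	return base + (hva - memslot->userspace_addr);
}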
/**
 * kvm_free_stage2_pgd - free all stage-2 tables
 * @kvm:	The KVM struct pointer for the VM.
 *
 * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
 * underlying level-2 and level-3 tables before freeing the actual level-1 table
 * and setting the struct pointer to NULL.
 */
void kvm_free_stage2_pgd(struct kvm *kvm)
{
	void *pgd = NULL;

	spin_lock(&kvm->mmu_lock);
	if (kvm->arch.pgd) {
		unmap_stage2_range(kvm, 0, kvm_phys_size(kvm));
		pgd = READ_ONCE(kvm->arch.pgd);
		kvm->arch.pgd = NULL;
		kvm->arch.pgd_phys = 0;
	}
	spin_unlock(&kvm->mmu_lock);

	/* Free the HW pgd, one page at a time */
	if (pgd)
		free_pages_exact(pgd, stage2_pgd_size(kvm));
}

static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			     phys_addr_t addr)
{
	pgd_t *pgd;
	pud_t *pud;

	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
	if (stage2_pgd_none(kvm, *pgd)) {
		if (!cache)
			return NULL;
		pud = mmu_memory_cache_alloc(cache);
		stage2_pgd_populate(kvm, pgd, pud);
		get_page(virt_to_page(pgd));
	}

	return stage2_pud_offset(kvm, pgd, addr);
}

static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			     phys_addr_t addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = stage2_get_pud(kvm, cache, addr);
	if (!pud || stage2_pud_huge(kvm, *pud))
		return NULL;

	if (stage2_pud_none(kvm, *pud)) {
		if (!cache)
			return NULL;
		pmd = mmu_memory_cache_alloc(cache);
		stage2_pud_populate(kvm, pud, pmd);
		get_page(virt_to_page(pud));
	}

	return stage2_pmd_offset(kvm, pud, addr);
}

static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
{
	pmd_t *pmd, old_pmd;

retry:
	pmd = stage2_get_pmd(kvm, cache, addr);
	VM_BUG_ON(!pmd);

	old_pmd = *pmd;
	/*
	 * Multiple vcpus faulting on the same PMD entry can lead to
	 * them sequentially updating the PMD with the same value.
	 * Following the break-before-make (pmd_clear() followed by
	 * tlb_flush()) process can hinder forward progress due to
	 * refaults generated on missing translations.
	 *
	 * Skip updating the page table if the entry is unchanged.
	 */
	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
		return 0;

	if (pmd_present(old_pmd)) {
		/*
		 * If we already have PTE level mapping for this block,
		 * we must unmap it to avoid inconsistent TLB state and
		 * leaking the table page. We could end up in this situation
		 * if the memory slot was marked for dirty logging and was
		 * reverted, leaving PTE level mappings for the pages accessed
		 * during the period. So, unmap the PTE level mapping for this
		 * block and retry, as we could have released the upper level
		 * table in the process.
		 *
		 * Normal THP split/merge follows mmu_notifier callbacks and
		 * is handled accordingly.
		 */
		if (!pmd_thp_or_huge(old_pmd)) {
			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
			goto retry;
		}
		/*
		 * Mapping in huge pages should only happen through a
		 * fault. If a page is merged into a transparent huge
		 * page, the individual subpages of that huge page
		 * should be unmapped through MMU notifiers before we
		 * get here.
		 *
		 * Merging of CompoundPages is not supported; they
		 * should first be split, unmapped, merged, and
		 * mapped back in on demand.
1102 */ 1103 WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); 1104 pmd_clear(pmd); 1105 kvm_tlb_flush_vmid_ipa(kvm, addr); 1106 } else { 1107 get_page(virt_to_page(pmd)); 1108 } 1109 1110 kvm_set_pmd(pmd, *new_pmd); 1111 return 0; 1112 } 1113 1114 static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, 1115 phys_addr_t addr, const pud_t *new_pudp) 1116 { 1117 pud_t *pudp, old_pud; 1118 1119 retry: 1120 pudp = stage2_get_pud(kvm, cache, addr); 1121 VM_BUG_ON(!pudp); 1122 1123 old_pud = *pudp; 1124 1125 /* 1126 * A large number of vcpus faulting on the same stage 2 entry, 1127 * can lead to a refault due to the stage2_pud_clear()/tlb_flush(). 1128 * Skip updating the page tables if there is no change. 1129 */ 1130 if (pud_val(old_pud) == pud_val(*new_pudp)) 1131 return 0; 1132 1133 if (stage2_pud_present(kvm, old_pud)) { 1134 /* 1135 * If we already have table level mapping for this block, unmap 1136 * the range for this block and retry. 1137 */ 1138 if (!stage2_pud_huge(kvm, old_pud)) { 1139 unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE); 1140 goto retry; 1141 } 1142 1143 WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp)); 1144 stage2_pud_clear(kvm, pudp); 1145 kvm_tlb_flush_vmid_ipa(kvm, addr); 1146 } else { 1147 get_page(virt_to_page(pudp)); 1148 } 1149 1150 kvm_set_pud(pudp, *new_pudp); 1151 return 0; 1152 } 1153 1154 /* 1155 * stage2_get_leaf_entry - walk the stage2 VM page tables and return 1156 * true if a valid and present leaf-entry is found. A pointer to the 1157 * leaf-entry is returned in the appropriate level variable - pudpp, 1158 * pmdpp, ptepp. 1159 */ 1160 static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr, 1161 pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp) 1162 { 1163 pud_t *pudp; 1164 pmd_t *pmdp; 1165 pte_t *ptep; 1166 1167 *pudpp = NULL; 1168 *pmdpp = NULL; 1169 *ptepp = NULL; 1170 1171 pudp = stage2_get_pud(kvm, NULL, addr); 1172 if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp)) 1173 return false; 1174 1175 if (stage2_pud_huge(kvm, *pudp)) { 1176 *pudpp = pudp; 1177 return true; 1178 } 1179 1180 pmdp = stage2_pmd_offset(kvm, pudp, addr); 1181 if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) 1182 return false; 1183 1184 if (pmd_thp_or_huge(*pmdp)) { 1185 *pmdpp = pmdp; 1186 return true; 1187 } 1188 1189 ptep = pte_offset_kernel(pmdp, addr); 1190 if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) 1191 return false; 1192 1193 *ptepp = ptep; 1194 return true; 1195 } 1196 1197 static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) 1198 { 1199 pud_t *pudp; 1200 pmd_t *pmdp; 1201 pte_t *ptep; 1202 bool found; 1203 1204 found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep); 1205 if (!found) 1206 return false; 1207 1208 if (pudp) 1209 return kvm_s2pud_exec(pudp); 1210 else if (pmdp) 1211 return kvm_s2pmd_exec(pmdp); 1212 else 1213 return kvm_s2pte_exec(ptep); 1214 } 1215 1216 static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, 1217 phys_addr_t addr, const pte_t *new_pte, 1218 unsigned long flags) 1219 { 1220 pud_t *pud; 1221 pmd_t *pmd; 1222 pte_t *pte, old_pte; 1223 bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; 1224 bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE; 1225 1226 VM_BUG_ON(logging_active && !cache); 1227 1228 /* Create stage-2 page table mapping - Levels 0 and 1 */ 1229 pud = stage2_get_pud(kvm, cache, addr); 1230 if (!pud) { 1231 /* 1232 * Ignore calls from kvm_set_spte_hva for unallocated 1233 * address ranges. 
1234 */ 1235 return 0; 1236 } 1237 1238 /* 1239 * While dirty page logging - dissolve huge PUD, then continue 1240 * on to allocate page. 1241 */ 1242 if (logging_active) 1243 stage2_dissolve_pud(kvm, addr, pud); 1244 1245 if (stage2_pud_none(kvm, *pud)) { 1246 if (!cache) 1247 return 0; /* ignore calls from kvm_set_spte_hva */ 1248 pmd = mmu_memory_cache_alloc(cache); 1249 stage2_pud_populate(kvm, pud, pmd); 1250 get_page(virt_to_page(pud)); 1251 } 1252 1253 pmd = stage2_pmd_offset(kvm, pud, addr); 1254 if (!pmd) { 1255 /* 1256 * Ignore calls from kvm_set_spte_hva for unallocated 1257 * address ranges. 1258 */ 1259 return 0; 1260 } 1261 1262 /* 1263 * While dirty page logging - dissolve huge PMD, then continue on to 1264 * allocate page. 1265 */ 1266 if (logging_active) 1267 stage2_dissolve_pmd(kvm, addr, pmd); 1268 1269 /* Create stage-2 page mappings - Level 2 */ 1270 if (pmd_none(*pmd)) { 1271 if (!cache) 1272 return 0; /* ignore calls from kvm_set_spte_hva */ 1273 pte = mmu_memory_cache_alloc(cache); 1274 kvm_pmd_populate(pmd, pte); 1275 get_page(virt_to_page(pmd)); 1276 } 1277 1278 pte = pte_offset_kernel(pmd, addr); 1279 1280 if (iomap && pte_present(*pte)) 1281 return -EFAULT; 1282 1283 /* Create 2nd stage page table mapping - Level 3 */ 1284 old_pte = *pte; 1285 if (pte_present(old_pte)) { 1286 /* Skip page table update if there is no change */ 1287 if (pte_val(old_pte) == pte_val(*new_pte)) 1288 return 0; 1289 1290 kvm_set_pte(pte, __pte(0)); 1291 kvm_tlb_flush_vmid_ipa(kvm, addr); 1292 } else { 1293 get_page(virt_to_page(pte)); 1294 } 1295 1296 kvm_set_pte(pte, *new_pte); 1297 return 0; 1298 } 1299 1300 #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG 1301 static int stage2_ptep_test_and_clear_young(pte_t *pte) 1302 { 1303 if (pte_young(*pte)) { 1304 *pte = pte_mkold(*pte); 1305 return 1; 1306 } 1307 return 0; 1308 } 1309 #else 1310 static int stage2_ptep_test_and_clear_young(pte_t *pte) 1311 { 1312 return __ptep_test_and_clear_young(pte); 1313 } 1314 #endif 1315 1316 static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) 1317 { 1318 return stage2_ptep_test_and_clear_young((pte_t *)pmd); 1319 } 1320 1321 static int stage2_pudp_test_and_clear_young(pud_t *pud) 1322 { 1323 return stage2_ptep_test_and_clear_young((pte_t *)pud); 1324 } 1325 1326 /** 1327 * kvm_phys_addr_ioremap - map a device range to guest IPA 1328 * 1329 * @kvm: The KVM pointer 1330 * @guest_ipa: The IPA at which to insert the mapping 1331 * @pa: The physical address of the device 1332 * @size: The size of the mapping 1333 */ 1334 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, 1335 phys_addr_t pa, unsigned long size, bool writable) 1336 { 1337 phys_addr_t addr, end; 1338 int ret = 0; 1339 unsigned long pfn; 1340 struct kvm_mmu_memory_cache cache = { 0, }; 1341 1342 end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK; 1343 pfn = __phys_to_pfn(pa); 1344 1345 for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { 1346 pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE); 1347 1348 if (writable) 1349 pte = kvm_s2pte_mkwrite(pte); 1350 1351 ret = mmu_topup_memory_cache(&cache, 1352 kvm_mmu_cache_min_pages(kvm), 1353 KVM_NR_MEM_OBJS); 1354 if (ret) 1355 goto out; 1356 spin_lock(&kvm->mmu_lock); 1357 ret = stage2_set_pte(kvm, &cache, addr, &pte, 1358 KVM_S2PTE_FLAG_IS_IOMAP); 1359 spin_unlock(&kvm->mmu_lock); 1360 if (ret) 1361 goto out; 1362 1363 pfn++; 1364 } 1365 1366 out: 1367 mmu_free_memory_cache(&cache); 1368 return ret; 1369 } 1370 1371 static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, 
phys_addr_t *ipap) 1372 { 1373 kvm_pfn_t pfn = *pfnp; 1374 gfn_t gfn = *ipap >> PAGE_SHIFT; 1375 struct page *page = pfn_to_page(pfn); 1376 1377 /* 1378 * PageTransCompoundMap() returns true for THP and 1379 * hugetlbfs. Make sure the adjustment is done only for THP 1380 * pages. 1381 */ 1382 if (!PageHuge(page) && PageTransCompoundMap(page)) { 1383 unsigned long mask; 1384 /* 1385 * The address we faulted on is backed by a transparent huge 1386 * page. However, because we map the compound huge page and 1387 * not the individual tail page, we need to transfer the 1388 * refcount to the head page. We have to be careful that the 1389 * THP doesn't start to split while we are adjusting the 1390 * refcounts. 1391 * 1392 * We are sure this doesn't happen, because mmu_notifier_retry 1393 * was successful and we are holding the mmu_lock, so if this 1394 * THP is trying to split, it will be blocked in the mmu 1395 * notifier before touching any of the pages, specifically 1396 * before being able to call __split_huge_page_refcount(). 1397 * 1398 * We can therefore safely transfer the refcount from PG_tail 1399 * to PG_head and switch the pfn from a tail page to the head 1400 * page accordingly. 1401 */ 1402 mask = PTRS_PER_PMD - 1; 1403 VM_BUG_ON((gfn & mask) != (pfn & mask)); 1404 if (pfn & mask) { 1405 *ipap &= PMD_MASK; 1406 kvm_release_pfn_clean(pfn); 1407 pfn &= ~mask; 1408 kvm_get_pfn(pfn); 1409 *pfnp = pfn; 1410 } 1411 1412 return true; 1413 } 1414 1415 return false; 1416 } 1417 1418 /** 1419 * stage2_wp_ptes - write protect PMD range 1420 * @pmd: pointer to pmd entry 1421 * @addr: range start address 1422 * @end: range end address 1423 */ 1424 static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) 1425 { 1426 pte_t *pte; 1427 1428 pte = pte_offset_kernel(pmd, addr); 1429 do { 1430 if (!pte_none(*pte)) { 1431 if (!kvm_s2pte_readonly(pte)) 1432 kvm_set_s2pte_readonly(pte); 1433 } 1434 } while (pte++, addr += PAGE_SIZE, addr != end); 1435 } 1436 1437 /** 1438 * stage2_wp_pmds - write protect PUD range 1439 * kvm: kvm instance for the VM 1440 * @pud: pointer to pud entry 1441 * @addr: range start address 1442 * @end: range end address 1443 */ 1444 static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud, 1445 phys_addr_t addr, phys_addr_t end) 1446 { 1447 pmd_t *pmd; 1448 phys_addr_t next; 1449 1450 pmd = stage2_pmd_offset(kvm, pud, addr); 1451 1452 do { 1453 next = stage2_pmd_addr_end(kvm, addr, end); 1454 if (!pmd_none(*pmd)) { 1455 if (pmd_thp_or_huge(*pmd)) { 1456 if (!kvm_s2pmd_readonly(pmd)) 1457 kvm_set_s2pmd_readonly(pmd); 1458 } else { 1459 stage2_wp_ptes(pmd, addr, next); 1460 } 1461 } 1462 } while (pmd++, addr = next, addr != end); 1463 } 1464 1465 /** 1466 * stage2_wp_puds - write protect PGD range 1467 * @pgd: pointer to pgd entry 1468 * @addr: range start address 1469 * @end: range end address 1470 */ 1471 static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, 1472 phys_addr_t addr, phys_addr_t end) 1473 { 1474 pud_t *pud; 1475 phys_addr_t next; 1476 1477 pud = stage2_pud_offset(kvm, pgd, addr); 1478 do { 1479 next = stage2_pud_addr_end(kvm, addr, end); 1480 if (!stage2_pud_none(kvm, *pud)) { 1481 if (stage2_pud_huge(kvm, *pud)) { 1482 if (!kvm_s2pud_readonly(pud)) 1483 kvm_set_s2pud_readonly(pud); 1484 } else { 1485 stage2_wp_pmds(kvm, pud, addr, next); 1486 } 1487 } 1488 } while (pud++, addr = next, addr != end); 1489 } 1490 1491 /** 1492 * stage2_wp_range() - write protect stage2 memory region range 1493 * @kvm: The KVM pointer 1494 * @addr: Start address of 
range 1495 * @end: End address of range 1496 */ 1497 static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) 1498 { 1499 pgd_t *pgd; 1500 phys_addr_t next; 1501 1502 pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); 1503 do { 1504 /* 1505 * Release kvm_mmu_lock periodically if the memory region is 1506 * large. Otherwise, we may see kernel panics with 1507 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR, 1508 * CONFIG_LOCKDEP. Additionally, holding the lock too long 1509 * will also starve other vCPUs. We have to also make sure 1510 * that the page tables are not freed while we released 1511 * the lock. 1512 */ 1513 cond_resched_lock(&kvm->mmu_lock); 1514 if (!READ_ONCE(kvm->arch.pgd)) 1515 break; 1516 next = stage2_pgd_addr_end(kvm, addr, end); 1517 if (stage2_pgd_present(kvm, *pgd)) 1518 stage2_wp_puds(kvm, pgd, addr, next); 1519 } while (pgd++, addr = next, addr != end); 1520 } 1521 1522 /** 1523 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot 1524 * @kvm: The KVM pointer 1525 * @slot: The memory slot to write protect 1526 * 1527 * Called to start logging dirty pages after memory region 1528 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns 1529 * all present PUD, PMD and PTEs are write protected in the memory region. 1530 * Afterwards read of dirty page log can be called. 1531 * 1532 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, 1533 * serializing operations for VM memory regions. 1534 */ 1535 void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) 1536 { 1537 struct kvm_memslots *slots = kvm_memslots(kvm); 1538 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); 1539 phys_addr_t start = memslot->base_gfn << PAGE_SHIFT; 1540 phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1541 1542 spin_lock(&kvm->mmu_lock); 1543 stage2_wp_range(kvm, start, end); 1544 spin_unlock(&kvm->mmu_lock); 1545 kvm_flush_remote_tlbs(kvm); 1546 } 1547 1548 /** 1549 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages 1550 * @kvm: The KVM pointer 1551 * @slot: The memory slot associated with mask 1552 * @gfn_offset: The gfn offset in memory slot 1553 * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory 1554 * slot to be write protected 1555 * 1556 * Walks bits set in mask write protects the associated pte's. Caller must 1557 * acquire kvm_mmu_lock. 1558 */ 1559 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, 1560 struct kvm_memory_slot *slot, 1561 gfn_t gfn_offset, unsigned long mask) 1562 { 1563 phys_addr_t base_gfn = slot->base_gfn + gfn_offset; 1564 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; 1565 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; 1566 1567 stage2_wp_range(kvm, start, end); 1568 } 1569 1570 /* 1571 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected 1572 * dirty pages. 1573 * 1574 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to 1575 * enable dirty logging for them. 
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		gfn_t gfn_offset, unsigned long mask)
{
	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}

static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
{
	__clean_dcache_guest_page(pfn, size);
}

static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
{
	__invalidate_icache_guest_page(pfn, size);
}

static void kvm_send_hwpoison_signal(unsigned long address,
				     struct vm_area_struct *vma)
{
	short lsb;

	if (is_vm_hugetlb_page(vma))
		lsb = huge_page_shift(hstate_vma(vma));
	else
		lsb = PAGE_SHIFT;

	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
}

static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
					       unsigned long hva,
					       unsigned long map_size)
{
	gpa_t gpa_start;
	hva_t uaddr_start, uaddr_end;
	size_t size;

	size = memslot->npages * PAGE_SIZE;

	gpa_start = memslot->base_gfn << PAGE_SHIFT;

	uaddr_start = memslot->userspace_addr;
	uaddr_end = uaddr_start + size;

	/*
	 * Pages belonging to memslots that don't have the same alignment
	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
	 *
	 * Consider a layout like the following:
	 *
	 *    memslot->userspace_addr:
	 *    +-----+--------------------+--------------------+---+
	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
	 *    +-----+--------------------+--------------------+---+
	 *
	 *    memslot->base_gfn << PAGE_SHIFT:
	 *      +---+--------------------+--------------------+-----+
	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
	 *      +---+--------------------+--------------------+-----+
	 *
	 * If we create those stage-2 blocks, we'll end up with this incorrect
	 * mapping:
	 *   d -> f
	 *   e -> g
	 *   f -> h
	 */
	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
		return false;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view above containing pages 'abcde' and 'xyz',
	 * respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check above) and equally sized.
	 */
	return (hva & ~(map_size - 1)) >= uaddr_start &&
	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}
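/*
 * Illustrative sketch, not part of the original file: how the alignment
 * helper above is meant to be used when sizing a mapping for a fault,
 * mirroring the logic in user_mem_abort() below. The function name is
 * made up; vma_pagesize would normally come from vma_kernel_pagesize().
 */
static unsigned long __maybe_unused example_pick_map_size(struct kvm_memory_slot *memslot,
							  unsigned long hva,
							  unsigned long vma_pagesize)
{
	if (vma_pagesize == PMD_SIZE &&
	    fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
		return PMD_SIZE;

	/* Fall back to page granularity when IPA and hva alignment differ */
	return PAGE_SIZE;
}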
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_memory_slot *memslot, unsigned long hva,
			  unsigned long fault_status)
{
	int ret;
	bool write_fault, writable, force_pte = false;
	bool exec_fault, needs_exec;
	unsigned long mmu_seq;
	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
	struct vm_area_struct *vma;
	kvm_pfn_t pfn;
	pgprot_t mem_type = PAGE_S2;
	bool logging_active = memslot_is_logging(memslot);
	unsigned long vma_pagesize, flags = 0;

	write_fault = kvm_is_write_fault(vcpu);
	exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
	VM_BUG_ON(write_fault && exec_fault);

	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

	/* Let's check if we will get back a huge page backed by hugetlbfs */
	down_read(&current->mm->mmap_sem);
	vma = find_vma_intersection(current->mm, hva, hva + 1);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		up_read(&current->mm->mmap_sem);
		return -EFAULT;
	}

	vma_pagesize = vma_kernel_pagesize(vma);
	if (logging_active ||
	    !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
		force_pte = true;
		vma_pagesize = PAGE_SIZE;
	}

	/*
	 * Stage-2 has a minimum of 2 page table levels (for arm64 see
	 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
	 * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
	 * As for PUD huge maps, we must make sure that we have at least
	 * 3 levels, i.e, PMD is not folded.
	 */
	if (vma_pagesize == PMD_SIZE ||
	    (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
		gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
	up_read(&current->mm->mmap_sem);

	/* We need minimum second+third level pages */
	ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
				     KVM_NR_MEM_OBJS);
	if (ret)
		return ret;

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	/*
	 * Ensure the read of mmu_notifier_seq happens before we call
	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
	 * the page we just got a reference to being unmapped before we have a
	 * chance to grab the mmu_lock, which ensures that if the page gets
	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
	 */
	smp_rmb();

	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
	if (pfn == KVM_PFN_ERR_HWPOISON) {
		kvm_send_hwpoison_signal(hva, vma);
		return 0;
	}
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	if (kvm_is_device_pfn(pfn)) {
		mem_type = PAGE_S2_DEVICE;
		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
	} else if (logging_active) {
		/*
		 * Faults on pages in a memslot with logging enabled
		 * should not be mapped with huge pages (it introduces churn
		 * and performance degradation), so force a pte mapping.
1752 */ 1753 flags |= KVM_S2_FLAG_LOGGING_ACTIVE; 1754 1755 /* 1756 * Only actually map the page as writable if this was a write 1757 * fault. 1758 */ 1759 if (!write_fault) 1760 writable = false; 1761 } 1762 1763 spin_lock(&kvm->mmu_lock); 1764 if (mmu_notifier_retry(kvm, mmu_seq)) 1765 goto out_unlock; 1766 1767 if (vma_pagesize == PAGE_SIZE && !force_pte) { 1768 /* 1769 * Only PMD_SIZE transparent hugepages(THP) are 1770 * currently supported. This code will need to be 1771 * updated to support other THP sizes. 1772 * 1773 * Make sure the host VA and the guest IPA are sufficiently 1774 * aligned and that the block is contained within the memslot. 1775 */ 1776 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) && 1777 transparent_hugepage_adjust(&pfn, &fault_ipa)) 1778 vma_pagesize = PMD_SIZE; 1779 } 1780 1781 if (writable) 1782 kvm_set_pfn_dirty(pfn); 1783 1784 if (fault_status != FSC_PERM) 1785 clean_dcache_guest_page(pfn, vma_pagesize); 1786 1787 if (exec_fault) 1788 invalidate_icache_guest_page(pfn, vma_pagesize); 1789 1790 /* 1791 * If we took an execution fault we have made the 1792 * icache/dcache coherent above and should now let the s2 1793 * mapping be executable. 1794 * 1795 * Write faults (!exec_fault && FSC_PERM) are orthogonal to 1796 * execute permissions, and we preserve whatever we have. 1797 */ 1798 needs_exec = exec_fault || 1799 (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa)); 1800 1801 if (vma_pagesize == PUD_SIZE) { 1802 pud_t new_pud = kvm_pfn_pud(pfn, mem_type); 1803 1804 new_pud = kvm_pud_mkhuge(new_pud); 1805 if (writable) 1806 new_pud = kvm_s2pud_mkwrite(new_pud); 1807 1808 if (needs_exec) 1809 new_pud = kvm_s2pud_mkexec(new_pud); 1810 1811 ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud); 1812 } else if (vma_pagesize == PMD_SIZE) { 1813 pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type); 1814 1815 new_pmd = kvm_pmd_mkhuge(new_pmd); 1816 1817 if (writable) 1818 new_pmd = kvm_s2pmd_mkwrite(new_pmd); 1819 1820 if (needs_exec) 1821 new_pmd = kvm_s2pmd_mkexec(new_pmd); 1822 1823 ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); 1824 } else { 1825 pte_t new_pte = kvm_pfn_pte(pfn, mem_type); 1826 1827 if (writable) { 1828 new_pte = kvm_s2pte_mkwrite(new_pte); 1829 mark_page_dirty(kvm, gfn); 1830 } 1831 1832 if (needs_exec) 1833 new_pte = kvm_s2pte_mkexec(new_pte); 1834 1835 ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); 1836 } 1837 1838 out_unlock: 1839 spin_unlock(&kvm->mmu_lock); 1840 kvm_set_pfn_accessed(pfn); 1841 kvm_release_pfn_clean(pfn); 1842 return ret; 1843 } 1844 1845 /* 1846 * Resolve the access fault by making the page young again. 1847 * Note that because the faulting entry is guaranteed not to be 1848 * cached in the TLB, we don't need to invalidate anything. 1849 * Only the HW Access Flag updates are supported for Stage 2 (no DBM), 1850 * so there is no need for atomic (pte|pmd)_mkyoung operations. 
1851 */ 1852 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) 1853 { 1854 pud_t *pud; 1855 pmd_t *pmd; 1856 pte_t *pte; 1857 kvm_pfn_t pfn; 1858 bool pfn_valid = false; 1859 1860 trace_kvm_access_fault(fault_ipa); 1861 1862 spin_lock(&vcpu->kvm->mmu_lock); 1863 1864 if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte)) 1865 goto out; 1866 1867 if (pud) { /* HugeTLB */ 1868 *pud = kvm_s2pud_mkyoung(*pud); 1869 pfn = kvm_pud_pfn(*pud); 1870 pfn_valid = true; 1871 } else if (pmd) { /* THP, HugeTLB */ 1872 *pmd = pmd_mkyoung(*pmd); 1873 pfn = pmd_pfn(*pmd); 1874 pfn_valid = true; 1875 } else { 1876 *pte = pte_mkyoung(*pte); /* Just a page... */ 1877 pfn = pte_pfn(*pte); 1878 pfn_valid = true; 1879 } 1880 1881 out: 1882 spin_unlock(&vcpu->kvm->mmu_lock); 1883 if (pfn_valid) 1884 kvm_set_pfn_accessed(pfn); 1885 } 1886 1887 /** 1888 * kvm_handle_guest_abort - handles all 2nd stage aborts 1889 * @vcpu: the VCPU pointer 1890 * @run: the kvm_run structure 1891 * 1892 * Any abort that gets to the host is almost guaranteed to be caused by a 1893 * missing second stage translation table entry, which can mean that either the 1894 * guest simply needs more memory and we must allocate an appropriate page or it 1895 * can mean that the guest tried to access I/O memory, which is emulated by user 1896 * space. The distinction is based on the IPA causing the fault and whether this 1897 * memory region has been registered as standard RAM by user space. 1898 */ 1899 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) 1900 { 1901 unsigned long fault_status; 1902 phys_addr_t fault_ipa; 1903 struct kvm_memory_slot *memslot; 1904 unsigned long hva; 1905 bool is_iabt, write_fault, writable; 1906 gfn_t gfn; 1907 int ret, idx; 1908 1909 fault_status = kvm_vcpu_trap_get_fault_type(vcpu); 1910 1911 fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); 1912 is_iabt = kvm_vcpu_trap_is_iabt(vcpu); 1913 1914 /* Synchronous External Abort? */ 1915 if (kvm_vcpu_dabt_isextabt(vcpu)) { 1916 /* 1917 * For RAS the host kernel may handle this abort. 1918 * There is no need to pass the error into the guest. 1919 */ 1920 if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu))) 1921 return 1; 1922 1923 if (unlikely(!is_iabt)) { 1924 kvm_inject_vabt(vcpu); 1925 return 1; 1926 } 1927 } 1928 1929 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu), 1930 kvm_vcpu_get_hfar(vcpu), fault_ipa); 1931 1932 /* Check the stage-2 fault is trans. fault or write fault */ 1933 if (fault_status != FSC_FAULT && fault_status != FSC_PERM && 1934 fault_status != FSC_ACCESS) { 1935 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", 1936 kvm_vcpu_trap_get_class(vcpu), 1937 (unsigned long)kvm_vcpu_trap_get_fault(vcpu), 1938 (unsigned long)kvm_vcpu_get_hsr(vcpu)); 1939 return -EFAULT; 1940 } 1941 1942 idx = srcu_read_lock(&vcpu->kvm->srcu); 1943 1944 gfn = fault_ipa >> PAGE_SHIFT; 1945 memslot = gfn_to_memslot(vcpu->kvm, gfn); 1946 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); 1947 write_fault = kvm_is_write_fault(vcpu); 1948 if (kvm_is_error_hva(hva) || (write_fault && !writable)) { 1949 if (is_iabt) { 1950 /* Prefetch Abort on I/O address */ 1951 kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu)); 1952 ret = 1; 1953 goto out_unlock; 1954 } 1955 1956 /* 1957 * Check for a cache maintenance operation. Since we 1958 * ended-up here, we know it is outside of any memory 1959 * slot. But we can't find out if that is for a device, 1960 * or if the guest is just being stupid. 
The only thing 1961 * we know for sure is that this range cannot be cached. 1962 * 1963 * So let's assume that the guest is just being 1964 * cautious, and skip the instruction. 1965 */ 1966 if (kvm_vcpu_dabt_is_cm(vcpu)) { 1967 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); 1968 ret = 1; 1969 goto out_unlock; 1970 } 1971 1972 /* 1973 * The IPA is reported as [MAX:12], so we need to 1974 * complement it with the bottom 12 bits from the 1975 * faulting VA. This is always 12 bits, irrespective 1976 * of the page size. 1977 */ 1978 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); 1979 ret = io_mem_abort(vcpu, run, fault_ipa); 1980 goto out_unlock; 1981 } 1982 1983 /* Userspace should not be able to register out-of-bounds IPAs */ 1984 VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm)); 1985 1986 if (fault_status == FSC_ACCESS) { 1987 handle_access_fault(vcpu, fault_ipa); 1988 ret = 1; 1989 goto out_unlock; 1990 } 1991 1992 ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); 1993 if (ret == 0) 1994 ret = 1; 1995 out_unlock: 1996 srcu_read_unlock(&vcpu->kvm->srcu, idx); 1997 return ret; 1998 } 1999 2000 static int handle_hva_to_gpa(struct kvm *kvm, 2001 unsigned long start, 2002 unsigned long end, 2003 int (*handler)(struct kvm *kvm, 2004 gpa_t gpa, u64 size, 2005 void *data), 2006 void *data) 2007 { 2008 struct kvm_memslots *slots; 2009 struct kvm_memory_slot *memslot; 2010 int ret = 0; 2011 2012 slots = kvm_memslots(kvm); 2013 2014 /* we only care about the pages that the guest sees */ 2015 kvm_for_each_memslot(memslot, slots) { 2016 unsigned long hva_start, hva_end; 2017 gfn_t gpa; 2018 2019 hva_start = max(start, memslot->userspace_addr); 2020 hva_end = min(end, memslot->userspace_addr + 2021 (memslot->npages << PAGE_SHIFT)); 2022 if (hva_start >= hva_end) 2023 continue; 2024 2025 gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT; 2026 ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data); 2027 } 2028 2029 return ret; 2030 } 2031 2032 static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) 2033 { 2034 unmap_stage2_range(kvm, gpa, size); 2035 return 0; 2036 } 2037 2038 int kvm_unmap_hva_range(struct kvm *kvm, 2039 unsigned long start, unsigned long end) 2040 { 2041 if (!kvm->arch.pgd) 2042 return 0; 2043 2044 trace_kvm_unmap_hva_range(start, end); 2045 handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); 2046 return 0; 2047 } 2048 2049 static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) 2050 { 2051 pte_t *pte = (pte_t *)data; 2052 2053 WARN_ON(size != PAGE_SIZE); 2054 /* 2055 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE 2056 * flag clear because MMU notifiers will have unmapped a huge PMD before 2057 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and 2058 * therefore stage2_set_pte() never needs to clear out a huge PMD 2059 * through this calling path. 2060 */ 2061 stage2_set_pte(kvm, NULL, gpa, pte, 0); 2062 return 0; 2063 } 2064 2065 2066 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) 2067 { 2068 unsigned long end = hva + PAGE_SIZE; 2069 kvm_pfn_t pfn = pte_pfn(pte); 2070 pte_t stage2_pte; 2071 2072 if (!kvm->arch.pgd) 2073 return 0; 2074 2075 trace_kvm_set_spte_hva(hva); 2076 2077 /* 2078 * We've moved a page around, probably through CoW, so let's treat it 2079 * just like a translation fault and clean the cache to the PoC. 
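 * The updated PTE is then installed by handle_hva_to_gpa(), which resolves
 * the single-page range [hva, hva + PAGE_SIZE) against the memslots and
 * calls kvm_set_spte_handler() for the matching IPA.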
2080 */ 2081 clean_dcache_guest_page(pfn, PAGE_SIZE); 2082 stage2_pte = kvm_pfn_pte(pfn, PAGE_S2); 2083 handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); 2084 2085 return 0; 2086 } 2087 2088 static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) 2089 { 2090 pud_t *pud; 2091 pmd_t *pmd; 2092 pte_t *pte; 2093 2094 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); 2095 if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) 2096 return 0; 2097 2098 if (pud) 2099 return stage2_pudp_test_and_clear_young(pud); 2100 else if (pmd) 2101 return stage2_pmdp_test_and_clear_young(pmd); 2102 else 2103 return stage2_ptep_test_and_clear_young(pte); 2104 } 2105 2106 static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) 2107 { 2108 pud_t *pud; 2109 pmd_t *pmd; 2110 pte_t *pte; 2111 2112 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); 2113 if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) 2114 return 0; 2115 2116 if (pud) 2117 return kvm_s2pud_young(*pud); 2118 else if (pmd) 2119 return pmd_young(*pmd); 2120 else 2121 return pte_young(*pte); 2122 } 2123 2124 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) 2125 { 2126 if (!kvm->arch.pgd) 2127 return 0; 2128 trace_kvm_age_hva(start, end); 2129 return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); 2130 } 2131 2132 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) 2133 { 2134 if (!kvm->arch.pgd) 2135 return 0; 2136 trace_kvm_test_age_hva(hva); 2137 return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE, kvm_test_age_hva_handler, NULL); 2138 } 2139 2140 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) 2141 { 2142 mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); 2143 } 2144 2145 phys_addr_t kvm_mmu_get_httbr(void) 2146 { 2147 if (__kvm_cpu_uses_extended_idmap()) 2148 return virt_to_phys(merged_hyp_pgd); 2149 else 2150 return virt_to_phys(hyp_pgd); 2151 } 2152 2153 phys_addr_t kvm_get_idmap_vector(void) 2154 { 2155 return hyp_idmap_vector; 2156 } 2157 2158 static int kvm_map_idmap_text(pgd_t *pgd) 2159 { 2160 int err; 2161 2162 /* Create the idmap in the boot page tables */ 2163 err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), 2164 hyp_idmap_start, hyp_idmap_end, 2165 __phys_to_pfn(hyp_idmap_start), 2166 PAGE_HYP_EXEC); 2167 if (err) 2168 kvm_err("Failed to idmap %lx-%lx\n", 2169 hyp_idmap_start, hyp_idmap_end); 2170 2171 return err; 2172 } 2173 2174 int kvm_mmu_init(void) 2175 { 2176 int err; 2177 2178 hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start); 2179 hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); 2180 hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end); 2181 hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE); 2182 hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init); 2183 2184 /* 2185 * We rely on the linker script to ensure at build time that the HYP 2186 * init code does not cross a page boundary. 2187 */ 2188 BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); 2189 2190 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); 2191 kvm_debug("HYP VA range: %lx:%lx\n", 2192 kern_hyp_va(PAGE_OFFSET), 2193 kern_hyp_va((unsigned long)high_memory - 1)); 2194 2195 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) && 2196 hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) && 2197 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) { 2198 /* 2199 * The idmap page is intersecting with the VA space; 2200 * it is not safe to continue further.
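 * (The idmap maps the HYP init code at its physical address, so that
 * address must not also fall inside the HYP VA range reported by the
 * kvm_debug() above.)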
2201 */ 2202 kvm_err("IDMAP intersecting with HYP VA, unable to continue\n"); 2203 err = -EINVAL; 2204 goto out; 2205 } 2206 2207 hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); 2208 if (!hyp_pgd) { 2209 kvm_err("Hyp mode PGD not allocated\n"); 2210 err = -ENOMEM; 2211 goto out; 2212 } 2213 2214 if (__kvm_cpu_uses_extended_idmap()) { 2215 boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 2216 hyp_pgd_order); 2217 if (!boot_hyp_pgd) { 2218 kvm_err("Hyp boot PGD not allocated\n"); 2219 err = -ENOMEM; 2220 goto out; 2221 } 2222 2223 err = kvm_map_idmap_text(boot_hyp_pgd); 2224 if (err) 2225 goto out; 2226 2227 merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 2228 if (!merged_hyp_pgd) { 2229 kvm_err("Failed to allocate extra HYP pgd\n"); err = -ENOMEM; 2230 goto out; 2231 } 2232 __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd, 2233 hyp_idmap_start); 2234 } else { 2235 err = kvm_map_idmap_text(hyp_pgd); 2236 if (err) 2237 goto out; 2238 } 2239 2240 io_map_base = hyp_idmap_start; 2241 return 0; 2242 out: 2243 free_hyp_pgds(); 2244 return err; 2245 } 2246 2247 void kvm_arch_commit_memory_region(struct kvm *kvm, 2248 const struct kvm_userspace_memory_region *mem, 2249 const struct kvm_memory_slot *old, 2250 const struct kvm_memory_slot *new, 2251 enum kvm_mr_change change) 2252 { 2253 /* 2254 * At this point memslot has been committed and there is an 2255 * allocated dirty_bitmap[], dirty pages will be tracked while the 2256 * memory slot is write protected. 2257 */ 2258 if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) 2259 kvm_mmu_wp_memory_region(kvm, mem->slot); 2260 } 2261 2262 int kvm_arch_prepare_memory_region(struct kvm *kvm, 2263 struct kvm_memory_slot *memslot, 2264 const struct kvm_userspace_memory_region *mem, 2265 enum kvm_mr_change change) 2266 { 2267 hva_t hva = mem->userspace_addr; 2268 hva_t reg_end = hva + mem->memory_size; 2269 bool writable = !(mem->flags & KVM_MEM_READONLY); 2270 int ret = 0; 2271 2272 if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && 2273 change != KVM_MR_FLAGS_ONLY) 2274 return 0; 2275 2276 /* 2277 * Prevent userspace from creating a memory region outside of the IPA 2278 * space addressable by the guest. 2279 */ 2280 if (memslot->base_gfn + memslot->npages >= 2281 (kvm_phys_size(kvm) >> PAGE_SHIFT)) 2282 return -EFAULT; 2283 2284 down_read(&current->mm->mmap_sem); 2285 /* 2286 * A memory region could potentially cover multiple VMAs, and any holes 2287 * between them, so iterate over all of them to find out if we can map 2288 * any of them right now. 2289 * 2290 * +--------------------------------------------+ 2291 * +---------------+----------------+ +----------------+ 2292 * | : VMA 1 | VMA 2 | | VMA 3 : | 2293 * +---------------+----------------+ +----------------+ 2294 * | memory region | 2295 * +--------------------------------------------+ 2296 */ 2297 do { 2298 struct vm_area_struct *vma = find_vma(current->mm, hva); 2299 hva_t vm_start, vm_end; 2300 2301 if (!vma || vma->vm_start >= reg_end) 2302 break; 2303 2304 /* 2305 * Mapping a read-only VMA is only allowed if the 2306 * memory region is configured as read-only.
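 * In other words, a writable memslot backed by a VMA that lacks
 * VM_WRITE is rejected with -EPERM below.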
2307 */ 2308 if (writable && !(vma->vm_flags & VM_WRITE)) { 2309 ret = -EPERM; 2310 break; 2311 } 2312 2313 /* 2314 * Take the intersection of this VMA with the memory region 2315 */ 2316 vm_start = max(hva, vma->vm_start); 2317 vm_end = min(reg_end, vma->vm_end); 2318 2319 if (vma->vm_flags & VM_PFNMAP) { 2320 gpa_t gpa = mem->guest_phys_addr + 2321 (vm_start - mem->userspace_addr); 2322 phys_addr_t pa; 2323 2324 pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT; 2325 pa += vm_start - vma->vm_start; 2326 2327 /* IO region dirty page logging not allowed */ 2328 if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) { 2329 ret = -EINVAL; 2330 goto out; 2331 } 2332 2333 ret = kvm_phys_addr_ioremap(kvm, gpa, pa, 2334 vm_end - vm_start, 2335 writable); 2336 if (ret) 2337 break; 2338 } 2339 hva = vm_end; 2340 } while (hva < reg_end); 2341 2342 if (change == KVM_MR_FLAGS_ONLY) 2343 goto out; 2344 2345 spin_lock(&kvm->mmu_lock); 2346 if (ret) 2347 unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size); 2348 else 2349 stage2_flush_memslot(kvm, memslot); 2350 spin_unlock(&kvm->mmu_lock); 2351 out: 2352 up_read(&current->mm->mmap_sem); 2353 return ret; 2354 } 2355 2356 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, 2357 struct kvm_memory_slot *dont) 2358 { 2359 } 2360 2361 int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, 2362 unsigned long npages) 2363 { 2364 return 0; 2365 } 2366 2367 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) 2368 { 2369 } 2370 2371 void kvm_arch_flush_shadow_all(struct kvm *kvm) 2372 { 2373 kvm_free_stage2_pgd(kvm); 2374 } 2375 2376 void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 2377 struct kvm_memory_slot *slot) 2378 { 2379 gpa_t gpa = slot->base_gfn << PAGE_SHIFT; 2380 phys_addr_t size = slot->npages << PAGE_SHIFT; 2381 2382 spin_lock(&kvm->mmu_lock); 2383 unmap_stage2_range(kvm, gpa, size); 2384 spin_unlock(&kvm->mmu_lock); 2385 } 2386 2387 /* 2388 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). 2389 * 2390 * Main problems: 2391 * - S/W ops are local to a CPU (not broadcast) 2392 * - We have line migration behind our back (speculation) 2393 * - System caches don't support S/W at all (damn!) 2394 * 2395 * In the face of the above, the best we can do is to try and convert 2396 * S/W ops to VA ops. Because the guest is not allowed to infer the 2397 * S/W to PA mapping, it can only use S/W to nuke the whole cache, 2398 * which is a rather good thing for us. 2399 * 2400 * Also, it is only used when turning caches on/off ("The expected 2401 * usage of the cache maintenance instructions that operate by set/way 2402 * is associated with the cache maintenance instructions associated 2403 * with the powerdown and powerup of caches, if this is required by 2404 * the implementation."). 2405 * 2406 * We use the following policy: 2407 * 2408 * - If we trap a S/W operation, we enable VM trapping to detect 2409 * caches being turned on/off, and do a full clean. 2410 * 2411 * - We flush the caches on both caches being turned on and off. 2412 * 2413 * - Once the caches are enabled, we stop trapping VM ops. 2414 */ 2415 void kvm_set_way_flush(struct kvm_vcpu *vcpu) 2416 { 2417 unsigned long hcr = *vcpu_hcr(vcpu); 2418 2419 /* 2420 * If this is the first time we do a S/W operation 2421 * (i.e. HCR_TVM not set) flush the whole memory, and set the 2422 * VM trapping. 2423 * 2424 * Otherwise, rely on the VM trapping to wait for the MMU + 2425 * Caches to be turned off.
At that point, we'll be able to 2426 * clean the caches again. 2427 */ 2428 if (!(hcr & HCR_TVM)) { 2429 trace_kvm_set_way_flush(*vcpu_pc(vcpu), 2430 vcpu_has_cache_enabled(vcpu)); 2431 stage2_flush_vm(vcpu->kvm); 2432 *vcpu_hcr(vcpu) = hcr | HCR_TVM; 2433 } 2434 } 2435 2436 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) 2437 { 2438 bool now_enabled = vcpu_has_cache_enabled(vcpu); 2439 2440 /* 2441 * If switching the MMU+caches on, need to invalidate the caches. 2442 * If switching it off, need to clean the caches. 2443 * Clean + invalidate does the trick always. 2444 */ 2445 if (now_enabled != was_enabled) 2446 stage2_flush_vm(vcpu->kvm); 2447 2448 /* Caches are now on, stop trapping VM ops (until a S/W op) */ 2449 if (now_enabled) 2450 *vcpu_hcr(vcpu) &= ~HCR_TVM; 2451 2452 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); 2453 } 2454
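/*
 * Putting kvm_set_way_flush() and kvm_toggle_cache() together, a guest
 * using set/way ops around an MMU/cache toggle goes through the
 * following sequence:
 *
 * 1. A set/way op (DC ISW/CSW/CISW) traps: kvm_set_way_flush() cleans
 *    the whole VM via stage2_flush_vm() and sets HCR_TVM, so writes to
 *    the VM control registers (including SCTLR) now trap.
 *
 * 2. The guest turns the MMU/caches off: kvm_toggle_cache() sees the
 *    enabled -> disabled transition and flushes again.
 *
 * 3. The guest turns the MMU/caches back on: kvm_toggle_cache() flushes
 *    once more and clears HCR_TVM, so trapping stops until the next
 *    set/way operation.
 */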