TOMOYO Linux Cross Reference
Linux/arch/powerpc/kvm/book3s_64_mmu_radix.c

  1 // SPDX-License-Identifier: GPL-2.0-only
  2 /*
  3  *
  4  * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  5  */
  6 
  7 #include <linux/types.h>
  8 #include <linux/string.h>
  9 #include <linux/kvm.h>
 10 #include <linux/kvm_host.h>
 11 #include <linux/anon_inodes.h>
 12 #include <linux/file.h>
 13 #include <linux/debugfs.h>
 14 #include <linux/pgtable.h>
 15 
 16 #include <asm/kvm_ppc.h>
 17 #include <asm/kvm_book3s.h>
 18 #include <asm/page.h>
 19 #include <asm/mmu.h>
 20 #include <asm/pgalloc.h>
 21 #include <asm/pte-walk.h>
 22 #include <asm/ultravisor.h>
 23 #include <asm/kvm_book3s_uvmem.h>
 24 
 25 /*
 26  * Supported radix tree geometry.
 27  * Like p9, we support either 5 or 9 bits at the first (lowest) level,
 28  * for a page size of 64k or 4k.
 29  */
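/* Index is the radix tree level: 0 is the leaf (PTE) level, 3 is the root. */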
 30 static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
 31 
 32 unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
 33                                               gva_t eaddr, void *to, void *from,
 34                                               unsigned long n)
 35 {
 36         int uninitialized_var(old_pid), old_lpid;
 37         unsigned long quadrant, ret = n;
 38         bool is_load = !!to;
 39 
 40         /* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
 41         if (kvmhv_on_pseries())
 42                 return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
 43                                           (to != NULL) ? __pa(to): 0,
 44                                           (from != NULL) ? __pa(from): 0, n);
 45 
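        /*
         * The quadrant is the top two bits of the effective address.  With
         * LPIDR/PIDR set below, a quadrant 1 access is translated as the
         * guest process (lpid, pid) and a quadrant 2 access as the guest
         * kernel (lpid, pid = 0).
         */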
 46         quadrant = 1;
 47         if (!pid)
 48                 quadrant = 2;
 49         if (is_load)
 50                 from = (void *) (eaddr | (quadrant << 62));
 51         else
 52                 to = (void *) (eaddr | (quadrant << 62));
 53 
 54         preempt_disable();
 55 
 56         /* switch the lpid first to avoid running host with unallocated pid */
 57         old_lpid = mfspr(SPRN_LPID);
 58         if (old_lpid != lpid)
 59                 mtspr(SPRN_LPID, lpid);
 60         if (quadrant == 1) {
 61                 old_pid = mfspr(SPRN_PID);
 62                 if (old_pid != pid)
 63                         mtspr(SPRN_PID, pid);
 64         }
 65         isync();
 66 
 67         if (is_load)
 68                 ret = copy_from_user_nofault(to, (const void __user *)from, n);
 69         else
 70                 ret = copy_to_user_nofault((void __user *)to, from, n);
 71 
 72         /* switch the pid first to avoid running host with unallocated pid */
 73         if (quadrant == 1 && pid != old_pid)
 74                 mtspr(SPRN_PID, old_pid);
 75         if (lpid != old_lpid)
 76                 mtspr(SPRN_LPID, old_lpid);
 77         isync();
 78 
 79         preempt_enable();
 80 
 81         return ret;
 82 }
 83 EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix);
 84 
 85 static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
 86                                           void *to, void *from, unsigned long n)
 87 {
 88         int lpid = vcpu->kvm->arch.lpid;
 89         int pid = vcpu->arch.pid;
 90 
 91         /* This would cause a data segment interrupt so don't allow the access */
 92         if (eaddr & (0x3FFUL << 52))
 93                 return -EINVAL;
 94 
 95         /* Should we be using the nested lpid? */
 96         if (vcpu->arch.nested)
 97                 lpid = vcpu->arch.nested->shadow_lpid;
 98 
 99         /* If accessing quadrant 3 then pid is expected to be 0 */
100         if (((eaddr >> 62) & 0x3) == 0x3)
101                 pid = 0;
102 
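        /* Strip the top 12 bits, including the quadrant; the quadrant is
         * re-inserted by __kvmhv_copy_tofrom_guest_radix(). */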
103         eaddr &= ~(0xFFFUL << 52);
104 
105         return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
106 }
107 
108 long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
109                                  unsigned long n)
110 {
111         long ret;
112 
113         ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
114         if (ret > 0)
115                 memset(to + (n - ret), 0, ret);
116 
117         return ret;
118 }
119 EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix);
120 
121 long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
122                                unsigned long n)
123 {
124         return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
125 }
126 EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix);
127 
128 int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
129                                struct kvmppc_pte *gpte, u64 root,
130                                u64 *pte_ret_p)
131 {
132         struct kvm *kvm = vcpu->kvm;
133         int ret, level, ps;
134         unsigned long rts, bits, offset, index;
135         u64 pte, base, gpa;
136         __be64 rpte;
137 
138         rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
139                 ((root & RTS2_MASK) >> RTS2_SHIFT);
140         bits = root & RPDS_MASK;
141         base = root & RPDB_MASK;
142 
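        /* RTS encodes the size of the address space as 2^(RTS + 31) bytes. */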
143         offset = rts + 31;
144 
145         /* Current implementations only support 52-bit space */
146         if (offset != 52)
147                 return -EINVAL;
148 
149         /* Walk each level of the radix tree */
150         for (level = 3; level >= 0; --level) {
151                 u64 addr;
152                 /* Check a valid size */
153                 if (level && bits != p9_supported_radix_bits[level])
154                         return -EINVAL;
155                 if (level == 0 && !(bits == 5 || bits == 9))
156                         return -EINVAL;
157                 offset -= bits;
158                 index = (eaddr >> offset) & ((1UL << bits) - 1);
159                 /* Check that low bits of page table base are zero */
160                 if (base & ((1UL << (bits + 3)) - 1))
161                         return -EINVAL;
162                 /* Read the entry from guest memory */
163                 addr = base + (index * sizeof(rpte));
164                 ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
165                 if (ret) {
166                         if (pte_ret_p)
167                                 *pte_ret_p = addr;
168                         return ret;
169                 }
170                 pte = __be64_to_cpu(rpte);
171                 if (!(pte & _PAGE_PRESENT))
172                         return -ENOENT;
173                 /* Check if a leaf entry */
174                 if (pte & _PAGE_PTE)
175                         break;
176                 /* Get ready to walk the next level */
177                 base = pte & RPDB_MASK;
178                 bits = pte & RPDS_MASK;
179         }
180 
181         /* Need a leaf at lowest level; 512GB pages not supported */
182         if (level < 0 || level == 3)
183                 return -EINVAL;
184 
185         /* We found a valid leaf PTE */
186         /* Offset is now log base 2 of the page size */
187         gpa = pte & 0x01fffffffffff000ul;
188         if (gpa & ((1ul << offset) - 1))
189                 return -EINVAL;
190         gpa |= eaddr & ((1ul << offset) - 1);
191         for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
192                 if (offset == mmu_psize_defs[ps].shift)
193                         break;
194         gpte->page_size = ps;
195         gpte->page_shift = offset;
196 
197         gpte->eaddr = eaddr;
198         gpte->raddr = gpa;
199 
200         /* Work out permissions */
201         gpte->may_read = !!(pte & _PAGE_READ);
202         gpte->may_write = !!(pte & _PAGE_WRITE);
203         gpte->may_execute = !!(pte & _PAGE_EXEC);
204 
205         gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
206 
207         if (pte_ret_p)
208                 *pte_ret_p = pte;
209 
210         return 0;
211 }
212 
213 /*
214  * Used to walk a partition or process table radix tree in guest memory
215  * Note: We exploit the fact that a partition table and a process
216  * table have the same layout, a partition-scoped page table and a
217  * process-scoped page table have the same layout, and the 2nd
218  * doubleword of a partition table entry has the same layout as
219  * the PTCR register.
220  */
221 int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
222                                      struct kvmppc_pte *gpte, u64 table,
223                                      int table_index, u64 *pte_ret_p)
224 {
225         struct kvm *kvm = vcpu->kvm;
226         int ret;
227         unsigned long size, ptbl, root;
228         struct prtb_entry entry;
229 
230         if ((table & PRTS_MASK) > 24)
231                 return -EINVAL;
232         size = 1ul << ((table & PRTS_MASK) + 12);
233 
234         /* Is the table big enough to contain this entry? */
235         if ((table_index * sizeof(entry)) >= size)
236                 return -EINVAL;
237 
238         /* Read the table to find the root of the radix tree */
239         ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
240         ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
241         if (ret)
242                 return ret;
243 
244         /* Root is stored in the first double word */
245         root = be64_to_cpu(entry.prtb0);
246 
247         return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
248 }
249 
250 int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
251                            struct kvmppc_pte *gpte, bool data, bool iswrite)
252 {
253         u32 pid;
254         u64 pte;
255         int ret;
256 
257         /* Work out effective PID */
258         switch (eaddr >> 62) {
259         case 0:
260                 pid = vcpu->arch.pid;
261                 break;
262         case 3:
263                 pid = 0;
264                 break;
265         default:
266                 return -EINVAL;
267         }
268 
269         ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
270                                 vcpu->kvm->arch.process_table, pid, &pte);
271         if (ret)
272                 return ret;
273 
274         /* Check privilege (applies only to process scoped translations) */
275         if (kvmppc_get_msr(vcpu) & MSR_PR) {
276                 if (pte & _PAGE_PRIVILEGED) {
277                         gpte->may_read = 0;
278                         gpte->may_write = 0;
279                         gpte->may_execute = 0;
280                 }
281         } else {
282                 if (!(pte & _PAGE_PRIVILEGED)) {
283                         /* Check AMR/IAMR to see if strict mode is in force */
284                         if (vcpu->arch.amr & (1ul << 62))
285                                 gpte->may_read = 0;
286                         if (vcpu->arch.amr & (1ul << 63))
287                                 gpte->may_write = 0;
288                         if (vcpu->arch.iamr & (1ul << 62))
289                                 gpte->may_execute = 0;
290                 }
291         }
292 
293         return 0;
294 }
295 
296 void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
297                              unsigned int pshift, unsigned int lpid)
298 {
299         unsigned long psize = PAGE_SIZE;
300         int psi;
301         long rc;
302         unsigned long rb;
303 
304         if (pshift)
305                 psize = 1UL << pshift;
306         else
307                 pshift = PAGE_SHIFT;
308 
309         addr &= ~(psize - 1);
310 
311         if (!kvmhv_on_pseries()) {
312                 radix__flush_tlb_lpid_page(lpid, addr, psize);
313                 return;
314         }
315 
316         psi = shift_to_mmu_psize(pshift);
317         rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
318         rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
319                                 lpid, rb);
320         if (rc)
321                 pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
322 }
323 
324 static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
325 {
326         long rc;
327 
328         if (!kvmhv_on_pseries()) {
329                 radix__flush_pwc_lpid(lpid);
330                 return;
331         }
332 
333         rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
334                                 lpid, TLBIEL_INVAL_SET_LPID);
335         if (rc)
336                 pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
337 }
338 
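/* Atomically clear 'clr' and set 'set' in *ptep, returning the old PTE value. */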
339 static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
340                                       unsigned long clr, unsigned long set,
341                                       unsigned long addr, unsigned int shift)
342 {
343         return __radix_pte_update(ptep, clr, set);
344 }
345 
346 void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
347                              pte_t *ptep, pte_t pte)
348 {
349         radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
350 }
351 
352 static struct kmem_cache *kvm_pte_cache;
353 static struct kmem_cache *kvm_pmd_cache;
354 
355 static pte_t *kvmppc_pte_alloc(void)
356 {
357         pte_t *pte;
358 
359         pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
360         /* pmd_populate() will only reference _pa(pte). */
361         kmemleak_ignore(pte);
362 
363         return pte;
364 }
365 
366 static void kvmppc_pte_free(pte_t *ptep)
367 {
368         kmem_cache_free(kvm_pte_cache, ptep);
369 }
370 
371 static pmd_t *kvmppc_pmd_alloc(void)
372 {
373         pmd_t *pmd;
374 
375         pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
376         /* pud_populate() will only reference _pa(pmd). */
377         kmemleak_ignore(pmd);
378 
379         return pmd;
380 }
381 
382 static void kvmppc_pmd_free(pmd_t *pmdp)
383 {
384         kmem_cache_free(kvm_pmd_cache, pmdp);
385 }
386 
387 /* Called with kvm->mmu_lock held */
388 void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
389                       unsigned int shift,
390                       const struct kvm_memory_slot *memslot,
391                       unsigned int lpid)
392 
393 {
394         unsigned long old;
395         unsigned long gfn = gpa >> PAGE_SHIFT;
396         unsigned long page_size = PAGE_SIZE;
397         unsigned long hpa;
398 
399         old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
400         kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
401 
402         /* The following only applies to L1 entries */
403         if (lpid != kvm->arch.lpid)
404                 return;
405 
406         if (!memslot) {
407                 memslot = gfn_to_memslot(kvm, gfn);
408                 if (!memslot)
409                         return;
410         }
411         if (shift) { /* 1GB or 2MB page */
412                 page_size = 1ul << shift;
413                 if (shift == PMD_SHIFT)
414                         kvm->stat.num_2M_pages--;
415                 else if (shift == PUD_SHIFT)
416                         kvm->stat.num_1G_pages--;
417         }
418 
419         gpa &= ~(page_size - 1);
420         hpa = old & PTE_RPN_MASK;
421         kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
422 
423         if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
424                 kvmppc_update_dirty_map(memslot, gfn, page_size);
425 }
426 
427 /*
428  * kvmppc_free_p?d are used to free existing page tables; they recursively
429  * descend, clearing and freeing the child tables.
430  * Callers are responsible for flushing the PWC.
431  *
432  * When page tables are being unmapped/freed as part of the page fault path
433  * (full == false), valid ptes are generally not expected; however, there
434  * is one situation where they arise, which is when dirty page logging is
435  * turned off for a memslot while the VM is running.  The new memslot
436  * becomes visible to page faults before the memslot commit function
437  * gets to flush the memslot, which can lead to a 2MB page mapping being
438  * installed for a guest physical address where there are already 64kB
439  * (or 4kB) mappings (of sub-pages of the same 2MB page).
440  */
441 static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
442                                   unsigned int lpid)
443 {
444         if (full) {
445                 memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
446         } else {
447                 pte_t *p = pte;
448                 unsigned long it;
449 
450                 for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
451                         if (pte_val(*p) == 0)
452                                 continue;
453                         kvmppc_unmap_pte(kvm, p,
454                                          pte_pfn(*p) << PAGE_SHIFT,
455                                          PAGE_SHIFT, NULL, lpid);
456                 }
457         }
458 
459         kvmppc_pte_free(pte);
460 }
461 
462 static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
463                                   unsigned int lpid)
464 {
465         unsigned long im;
466         pmd_t *p = pmd;
467 
468         for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
469                 if (!pmd_present(*p))
470                         continue;
471                 if (pmd_is_leaf(*p)) {
472                         if (full) {
473                                 pmd_clear(p);
474                         } else {
475                                 WARN_ON_ONCE(1);
476                                 kvmppc_unmap_pte(kvm, (pte_t *)p,
477                                          pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
478                                          PMD_SHIFT, NULL, lpid);
479                         }
480                 } else {
481                         pte_t *pte;
482 
483                         pte = pte_offset_map(p, 0);
484                         kvmppc_unmap_free_pte(kvm, pte, full, lpid);
485                         pmd_clear(p);
486                 }
487         }
488         kvmppc_pmd_free(pmd);
489 }
490 
491 static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
492                                   unsigned int lpid)
493 {
494         unsigned long iu;
495         pud_t *p = pud;
496 
497         for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
498                 if (!pud_present(*p))
499                         continue;
500                 if (pud_is_leaf(*p)) {
501                         pud_clear(p);
502                 } else {
503                         pmd_t *pmd;
504 
505                         pmd = pmd_offset(p, 0);
506                         kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
507                         pud_clear(p);
508                 }
509         }
510         pud_free(kvm->mm, pud);
511 }
512 
513 void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
514 {
515         unsigned long ig;
516 
517         for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
518                 p4d_t *p4d = p4d_offset(pgd, 0);
519                 pud_t *pud;
520 
521                 if (!p4d_present(*p4d))
522                         continue;
523                 pud = pud_offset(p4d, 0);
524                 kvmppc_unmap_free_pud(kvm, pud, lpid);
525                 p4d_clear(p4d);
526         }
527 }
528 
529 void kvmppc_free_radix(struct kvm *kvm)
530 {
531         if (kvm->arch.pgtable) {
532                 kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
533                                           kvm->arch.lpid);
534                 pgd_free(kvm->mm, kvm->arch.pgtable);
535                 kvm->arch.pgtable = NULL;
536         }
537 }
538 
539 static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
540                                         unsigned long gpa, unsigned int lpid)
541 {
542         pte_t *pte = pte_offset_kernel(pmd, 0);
543 
544         /*
545          * Clearing the pmd entry then flushing the PWC ensures that the pte
 546          * page will no longer be cached by the MMU, so it can be freed
 547          * without flushing the PWC again.
548          */
549         pmd_clear(pmd);
550         kvmppc_radix_flush_pwc(kvm, lpid);
551 
552         kvmppc_unmap_free_pte(kvm, pte, false, lpid);
553 }
554 
555 static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
556                                         unsigned long gpa, unsigned int lpid)
557 {
558         pmd_t *pmd = pmd_offset(pud, 0);
559 
560         /*
561          * Clearing the pud entry then flushing the PWC ensures that the pmd
562          * page and any children pte pages will no longer be cached by the MMU,
 563          * so they can be freed without flushing the PWC again.
564          */
565         pud_clear(pud);
566         kvmppc_radix_flush_pwc(kvm, lpid);
567 
568         kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
569 }
570 
571 /*
572  * A number of bits may differ between different faults to the same
573  * partition-scoped entry: the RC bits change in the course of cleaning
574  * and aging, and the write bit can change, either because the access was
575  * upgraded or because a read fault happened concurrently with a write
576  * fault that set those bits first.
577  */
578 #define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
579 
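/*
 * Insert a PTE into the given partition-scoped page table for guest real
 * address gpa.  level selects the page size: 0 = PAGE_SIZE (pte leaf),
 * 1 = 2MB (pmd leaf), 2 = 1GB (pud leaf).
 */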
580 int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
581                       unsigned long gpa, unsigned int level,
582                       unsigned long mmu_seq, unsigned int lpid,
583                       unsigned long *rmapp, struct rmap_nested **n_rmap)
584 {
585         pgd_t *pgd;
586         p4d_t *p4d;
587         pud_t *pud, *new_pud = NULL;
588         pmd_t *pmd, *new_pmd = NULL;
589         pte_t *ptep, *new_ptep = NULL;
590         int ret;
591 
592         /* Traverse the guest's 2nd-level tree, allocate new levels needed */
593         pgd = pgtable + pgd_index(gpa);
594         p4d = p4d_offset(pgd, gpa);
595 
596         pud = NULL;
597         if (p4d_present(*p4d))
598                 pud = pud_offset(p4d, gpa);
599         else
600                 new_pud = pud_alloc_one(kvm->mm, gpa);
601 
602         pmd = NULL;
603         if (pud && pud_present(*pud) && !pud_is_leaf(*pud))
604                 pmd = pmd_offset(pud, gpa);
605         else if (level <= 1)
606                 new_pmd = kvmppc_pmd_alloc();
607 
608         if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
609                 new_ptep = kvmppc_pte_alloc();
610 
611         /* Check if we might have been invalidated; let the guest retry if so */
612         spin_lock(&kvm->mmu_lock);
613         ret = -EAGAIN;
614         if (mmu_notifier_retry(kvm, mmu_seq))
615                 goto out_unlock;
616 
617         /* Now traverse again under the lock and change the tree */
618         ret = -ENOMEM;
619         if (p4d_none(*p4d)) {
620                 if (!new_pud)
621                         goto out_unlock;
622                 p4d_populate(kvm->mm, p4d, new_pud);
623                 new_pud = NULL;
624         }
625         pud = pud_offset(p4d, gpa);
626         if (pud_is_leaf(*pud)) {
627                 unsigned long hgpa = gpa & PUD_MASK;
628 
629                 /* Check if we raced and someone else has set the same thing */
630                 if (level == 2) {
631                         if (pud_raw(*pud) == pte_raw(pte)) {
632                                 ret = 0;
633                                 goto out_unlock;
634                         }
635                         /* Valid 1GB page here already, add our extra bits */
636                         WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
637                                                         PTE_BITS_MUST_MATCH);
638                         kvmppc_radix_update_pte(kvm, (pte_t *)pud,
639                                               0, pte_val(pte), hgpa, PUD_SHIFT);
640                         ret = 0;
641                         goto out_unlock;
642                 }
643                 /*
644                  * If we raced with another CPU which has just put
645                  * a 1GB pte in after we saw a pmd page, try again.
646                  */
647                 if (!new_pmd) {
648                         ret = -EAGAIN;
649                         goto out_unlock;
650                 }
651                 /* Valid 1GB page here already, remove it */
652                 kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
653                                  lpid);
654         }
655         if (level == 2) {
656                 if (!pud_none(*pud)) {
657                         /*
658                          * There's a page table page here, but we wanted to
659                          * install a large page, so remove and free the page
660                          * table page.
661                          */
662                         kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
663                 }
664                 kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
665                 if (rmapp && n_rmap)
666                         kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
667                 ret = 0;
668                 goto out_unlock;
669         }
670         if (pud_none(*pud)) {
671                 if (!new_pmd)
672                         goto out_unlock;
673                 pud_populate(kvm->mm, pud, new_pmd);
674                 new_pmd = NULL;
675         }
676         pmd = pmd_offset(pud, gpa);
677         if (pmd_is_leaf(*pmd)) {
678                 unsigned long lgpa = gpa & PMD_MASK;
679 
680                 /* Check if we raced and someone else has set the same thing */
681                 if (level == 1) {
682                         if (pmd_raw(*pmd) == pte_raw(pte)) {
683                                 ret = 0;
684                                 goto out_unlock;
685                         }
686                         /* Valid 2MB page here already, add our extra bits */
687                         WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
688                                                         PTE_BITS_MUST_MATCH);
689                         kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
690                                         0, pte_val(pte), lgpa, PMD_SHIFT);
691                         ret = 0;
692                         goto out_unlock;
693                 }
694 
695                 /*
696                  * If we raced with another CPU which has just put
697                  * a 2MB pte in after we saw a pte page, try again.
698                  */
699                 if (!new_ptep) {
700                         ret = -EAGAIN;
701                         goto out_unlock;
702                 }
703                 /* Valid 2MB page here already, remove it */
704                 kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
705                                  lpid);
706         }
707         if (level == 1) {
708                 if (!pmd_none(*pmd)) {
709                         /*
710                          * There's a page table page here, but we wanted to
711                          * install a large page, so remove and free the page
712                          * table page.
713                          */
714                         kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
715                 }
716                 kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
717                 if (rmapp && n_rmap)
718                         kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
719                 ret = 0;
720                 goto out_unlock;
721         }
722         if (pmd_none(*pmd)) {
723                 if (!new_ptep)
724                         goto out_unlock;
725                 pmd_populate(kvm->mm, pmd, new_ptep);
726                 new_ptep = NULL;
727         }
728         ptep = pte_offset_kernel(pmd, gpa);
729         if (pte_present(*ptep)) {
730                 /* Check if someone else set the same thing */
731                 if (pte_raw(*ptep) == pte_raw(pte)) {
732                         ret = 0;
733                         goto out_unlock;
734                 }
735                 /* Valid page here already, add our extra bits */
736                 WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
737                                                         PTE_BITS_MUST_MATCH);
738                 kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
739                 ret = 0;
740                 goto out_unlock;
741         }
742         kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
743         if (rmapp && n_rmap)
744                 kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
745         ret = 0;
746 
747  out_unlock:
748         spin_unlock(&kvm->mmu_lock);
749         if (new_pud)
750                 pud_free(kvm->mm, new_pud);
751         if (new_pmd)
752                 kvmppc_pmd_free(new_pmd);
753         if (new_ptep)
754                 kvmppc_pte_free(new_ptep);
755         return ret;
756 }
757 
758 bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
759                              unsigned long gpa, unsigned int lpid)
760 {
761         unsigned long pgflags;
762         unsigned int shift;
763         pte_t *ptep;
764 
765         /*
766          * Need to set an R or C bit in the 2nd-level tables;
767          * since we are just helping out the hardware here,
768          * it is sufficient to do what the hardware does.
769          */
770         pgflags = _PAGE_ACCESSED;
771         if (writing)
772                 pgflags |= _PAGE_DIRTY;
773 
774         if (nested)
775                 ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
776         else
777                 ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
778 
779         if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
780                 kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
781                 return true;
782         }
783         return false;
784 }
785 
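/*
 * Fault in the host page backing guest real address gpa and insert a
 * matching PTE into the partition-scoped tree.  The PTE that was inserted
 * and its level are optionally returned via inserted_pte and levelp for
 * callers that need to mirror the mapping.
 */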
786 int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
787                                    unsigned long gpa,
788                                    struct kvm_memory_slot *memslot,
789                                    bool writing, bool kvm_ro,
790                                    pte_t *inserted_pte, unsigned int *levelp)
791 {
792         struct kvm *kvm = vcpu->kvm;
793         struct page *page = NULL;
794         unsigned long mmu_seq;
795         unsigned long hva, gfn = gpa >> PAGE_SHIFT;
796         bool upgrade_write = false;
797         bool *upgrade_p = &upgrade_write;
798         pte_t pte, *ptep;
799         unsigned int shift, level;
800         int ret;
801         bool large_enable;
802 
803         /* used to check for invalidations in progress */
804         mmu_seq = kvm->mmu_notifier_seq;
805         smp_rmb();
806 
807         /*
808          * Do a fast check first, since __gfn_to_pfn_memslot doesn't
809          * do it with !atomic && !async, which is how we call it.
810          * We always ask for write permission since the common case
811          * is that the page is writable.
812          */
813         hva = gfn_to_hva_memslot(memslot, gfn);
814         if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
815                 upgrade_write = true;
816         } else {
817                 unsigned long pfn;
818 
819                 /* Call KVM generic code to do the slow-path check */
820                 pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
821                                            writing, upgrade_p);
822                 if (is_error_noslot_pfn(pfn))
823                         return -EFAULT;
824                 page = NULL;
825                 if (pfn_valid(pfn)) {
826                         page = pfn_to_page(pfn);
827                         if (PageReserved(page))
828                                 page = NULL;
829                 }
830         }
831 
832         /*
833          * Read the PTE from the process' radix tree and use that
834          * so we get the shift and attribute bits.
835          */
836         spin_lock(&kvm->mmu_lock);
837         ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
838         pte = __pte(0);
839         if (ptep)
840                 pte = READ_ONCE(*ptep);
841         spin_unlock(&kvm->mmu_lock);
842         /*
843          * If the PTE disappeared temporarily due to a THP
844          * collapse, just return and let the guest try again.
845          */
846         if (!pte_present(pte)) {
847                 if (page)
848                         put_page(page);
849                 return RESUME_GUEST;
850         }
851 
852         /* If we're logging dirty pages, always map single pages */
853         large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);
854 
855         /* Get pte level from shift/size */
856         if (large_enable && shift == PUD_SHIFT &&
857             (gpa & (PUD_SIZE - PAGE_SIZE)) ==
858             (hva & (PUD_SIZE - PAGE_SIZE))) {
859                 level = 2;
860         } else if (large_enable && shift == PMD_SHIFT &&
861                    (gpa & (PMD_SIZE - PAGE_SIZE)) ==
862                    (hva & (PMD_SIZE - PAGE_SIZE))) {
863                 level = 1;
864         } else {
865                 level = 0;
866                 if (shift > PAGE_SHIFT) {
867                         /*
868                          * If the pte maps more than one page, bring over
869                          * bits from the virtual address to get the real
870                          * address of the specific single page we want.
871                          */
872                         unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
873                         pte = __pte(pte_val(pte) | (hva & rpnmask));
874                 }
875         }
876 
877         pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
878         if (writing || upgrade_write) {
879                 if (pte_val(pte) & _PAGE_WRITE)
880                         pte = __pte(pte_val(pte) | _PAGE_DIRTY);
881         } else {
882                 pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
883         }
884 
885         /* Allocate space in the tree and write the PTE */
886         ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
887                                 mmu_seq, kvm->arch.lpid, NULL, NULL);
888         if (inserted_pte)
889                 *inserted_pte = pte;
890         if (levelp)
891                 *levelp = level;
892 
893         if (page) {
894                 if (!ret && (pte_val(pte) & _PAGE_WRITE))
895                         set_page_dirty_lock(page);
896                 put_page(page);
897         }
898 
899         /* Increment number of large pages if we (successfully) inserted one */
900         if (!ret) {
901                 if (level == 1)
902                         kvm->stat.num_2M_pages++;
903                 else if (level == 2)
904                         kvm->stat.num_1G_pages++;
905         }
906 
907         return ret;
908 }
909 
910 int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
911                                    unsigned long ea, unsigned long dsisr)
912 {
913         struct kvm *kvm = vcpu->kvm;
914         unsigned long gpa, gfn;
915         struct kvm_memory_slot *memslot;
916         long ret;
917         bool writing = !!(dsisr & DSISR_ISSTORE);
918         bool kvm_ro = false;
919 
920         /* Check for unusual errors */
921         if (dsisr & DSISR_UNSUPP_MMU) {
922                 pr_err("KVM: Got unsupported MMU fault\n");
923                 return -EFAULT;
924         }
925         if (dsisr & DSISR_BADACCESS) {
926                 /* Reflect to the guest as DSI */
927                 pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
928                 kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
929                 return RESUME_GUEST;
930         }
931 
932         /* Translate the logical address */
933         gpa = vcpu->arch.fault_gpa & ~0xfffUL;
934         gpa &= ~0xF000000000000000ul;
935         gfn = gpa >> PAGE_SHIFT;
936         if (!(dsisr & DSISR_PRTABLE_FAULT))
937                 gpa |= ea & 0xfff;
938 
939         if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
940                 return kvmppc_send_page_to_uv(kvm, gfn);
941 
942         /* Get the corresponding memslot */
943         memslot = gfn_to_memslot(kvm, gfn);
944 
945         /* No memslot means it's an emulated MMIO region */
946         if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
947                 if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
948                              DSISR_SET_RC)) {
949                         /*
950                          * Bad address in guest page table tree, or other
951                          * unusual error - reflect it to the guest as DSI.
952                          */
953                         kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
954                         return RESUME_GUEST;
955                 }
956                 return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
957         }
958 
959         if (memslot->flags & KVM_MEM_READONLY) {
960                 if (writing) {
961                         /* give the guest a DSI */
962                         kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
963                                                        DSISR_PROTFAULT);
964                         return RESUME_GUEST;
965                 }
966                 kvm_ro = true;
967         }
968 
969         /* Failed to set the reference/change bits */
970         if (dsisr & DSISR_SET_RC) {
971                 spin_lock(&kvm->mmu_lock);
972                 if (kvmppc_hv_handle_set_rc(kvm, false, writing,
973                                             gpa, kvm->arch.lpid))
974                         dsisr &= ~DSISR_SET_RC;
975                 spin_unlock(&kvm->mmu_lock);
976 
977                 if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
978                                DSISR_PROTFAULT | DSISR_SET_RC)))
979                         return RESUME_GUEST;
980         }
981 
982         /* Try to insert a pte */
983         ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
984                                              kvm_ro, NULL, NULL);
985 
986         if (ret == 0 || ret == -EAGAIN)
987                 ret = RESUME_GUEST;
988         return ret;
989 }
990 
991 /* Called with kvm->mmu_lock held */
992 int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
993                     unsigned long gfn)
994 {
995         pte_t *ptep;
996         unsigned long gpa = gfn << PAGE_SHIFT;
997         unsigned int shift;
998 
999         if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
1000                 uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
1001                 return 0;
1002         }
1003 
1004         ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1005         if (ptep && pte_present(*ptep))
1006                 kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
1007                                  kvm->arch.lpid);
1008         return 0;
1009 }
1010 
1011 /* Called with kvm->mmu_lock held */
1012 int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1013                   unsigned long gfn)
1014 {
1015         pte_t *ptep;
1016         unsigned long gpa = gfn << PAGE_SHIFT;
1017         unsigned int shift;
1018         int ref = 0;
1019         unsigned long old, *rmapp;
1020 
1021         if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1022                 return ref;
1023 
1024         ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1025         if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
1026                 old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
1027                                               gpa, shift);
1028                 /* XXX need to flush tlb here? */
1029                 /* Also clear bit in ptes in shadow pgtable for nested guests */
1030                 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1031                 kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
1032                                                old & PTE_RPN_MASK,
1033                                                1UL << shift);
1034                 ref = 1;
1035         }
1036         return ref;
1037 }
1038 
1039 /* Called with kvm->mmu_lock held */
1040 int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1041                        unsigned long gfn)
1042 {
1043         pte_t *ptep;
1044         unsigned long gpa = gfn << PAGE_SHIFT;
1045         unsigned int shift;
1046         int ref = 0;
1047 
1048         if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1049                 return ref;
1050 
1051         ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1052         if (ptep && pte_present(*ptep) && pte_young(*ptep))
1053                 ref = 1;
1054         return ref;
1055 }
1056 
1057 /* Returns the number of PAGE_SIZE pages that are dirty */
1058 static int kvm_radix_test_clear_dirty(struct kvm *kvm,
1059                                 struct kvm_memory_slot *memslot, int pagenum)
1060 {
1061         unsigned long gfn = memslot->base_gfn + pagenum;
1062         unsigned long gpa = gfn << PAGE_SHIFT;
1063         pte_t *ptep, pte;
1064         unsigned int shift;
1065         int ret = 0;
1066         unsigned long old, *rmapp;
1067 
1068         if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1069                 return ret;
1070 
1071         /*
1072          * For performance reasons we don't hold kvm->mmu_lock while walking the
1073          * partition scoped table.
1074          */
1075         ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
1076         if (!ptep)
1077                 return 0;
1078 
1079         pte = READ_ONCE(*ptep);
1080         if (pte_present(pte) && pte_dirty(pte)) {
1081                 spin_lock(&kvm->mmu_lock);
1082                 /*
1083                  * Recheck the pte now that we hold the mmu_lock
1084                  */
1085                 if (pte_val(pte) != pte_val(*ptep)) {
1086                         /*
1087                          * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can
1088                          * only find PAGE_SIZE pte entries here. We can continue
1089                          * to use the pte addr returned by above page table
1090                          * walk.
1091                          */
1092                         if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
1093                                 spin_unlock(&kvm->mmu_lock);
1094                                 return 0;
1095                         }
1096                 }
1097 
1098                 ret = 1;
1099                 VM_BUG_ON(shift);
1100                 old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
1101                                               gpa, shift);
1102                 kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
1103                 /* Also clear bit in ptes in shadow pgtable for nested guests */
1104                 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1105                 kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
1106                                                old & PTE_RPN_MASK,
1107                                                1UL << shift);
1108                 spin_unlock(&kvm->mmu_lock);
1109         }
1110         return ret;
1111 }
1112 
1113 long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
1114                         struct kvm_memory_slot *memslot, unsigned long *map)
1115 {
1116         unsigned long i, j;
1117         int npages;
1118 
1119         for (i = 0; i < memslot->npages; i = j) {
1120                 npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
1121 
1122                 /*
1123                  * Note that if npages > 0 then i must be a multiple of npages,
1124                  * since huge pages are only used to back the guest at guest
1125                  * real addresses that are a multiple of their size.
1126                  * Since we have at most one PTE covering any given guest
1127                  * real address, if npages > 1 we can skip to i + npages.
1128                  */
1129                 j = i + 1;
1130                 if (npages) {
1131                         set_dirty_bits(map, i, npages);
1132                         j = i + npages;
1133                 }
1134         }
1135         return 0;
1136 }
1137 
1138 void kvmppc_radix_flush_memslot(struct kvm *kvm,
1139                                 const struct kvm_memory_slot *memslot)
1140 {
1141         unsigned long n;
1142         pte_t *ptep;
1143         unsigned long gpa;
1144         unsigned int shift;
1145 
1146         if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
1147                 kvmppc_uvmem_drop_pages(memslot, kvm, true);
1148 
1149         if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1150                 return;
1151 
1152         gpa = memslot->base_gfn << PAGE_SHIFT;
1153         spin_lock(&kvm->mmu_lock);
1154         for (n = memslot->npages; n; --n) {
1155                 ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1156                 if (ptep && pte_present(*ptep))
1157                         kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
1158                                          kvm->arch.lpid);
1159                 gpa += PAGE_SIZE;
1160         }
1161         /*
1162          * Increase the mmu notifier sequence number to prevent any page
1163          * fault that read the memslot earlier from writing a PTE.
1164          */
1165         kvm->mmu_notifier_seq++;
1166         spin_unlock(&kvm->mmu_lock);
1167 }
1168 
1169 static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
1170                                  int psize, int *indexp)
1171 {
1172         if (!mmu_psize_defs[psize].shift)
1173                 return;
1174         info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
1175                 (mmu_psize_defs[psize].ap << 29);
1176         ++(*indexp);
1177 }
1178 
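/* Fill in the radix MMU geometry information returned by KVM_PPC_GET_RMMU_INFO. */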
1179 int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
1180 {
1181         int i;
1182 
1183         if (!radix_enabled())
1184                 return -EINVAL;
1185         memset(info, 0, sizeof(*info));
1186 
1187         /* 4k page size */
1188         info->geometries[0].page_shift = 12;
1189         info->geometries[0].level_bits[0] = 9;
1190         for (i = 1; i < 4; ++i)
1191                 info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
1192         /* 64k page size */
1193         info->geometries[1].page_shift = 16;
1194         for (i = 0; i < 4; ++i)
1195                 info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];
1196 
1197         i = 0;
1198         add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
1199         add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
1200         add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
1201         add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);
1202 
1203         return 0;
1204 }
1205 
1206 int kvmppc_init_vm_radix(struct kvm *kvm)
1207 {
1208         kvm->arch.pgtable = pgd_alloc(kvm->mm);
1209         if (!kvm->arch.pgtable)
1210                 return -ENOMEM;
1211         return 0;
1212 }
1213 
1214 static void pte_ctor(void *addr)
1215 {
1216         memset(addr, 0, RADIX_PTE_TABLE_SIZE);
1217 }
1218 
1219 static void pmd_ctor(void *addr)
1220 {
1221         memset(addr, 0, RADIX_PMD_TABLE_SIZE);
1222 }
1223 
1224 struct debugfs_radix_state {
1225         struct kvm      *kvm;
1226         struct mutex    mutex;
1227         unsigned long   gpa;
1228         int             lpid;
1229         int             chars_left;
1230         int             buf_index;
1231         char            buf[128];
1232         u8              hdr;
1233 };
1234 
1235 static int debugfs_radix_open(struct inode *inode, struct file *file)
1236 {
1237         struct kvm *kvm = inode->i_private;
1238         struct debugfs_radix_state *p;
1239 
1240         p = kzalloc(sizeof(*p), GFP_KERNEL);
1241         if (!p)
1242                 return -ENOMEM;
1243 
1244         kvm_get_kvm(kvm);
1245         p->kvm = kvm;
1246         mutex_init(&p->mutex);
1247         file->private_data = p;
1248 
1249         return nonseekable_open(inode, file);
1250 }
1251 
1252 static int debugfs_radix_release(struct inode *inode, struct file *file)
1253 {
1254         struct debugfs_radix_state *p = file->private_data;
1255 
1256         kvm_put_kvm(p->kvm);
1257         kfree(p);
1258         return 0;
1259 }
1260 
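/*
 * Dump the partition-scoped radix tree of the L1 guest and of any nested
 * guests' shadow page tables, one leaf PTE per line in the form
 * "gpa: pte shift".
 */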
1261 static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
1262                                  size_t len, loff_t *ppos)
1263 {
1264         struct debugfs_radix_state *p = file->private_data;
1265         ssize_t ret, r;
1266         unsigned long n;
1267         struct kvm *kvm;
1268         unsigned long gpa;
1269         pgd_t *pgt;
1270         struct kvm_nested_guest *nested;
1271         pgd_t *pgdp;
1272         p4d_t p4d, *p4dp;
1273         pud_t pud, *pudp;
1274         pmd_t pmd, *pmdp;
1275         pte_t *ptep;
1276         int shift;
1277         unsigned long pte;
1278 
1279         kvm = p->kvm;
1280         if (!kvm_is_radix(kvm))
1281                 return 0;
1282 
1283         ret = mutex_lock_interruptible(&p->mutex);
1284         if (ret)
1285                 return ret;
1286 
1287         if (p->chars_left) {
1288                 n = p->chars_left;
1289                 if (n > len)
1290                         n = len;
1291                 r = copy_to_user(buf, p->buf + p->buf_index, n);
1292                 n -= r;
1293                 p->chars_left -= n;
1294                 p->buf_index += n;
1295                 buf += n;
1296                 len -= n;
1297                 ret = n;
1298                 if (r) {
1299                         if (!n)
1300                                 ret = -EFAULT;
1301                         goto out;
1302                 }
1303         }
1304 
1305         gpa = p->gpa;
1306         nested = NULL;
1307         pgt = NULL;
1308         while (len != 0 && p->lpid >= 0) {
1309                 if (gpa >= RADIX_PGTABLE_RANGE) {
1310                         gpa = 0;
1311                         pgt = NULL;
1312                         if (nested) {
1313                                 kvmhv_put_nested(nested);
1314                                 nested = NULL;
1315                         }
1316                         p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
1317                         p->hdr = 0;
1318                         if (p->lpid < 0)
1319                                 break;
1320                 }
1321                 if (!pgt) {
1322                         if (p->lpid == 0) {
1323                                 pgt = kvm->arch.pgtable;
1324                         } else {
1325                                 nested = kvmhv_get_nested(kvm, p->lpid, false);
1326                                 if (!nested) {
1327                                         gpa = RADIX_PGTABLE_RANGE;
1328                                         continue;
1329                                 }
1330                                 pgt = nested->shadow_pgtable;
1331                         }
1332                 }
1333                 n = 0;
1334                 if (!p->hdr) {
1335                         if (p->lpid > 0)
1336                                 n = scnprintf(p->buf, sizeof(p->buf),
1337                                               "\nNested LPID %d: ", p->lpid);
1338                         n += scnprintf(p->buf + n, sizeof(p->buf) - n,
1339                                       "pgdir: %lx\n", (unsigned long)pgt);
1340                         p->hdr = 1;
1341                         goto copy;
1342                 }
1343 
1344                 pgdp = pgt + pgd_index(gpa);
1345                 p4dp = p4d_offset(pgdp, gpa);
1346                 p4d = READ_ONCE(*p4dp);
1347                 if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
1348                         gpa = (gpa & P4D_MASK) + P4D_SIZE;
1349                         continue;
1350                 }
1351 
1352                 pudp = pud_offset(&p4d, gpa);
1353                 pud = READ_ONCE(*pudp);
1354                 if (!(pud_val(pud) & _PAGE_PRESENT)) {
1355                         gpa = (gpa & PUD_MASK) + PUD_SIZE;
1356                         continue;
1357                 }
1358                 if (pud_val(pud) & _PAGE_PTE) {
1359                         pte = pud_val(pud);
1360                         shift = PUD_SHIFT;
1361                         goto leaf;
1362                 }
1363 
1364                 pmdp = pmd_offset(&pud, gpa);
1365                 pmd = READ_ONCE(*pmdp);
1366                 if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
1367                         gpa = (gpa & PMD_MASK) + PMD_SIZE;
1368                         continue;
1369                 }
1370                 if (pmd_val(pmd) & _PAGE_PTE) {
1371                         pte = pmd_val(pmd);
1372                         shift = PMD_SHIFT;
1373                         goto leaf;
1374                 }
1375 
1376                 ptep = pte_offset_kernel(&pmd, gpa);
1377                 pte = pte_val(READ_ONCE(*ptep));
1378                 if (!(pte & _PAGE_PRESENT)) {
1379                         gpa += PAGE_SIZE;
1380                         continue;
1381                 }
1382                 shift = PAGE_SHIFT;
1383         leaf:
1384                 n = scnprintf(p->buf, sizeof(p->buf),
1385                               " %lx: %lx %d\n", gpa, pte, shift);
1386                 gpa += 1ul << shift;
1387         copy:
1388                 p->chars_left = n;
1389                 if (n > len)
1390                         n = len;
1391                 r = copy_to_user(buf, p->buf, n);
1392                 n -= r;
1393                 p->chars_left -= n;
1394                 p->buf_index = n;
1395                 buf += n;
1396                 len -= n;
1397                 ret += n;
1398                 if (r) {
1399                         if (!ret)
1400                                 ret = -EFAULT;
1401                         break;
1402                 }
1403         }
1404         p->gpa = gpa;
1405         if (nested)
1406                 kvmhv_put_nested(nested);
1407 
1408  out:
1409         mutex_unlock(&p->mutex);
1410         return ret;
1411 }
1412 
1413 static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
1414                            size_t len, loff_t *ppos)
1415 {
1416         return -EACCES;
1417 }
1418 
1419 static const struct file_operations debugfs_radix_fops = {
1420         .owner   = THIS_MODULE,
1421         .open    = debugfs_radix_open,
1422         .release = debugfs_radix_release,
1423         .read    = debugfs_radix_read,
1424         .write   = debugfs_radix_write,
1425         .llseek  = generic_file_llseek,
1426 };
1427 
1428 void kvmhv_radix_debugfs_init(struct kvm *kvm)
1429 {
1430         debugfs_create_file("radix", 0400, kvm->arch.debugfs_dir, kvm,
1431                             &debugfs_radix_fops);
1432 }
1433 
1434 int kvmppc_radix_init(void)
1435 {
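        /* Size of a radix PTE page: 2^RADIX_PTE_INDEX_SIZE entries of 8 bytes each. */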
1436         unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
1437 
1438         kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
1439         if (!kvm_pte_cache)
1440                 return -ENOMEM;
1441 
1442         size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;
1443 
1444         kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
1445         if (!kvm_pmd_cache) {
1446                 kmem_cache_destroy(kvm_pte_cache);
1447                 return -ENOMEM;
1448         }
1449 
1450         return 0;
1451 }
1452 
1453 void kvmppc_radix_exit(void)
1454 {
1455         kmem_cache_destroy(kvm_pte_cache);
1456         kmem_cache_destroy(kvm_pmd_cache);
1457 }
1458 
