
TOMOYO Linux Cross Reference
Linux/arch/x86/kvm/mmu/mmu.c


  1 // SPDX-License-Identifier: GPL-2.0-only
  2 /*
  3  * Kernel-based Virtual Machine driver for Linux
  4  *
  5  * This module enables machines with Intel VT-x extensions to run virtual
  6  * machines without emulation or binary translation.
  7  *
  8  * MMU support
  9  *
 10  * Copyright (C) 2006 Qumranet, Inc.
 11  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 12  *
 13  * Authors:
 14  *   Yaniv Kamay  <yaniv@qumranet.com>
 15  *   Avi Kivity   <avi@qumranet.com>
 16  */
 17 
 18 #include "irq.h"
 19 #include "mmu.h"
 20 #include "x86.h"
 21 #include "kvm_cache_regs.h"
 22 #include "cpuid.h"
 23 
 24 #include <linux/kvm_host.h>
 25 #include <linux/types.h>
 26 #include <linux/string.h>
 27 #include <linux/mm.h>
 28 #include <linux/highmem.h>
 29 #include <linux/moduleparam.h>
 30 #include <linux/export.h>
 31 #include <linux/swap.h>
 32 #include <linux/hugetlb.h>
 33 #include <linux/compiler.h>
 34 #include <linux/srcu.h>
 35 #include <linux/slab.h>
 36 #include <linux/sched/signal.h>
 37 #include <linux/uaccess.h>
 38 #include <linux/hash.h>
 39 #include <linux/kern_levels.h>
 40 #include <linux/kthread.h>
 41 
 42 #include <asm/page.h>
 43 #include <asm/memtype.h>
 44 #include <asm/cmpxchg.h>
 45 #include <asm/e820/api.h>
 46 #include <asm/io.h>
 47 #include <asm/vmx.h>
 48 #include <asm/kvm_page_track.h>
 49 #include "trace.h"
 50 
 51 extern bool itlb_multihit_kvm_mitigation;
 52 
 53 static int __read_mostly nx_huge_pages = -1;
 54 #ifdef CONFIG_PREEMPT_RT
 55 /* Recovery can cause latency spikes, disable it for PREEMPT_RT.  */
 56 static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
 57 #else
 58 static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
 59 #endif
 60 
 61 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
 62 static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
 63 
 64 static struct kernel_param_ops nx_huge_pages_ops = {
 65         .set = set_nx_huge_pages,
 66         .get = param_get_bool,
 67 };
 68 
 69 static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
 70         .set = set_nx_huge_pages_recovery_ratio,
 71         .get = param_get_uint,
 72 };
 73 
 74 module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
 75 __MODULE_PARM_TYPE(nx_huge_pages, "bool");
 76 module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
 77                 &nx_huge_pages_recovery_ratio, 0644);
 78 __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
 79 
 80 /*
 81  * When set to true, this enables Two-Dimensional Paging (TDP), where
 82  * the hardware walks two page tables:
 83  * 1. the guest-virtual to guest-physical translation, and
 84  * 2. while doing 1., the guest-physical to host-physical translation.
 85  * If the hardware supports TDP, shadow paging is not needed.
 86  */
 87 bool tdp_enabled = false;
 88 
 89 enum {
 90         AUDIT_PRE_PAGE_FAULT,
 91         AUDIT_POST_PAGE_FAULT,
 92         AUDIT_PRE_PTE_WRITE,
 93         AUDIT_POST_PTE_WRITE,
 94         AUDIT_PRE_SYNC,
 95         AUDIT_POST_SYNC
 96 };
 97 
 98 #undef MMU_DEBUG
 99 
100 #ifdef MMU_DEBUG
101 static bool dbg = 0;
102 module_param(dbg, bool, 0644);
103 
104 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
105 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
106 #define MMU_WARN_ON(x) WARN_ON(x)
107 #else
108 #define pgprintk(x...) do { } while (0)
109 #define rmap_printk(x...) do { } while (0)
110 #define MMU_WARN_ON(x) do { } while (0)
111 #endif
112 
113 #define PTE_PREFETCH_NUM                8
114 
115 #define PT_FIRST_AVAIL_BITS_SHIFT 10
116 #define PT64_SECOND_AVAIL_BITS_SHIFT 54
117 
118 /*
119  * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
120  * Access Tracking SPTEs.
121  */
122 #define SPTE_SPECIAL_MASK (3ULL << 52)
123 #define SPTE_AD_ENABLED_MASK (0ULL << 52)
124 #define SPTE_AD_DISABLED_MASK (1ULL << 52)
125 #define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
126 #define SPTE_MMIO_MASK (3ULL << 52)
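    /*
     * Bits 52-53 thus form a two-bit type field for "special" SPTEs:
     * 0 = A/D bits enabled, 1 = A/D bits disabled (access tracking),
     * 2 = A/D bits enabled but write-protection still tracks dirty pages
     * (see spte_ad_need_write_protect()), 3 = MMIO SPTE.
     */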
127 
128 #define PT64_LEVEL_BITS 9
129 
130 #define PT64_LEVEL_SHIFT(level) \
131                 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
132 
133 #define PT64_INDEX(address, level)\
134         (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
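    /*
     * With 4 KiB pages and 9 bits per level, PT64_LEVEL_SHIFT(level) is
     * 12, 21, 30 and 39 for levels 1-4, so e.g. PT64_INDEX(addr, 2) is
     * (addr >> 21) & 0x1ff, the index into the level-2 page table.
     */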
135 
136 
137 #define PT32_LEVEL_BITS 10
138 
139 #define PT32_LEVEL_SHIFT(level) \
140                 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
141 
142 #define PT32_LVL_OFFSET_MASK(level) \
143         (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
144                                                 * PT32_LEVEL_BITS))) - 1))
145 
146 #define PT32_INDEX(address, level)\
147         (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
148 
149 
150 #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
151 #define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
152 #else
153 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
154 #endif
155 #define PT64_LVL_ADDR_MASK(level) \
156         (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
157                                                 * PT64_LEVEL_BITS))) - 1))
158 #define PT64_LVL_OFFSET_MASK(level) \
159         (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
160                                                 * PT64_LEVEL_BITS))) - 1))
161 
162 #define PT32_BASE_ADDR_MASK PAGE_MASK
163 #define PT32_DIR_BASE_ADDR_MASK \
164         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
165 #define PT32_LVL_ADDR_MASK(level) \
166         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
167                                             * PT32_LEVEL_BITS))) - 1))
168 
169 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
170                         | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
171 
172 #define ACC_EXEC_MASK    1
173 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
174 #define ACC_USER_MASK    PT_USER_MASK
175 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
176 
177 /* The mask for the R/X bits in EPT PTEs */
178 #define PT64_EPT_READABLE_MASK                  0x1ull
179 #define PT64_EPT_EXECUTABLE_MASK                0x4ull
180 
181 #include <trace/events/kvm.h>
182 
183 #define SPTE_HOST_WRITEABLE     (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
184 #define SPTE_MMU_WRITEABLE      (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
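    /*
     * SPTE_HOST_WRITEABLE is set when the corresponding host mapping is
     * writable; SPTE_MMU_WRITEABLE is set when KVM itself may make the
     * spte writable, and is cleared when write-protecting for shadow
     * paging (see spte_write_protect() and
     * spte_can_locklessly_be_made_writable()).
     */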
185 
186 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
187 
188 /* make pte_list_desc fit well in a cache line */
189 #define PTE_LIST_EXT 3
190 
191 /*
192  * Return values of handle_mmio_page_fault and mmu.page_fault:
193  * RET_PF_RETRY: let CPU fault again on the address.
194  * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
195  *
196  * For handle_mmio_page_fault only:
197  * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
198  */
199 enum {
200         RET_PF_RETRY = 0,
201         RET_PF_EMULATE = 1,
202         RET_PF_INVALID = 2,
203 };
204 
205 struct pte_list_desc {
206         u64 *sptes[PTE_LIST_EXT];
207         struct pte_list_desc *more;
208 };
209 
210 struct kvm_shadow_walk_iterator {
211         u64 addr;
212         hpa_t shadow_addr;
213         u64 *sptep;
214         int level;
215         unsigned index;
216 };
217 
218 static const union kvm_mmu_page_role mmu_base_role_mask = {
219         .cr0_wp = 1,
220         .gpte_is_8_bytes = 1,
221         .nxe = 1,
222         .smep_andnot_wp = 1,
223         .smap_andnot_wp = 1,
224         .smm = 1,
225         .guest_mode = 1,
226         .ad_disabled = 1,
227 };
228 
229 #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
230         for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
231                                          (_root), (_addr));                \
232              shadow_walk_okay(&(_walker));                                 \
233              shadow_walk_next(&(_walker)))
234 
235 #define for_each_shadow_entry(_vcpu, _addr, _walker)            \
236         for (shadow_walk_init(&(_walker), _vcpu, _addr);        \
237              shadow_walk_okay(&(_walker));                      \
238              shadow_walk_next(&(_walker)))
239 
240 #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)     \
241         for (shadow_walk_init(&(_walker), _vcpu, _addr);                \
242              shadow_walk_okay(&(_walker)) &&                            \
243                 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });  \
244              __shadow_walk_next(&(_walker), spte))
245 
246 static struct kmem_cache *pte_list_desc_cache;
247 static struct kmem_cache *mmu_page_header_cache;
248 static struct percpu_counter kvm_total_used_mmu_pages;
249 
250 static u64 __read_mostly shadow_nx_mask;
251 static u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */
252 static u64 __read_mostly shadow_user_mask;
253 static u64 __read_mostly shadow_accessed_mask;
254 static u64 __read_mostly shadow_dirty_mask;
255 static u64 __read_mostly shadow_mmio_mask;
256 static u64 __read_mostly shadow_mmio_value;
257 static u64 __read_mostly shadow_mmio_access_mask;
258 static u64 __read_mostly shadow_present_mask;
259 static u64 __read_mostly shadow_me_mask;
260 
261 /*
262  * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
263  * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
264  * pages.
265  */
266 static u64 __read_mostly shadow_acc_track_mask;
267 
268 /*
269  * The mask/shift to use for saving the original R/X bits when marking the PTE
270  * as not-present for access tracking purposes. We do not save the W bit as the
271  * PTEs being access tracked also need to be dirty tracked, so the W bit will be
272  * restored only when a write is attempted to the page.
273  */
274 static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
275                                                     PT64_EPT_EXECUTABLE_MASK;
276 static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
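    /*
     * For example, when an EPT PTE is marked for access tracking its R bit
     * (bit 0) is stashed in bit 54 and its X bit (bit 2) in bit 56;
     * restore_acc_track_spte() shifts them back down.
     */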
277 
278 /*
279  * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
280  * to guard against L1TF attacks.
281  */
282 static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
283 
284 /*
285  * The number of high-order 1 bits to use in the mask above.
286  */
287 static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
288 
289 /*
290  * In some cases, we need to preserve the GFN of a non-present or reserved
291  * SPTE when we usurp the upper five bits of the physical address space to
292  * defend against L1TF, e.g. for MMIO SPTEs.  To preserve the GFN, we'll
293  * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
294  * left into the reserved bits, i.e. the GFN in the SPTE will be split into
295  * high and low parts.  This mask covers the lower bits of the GFN.
296  */
297 static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
298 
299 /*
300  * The number of non-reserved physical address bits irrespective of features
301  * that repurpose legal bits, e.g. MKTME.
302  */
303 static u8 __read_mostly shadow_phys_bits;
304 
305 static void mmu_spte_set(u64 *sptep, u64 spte);
306 static bool is_executable_pte(u64 spte);
307 static union kvm_mmu_page_role
308 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
309 
310 #define CREATE_TRACE_POINTS
311 #include "mmutrace.h"
312 
313 
314 static inline bool kvm_available_flush_tlb_with_range(void)
315 {
316         return kvm_x86_ops->tlb_remote_flush_with_range;
317 }
318 
319 static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
320                 struct kvm_tlb_range *range)
321 {
322         int ret = -ENOTSUPP;
323 
324         if (range && kvm_x86_ops->tlb_remote_flush_with_range)
325                 ret = kvm_x86_ops->tlb_remote_flush_with_range(kvm, range);
326 
327         if (ret)
328                 kvm_flush_remote_tlbs(kvm);
329 }
330 
331 static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
332                 u64 start_gfn, u64 pages)
333 {
334         struct kvm_tlb_range range;
335 
336         range.start_gfn = start_gfn;
337         range.pages = pages;
338 
339         kvm_flush_remote_tlbs_with_range(kvm, &range);
340 }
341 
342 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask)
343 {
344         BUG_ON((u64)(unsigned)access_mask != access_mask);
345         BUG_ON((mmio_mask & mmio_value) != mmio_value);
346         WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
347         WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
348         shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
349         shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
350         shadow_mmio_access_mask = access_mask;
351 }
352 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
353 
354 static bool is_mmio_spte(u64 spte)
355 {
356         return (spte & shadow_mmio_mask) == shadow_mmio_value;
357 }
358 
359 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
360 {
361         return sp->role.ad_disabled;
362 }
363 
364 static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
365 {
366         /*
367          * When using the EPT page-modification log, the GPAs in the log
368          * would come from L2 rather than L1.  Therefore, we need to rely
369          * on write protection to record dirty pages.  This also bypasses
370          * PML, since writes now result in a vmexit.
371          */
372         return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
373 }
374 
375 static inline bool spte_ad_enabled(u64 spte)
376 {
377         MMU_WARN_ON(is_mmio_spte(spte));
378         return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
379 }
380 
381 static inline bool spte_ad_need_write_protect(u64 spte)
382 {
383         MMU_WARN_ON(is_mmio_spte(spte));
384         return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
385 }
386 
387 static bool is_nx_huge_page_enabled(void)
388 {
389         return READ_ONCE(nx_huge_pages);
390 }
391 
392 static inline u64 spte_shadow_accessed_mask(u64 spte)
393 {
394         MMU_WARN_ON(is_mmio_spte(spte));
395         return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
396 }
397 
398 static inline u64 spte_shadow_dirty_mask(u64 spte)
399 {
400         MMU_WARN_ON(is_mmio_spte(spte));
401         return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
402 }
403 
404 static inline bool is_access_track_spte(u64 spte)
405 {
406         return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
407 }
408 
409 /*
410  * Due to limited space in PTEs, the MMIO generation is an 18 bit subset of
411  * the memslots generation and is derived as follows:
412  *
413  * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
414  * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62
415  *
416  * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
417  * the MMIO generation number, as doing so would require stealing a bit from
418  * the "real" generation number and thus effectively halve the maximum number
419  * of MMIO generations that can be handled before encountering a wrap (which
420  * requires a full MMU zap).  The flag is instead explicitly queried when
421  * checking for MMIO spte cache hits.
422  */
423 #define MMIO_SPTE_GEN_MASK              GENMASK_ULL(17, 0)
424 
425 #define MMIO_SPTE_GEN_LOW_START         3
426 #define MMIO_SPTE_GEN_LOW_END           11
427 #define MMIO_SPTE_GEN_LOW_MASK          GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
428                                                     MMIO_SPTE_GEN_LOW_START)
429 
430 #define MMIO_SPTE_GEN_HIGH_START        PT64_SECOND_AVAIL_BITS_SHIFT
431 #define MMIO_SPTE_GEN_HIGH_END          62
432 #define MMIO_SPTE_GEN_HIGH_MASK         GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
433                                                     MMIO_SPTE_GEN_HIGH_START)
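    /*
     * For example, generation 0x201 (bits 0 and 9 set) is encoded as spte
     * bits 3 and 54; generation_mmio_spte_mask() builds this mask and
     * get_mmio_spte_generation() reverses it.
     */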
434 
435 static u64 generation_mmio_spte_mask(u64 gen)
436 {
437         u64 mask;
438 
439         WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
440         BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
441 
442         mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
443         mask |= ((gen >> 9) << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
444         return mask;
445 }
446 
447 static u64 get_mmio_spte_generation(u64 spte)
448 {
449         u64 gen;
450 
451         gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
452         gen |= ((spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START) << 9;
453         return gen;
454 }
455 
456 static u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
457 {
458 
459         u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
460         u64 mask = generation_mmio_spte_mask(gen);
461         u64 gpa = gfn << PAGE_SHIFT;
462 
463         access &= shadow_mmio_access_mask;
464         mask |= shadow_mmio_value | access;
465         mask |= gpa | shadow_nonpresent_or_rsvd_mask;
466         mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
467                 << shadow_nonpresent_or_rsvd_mask_len;
468 
469         return mask;
470 }
471 
472 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
473                            unsigned int access)
474 {
475         u64 mask = make_mmio_spte(vcpu, gfn, access);
476         unsigned int gen = get_mmio_spte_generation(mask);
477 
478         access = mask & ACC_ALL;
479 
480         trace_mark_mmio_spte(sptep, gfn, access, gen);
481         mmu_spte_set(sptep, mask);
482 }
483 
484 static gfn_t get_mmio_spte_gfn(u64 spte)
485 {
486         u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
487 
488         gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
489                & shadow_nonpresent_or_rsvd_mask;
490 
491         return gpa >> PAGE_SHIFT;
492 }
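    /*
     * For example, on an L1TF-affected CPU reporting 46 cache PA bits,
     * shadow_nonpresent_or_rsvd_mask covers GPA bits 41-45: make_mmio_spte()
     * stashes those GPA bits in spte bits 46-50 and get_mmio_spte_gfn()
     * shifts them back down before extracting the GFN.
     */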
493 
494 static unsigned get_mmio_spte_access(u64 spte)
495 {
496         return spte & shadow_mmio_access_mask;
497 }
498 
499 static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
500                           kvm_pfn_t pfn, unsigned int access)
501 {
502         if (unlikely(is_noslot_pfn(pfn))) {
503                 mark_mmio_spte(vcpu, sptep, gfn, access);
504                 return true;
505         }
506 
507         return false;
508 }
509 
510 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
511 {
512         u64 kvm_gen, spte_gen, gen;
513 
514         gen = kvm_vcpu_memslots(vcpu)->generation;
515         if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
516                 return false;
517 
518         kvm_gen = gen & MMIO_SPTE_GEN_MASK;
519         spte_gen = get_mmio_spte_generation(spte);
520 
521         trace_check_mmio_spte(spte, kvm_gen, spte_gen);
522         return likely(kvm_gen == spte_gen);
523 }
524 
525 /*
526  * Sets the shadow PTE masks used by the MMU.
527  *
528  * Assumptions:
529  *  - Setting either @accessed_mask or @dirty_mask requires setting both
530  *  - At least one of @accessed_mask or @acc_track_mask must be set
531  */
532 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
533                 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
534                 u64 acc_track_mask, u64 me_mask)
535 {
536         BUG_ON(!dirty_mask != !accessed_mask);
537         BUG_ON(!accessed_mask && !acc_track_mask);
538         BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
539 
540         shadow_user_mask = user_mask;
541         shadow_accessed_mask = accessed_mask;
542         shadow_dirty_mask = dirty_mask;
543         shadow_nx_mask = nx_mask;
544         shadow_x_mask = x_mask;
545         shadow_present_mask = p_mask;
546         shadow_acc_track_mask = acc_track_mask;
547         shadow_me_mask = me_mask;
548 }
549 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
550 
551 static u8 kvm_get_shadow_phys_bits(void)
552 {
553         /*
554          * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
555          * in CPU detection code, but the processor treats those reduced bits as
556          * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
557          * the physical address bits reported by CPUID.
558          */
559         if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
560                 return cpuid_eax(0x80000008) & 0xff;
561 
562         /*
563          * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
564          * custom CPUID.  Proceed with whatever the kernel found since these features
565          * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
566          */
567         return boot_cpu_data.x86_phys_bits;
568 }
569 
570 static void kvm_mmu_reset_all_pte_masks(void)
571 {
572         u8 low_phys_bits;
573 
574         shadow_user_mask = 0;
575         shadow_accessed_mask = 0;
576         shadow_dirty_mask = 0;
577         shadow_nx_mask = 0;
578         shadow_x_mask = 0;
579         shadow_mmio_mask = 0;
580         shadow_present_mask = 0;
581         shadow_acc_track_mask = 0;
582 
583         shadow_phys_bits = kvm_get_shadow_phys_bits();
584 
585         /*
586          * If the CPU has 46 or less physical address bits, then set an
587          * appropriate mask to guard against L1TF attacks. Otherwise, it is
588          * assumed that the CPU is not vulnerable to L1TF.
589          *
590          * Some Intel CPUs address the L1 cache using more PA bits than are
591          * reported by CPUID. Use the PA width of the L1 cache when possible
592          * to achieve more effective mitigation, e.g. if system RAM overlaps
593          * the most significant bits of legal physical address space.
594          */
595         shadow_nonpresent_or_rsvd_mask = 0;
596         low_phys_bits = boot_cpu_data.x86_phys_bits;
597         if (boot_cpu_has_bug(X86_BUG_L1TF) &&
598             !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
599                           52 - shadow_nonpresent_or_rsvd_mask_len)) {
600                 low_phys_bits = boot_cpu_data.x86_cache_bits
601                         - shadow_nonpresent_or_rsvd_mask_len;
602                 shadow_nonpresent_or_rsvd_mask =
603                         rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
604         }
605 
606         shadow_nonpresent_or_rsvd_lower_gfn_mask =
607                 GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
608 }
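    /*
     * Example: with shadow_nonpresent_or_rsvd_mask_len == 5 and an
     * L1TF-affected CPU reporting x86_cache_bits == 46, low_phys_bits
     * becomes 41, shadow_nonpresent_or_rsvd_mask covers PA bits 41-45 and
     * shadow_nonpresent_or_rsvd_lower_gfn_mask covers bits 12-40.
     */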
609 
610 static int is_cpuid_PSE36(void)
611 {
612         return 1;
613 }
614 
615 static int is_nx(struct kvm_vcpu *vcpu)
616 {
617         return vcpu->arch.efer & EFER_NX;
618 }
619 
620 static int is_shadow_present_pte(u64 pte)
621 {
622         return (pte != 0) && !is_mmio_spte(pte);
623 }
624 
625 static int is_large_pte(u64 pte)
626 {
627         return pte & PT_PAGE_SIZE_MASK;
628 }
629 
630 static int is_last_spte(u64 pte, int level)
631 {
632         if (level == PT_PAGE_TABLE_LEVEL)
633                 return 1;
634         if (is_large_pte(pte))
635                 return 1;
636         return 0;
637 }
638 
639 static bool is_executable_pte(u64 spte)
640 {
641         return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
642 }
643 
644 static kvm_pfn_t spte_to_pfn(u64 pte)
645 {
646         return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
647 }
648 
649 static gfn_t pse36_gfn_delta(u32 gpte)
650 {
651         int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
652 
653         return (gpte & PT32_DIR_PSE36_MASK) << shift;
654 }
655 
656 #ifdef CONFIG_X86_64
657 static void __set_spte(u64 *sptep, u64 spte)
658 {
659         WRITE_ONCE(*sptep, spte);
660 }
661 
662 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
663 {
664         WRITE_ONCE(*sptep, spte);
665 }
666 
667 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
668 {
669         return xchg(sptep, spte);
670 }
671 
672 static u64 __get_spte_lockless(u64 *sptep)
673 {
674         return READ_ONCE(*sptep);
675 }
676 #else
677 union split_spte {
678         struct {
679                 u32 spte_low;
680                 u32 spte_high;
681         };
682         u64 spte;
683 };
684 
685 static void count_spte_clear(u64 *sptep, u64 spte)
686 {
687         struct kvm_mmu_page *sp =  page_header(__pa(sptep));
688 
689         if (is_shadow_present_pte(spte))
690                 return;
691 
692         /* Ensure the spte is completely set before we increase the count */
693         smp_wmb();
694         sp->clear_spte_count++;
695 }
696 
697 static void __set_spte(u64 *sptep, u64 spte)
698 {
699         union split_spte *ssptep, sspte;
700 
701         ssptep = (union split_spte *)sptep;
702         sspte = (union split_spte)spte;
703 
704         ssptep->spte_high = sspte.spte_high;
705 
706         /*
707          * When mapping the spte from nonpresent to present, store the
708          * high bits first and only then set the present bit, so the CPU
709          * cannot fetch a half-written spte while we are setting it.
710          */
711         smp_wmb();
712 
713         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
714 }
715 
716 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
717 {
718         union split_spte *ssptep, sspte;
719 
720         ssptep = (union split_spte *)sptep;
721         sspte = (union split_spte)spte;
722 
723         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
724 
725         /*
726          * When mapping the spte from present to nonpresent, clear the
727          * present bit first so a vcpu cannot fetch the stale high bits.
728          */
729         smp_wmb();
730 
731         ssptep->spte_high = sspte.spte_high;
732         count_spte_clear(sptep, spte);
733 }
734 
735 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
736 {
737         union split_spte *ssptep, sspte, orig;
738 
739         ssptep = (union split_spte *)sptep;
740         sspte = (union split_spte)spte;
741 
742         /* xchg acts as a barrier before the setting of the high bits */
743         orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
744         orig.spte_high = ssptep->spte_high;
745         ssptep->spte_high = sspte.spte_high;
746         count_spte_clear(sptep, spte);
747 
748         return orig.spte;
749 }
750 
751 /*
752  * The idea of reading the spte locklessly ("the light way") on x86_32
753  * hosts comes from gup_get_pte (mm/gup.c).
754  *
755  * An spte TLB flush may be pending, because kvm_set_pte_rmapp
756  * coalesces them and we are running outside of the MMU lock.  Therefore
757  * we need to protect against in-progress updates of the spte.
758  *
759  * Reading the spte while an update is in progress may get the old value
760  * for the high part of the spte.  The race is fine for a present->non-present
761  * change (because the high part of the spte is ignored for non-present spte),
762  * but for a present->present change we must reread the spte.
763  *
764  * All such changes are done in two steps (present->non-present and
765  * non-present->present), hence it is enough to count the number of
766  * present->non-present updates: if it changed while reading the spte,
767  * we might have hit the race.  This is done using clear_spte_count.
768  */
769 static u64 __get_spte_lockless(u64 *sptep)
770 {
771         struct kvm_mmu_page *sp =  page_header(__pa(sptep));
772         union split_spte spte, *orig = (union split_spte *)sptep;
773         int count;
774 
775 retry:
776         count = sp->clear_spte_count;
777         smp_rmb();
778 
779         spte.spte_low = orig->spte_low;
780         smp_rmb();
781 
782         spte.spte_high = orig->spte_high;
783         smp_rmb();
784 
785         if (unlikely(spte.spte_low != orig->spte_low ||
786               count != sp->clear_spte_count))
787                 goto retry;
788 
789         return spte.spte;
790 }
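    /*
     * The above is effectively a seqcount-style retry loop: clear_spte_count
     * is sampled before the two halves are read and checked again afterwards
     * (together with a re-read of the low word), so a racing
     * present->non-present update forces a retry.
     */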
791 #endif
792 
793 static bool spte_can_locklessly_be_made_writable(u64 spte)
794 {
795         return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
796                 (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
797 }
798 
799 static bool spte_has_volatile_bits(u64 spte)
800 {
801         if (!is_shadow_present_pte(spte))
802                 return false;
803 
804         /*
805          * Always update the spte atomically if it can be updated
806          * outside of the mmu-lock: this ensures the dirty bit is not
807          * lost and gives us a stable is_writable_pte(), so that no
808          * TLB flush is missed.
809          */
810         if (spte_can_locklessly_be_made_writable(spte) ||
811             is_access_track_spte(spte))
812                 return true;
813 
814         if (spte_ad_enabled(spte)) {
815                 if ((spte & shadow_accessed_mask) == 0 ||
816                     (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
817                         return true;
818         }
819 
820         return false;
821 }
822 
823 static bool is_accessed_spte(u64 spte)
824 {
825         u64 accessed_mask = spte_shadow_accessed_mask(spte);
826 
827         return accessed_mask ? spte & accessed_mask
828                              : !is_access_track_spte(spte);
829 }
830 
831 static bool is_dirty_spte(u64 spte)
832 {
833         u64 dirty_mask = spte_shadow_dirty_mask(spte);
834 
835         return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
836 }
837 
838 /* Rules for using mmu_spte_set:
839  * Set the sptep from nonpresent to present.
840  * Note: the sptep being assigned *must* be either not present
841  * or in a state where the hardware will not attempt to update
842  * the spte.
843  */
844 static void mmu_spte_set(u64 *sptep, u64 new_spte)
845 {
846         WARN_ON(is_shadow_present_pte(*sptep));
847         __set_spte(sptep, new_spte);
848 }
849 
850 /*
851  * Update the SPTE (excluding the PFN), but do not track changes in its
852  * accessed/dirty status.
853  */
854 static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
855 {
856         u64 old_spte = *sptep;
857 
858         WARN_ON(!is_shadow_present_pte(new_spte));
859 
860         if (!is_shadow_present_pte(old_spte)) {
861                 mmu_spte_set(sptep, new_spte);
862                 return old_spte;
863         }
864 
865         if (!spte_has_volatile_bits(old_spte))
866                 __update_clear_spte_fast(sptep, new_spte);
867         else
868                 old_spte = __update_clear_spte_slow(sptep, new_spte);
869 
870         WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
871 
872         return old_spte;
873 }
874 
875 /* Rules for using mmu_spte_update:
876  * Update the state bits; the mapped pfn is not changed.
877  *
878  * Whenever we overwrite a writable spte with a read-only one we
879  * should flush remote TLBs, because otherwise rmap_write_protect
880  * will find a read-only spte even though the writable spte might
881  * still be cached in a CPU's TLB; the return value indicates this
882  * case.
883  *
884  * Returns true if the TLB needs to be flushed
885  */
886 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
887 {
888         bool flush = false;
889         u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
890 
891         if (!is_shadow_present_pte(old_spte))
892                 return false;
893 
894         /*
895          * Updating the spte outside of the mmu-lock is safe, since
896          * we always update it atomically; see the comments in
897          * spte_has_volatile_bits().
898          */
899         if (spte_can_locklessly_be_made_writable(old_spte) &&
900               !is_writable_pte(new_spte))
901                 flush = true;
902 
903         /*
904          * Flush TLB when accessed/dirty states are changed in the page tables,
905          * to guarantee consistency between TLB and page tables.
906          */
907 
908         if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
909                 flush = true;
910                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
911         }
912 
913         if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
914                 flush = true;
915                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
916         }
917 
918         return flush;
919 }
920 
921 /*
922  * Rules for using mmu_spte_clear_track_bits:
923  * It sets the sptep from present to nonpresent and tracks the
924  * state bits; it is used to clear the last-level sptep.
925  * Returns non-zero if the PTE was previously valid.
926  */
927 static int mmu_spte_clear_track_bits(u64 *sptep)
928 {
929         kvm_pfn_t pfn;
930         u64 old_spte = *sptep;
931 
932         if (!spte_has_volatile_bits(old_spte))
933                 __update_clear_spte_fast(sptep, 0ull);
934         else
935                 old_spte = __update_clear_spte_slow(sptep, 0ull);
936 
937         if (!is_shadow_present_pte(old_spte))
938                 return 0;
939 
940         pfn = spte_to_pfn(old_spte);
941 
942         /*
943          * KVM does not hold a refcount on the pages used by the
944          * kvm mmu, so before such a page can be reclaimed it must
945          * first be unmapped from the mmu.
946          */
947         WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
948 
949         if (is_accessed_spte(old_spte))
950                 kvm_set_pfn_accessed(pfn);
951 
952         if (is_dirty_spte(old_spte))
953                 kvm_set_pfn_dirty(pfn);
954 
955         return 1;
956 }
957 
958 /*
959  * Rules for using mmu_spte_clear_no_track:
960  * Directly clear the spte without caring about its state bits;
961  * it is used when zapping upper-level sptes.
962  */
963 static void mmu_spte_clear_no_track(u64 *sptep)
964 {
965         __update_clear_spte_fast(sptep, 0ull);
966 }
967 
968 static u64 mmu_spte_get_lockless(u64 *sptep)
969 {
970         return __get_spte_lockless(sptep);
971 }
972 
973 static u64 mark_spte_for_access_track(u64 spte)
974 {
975         if (spte_ad_enabled(spte))
976                 return spte & ~shadow_accessed_mask;
977 
978         if (is_access_track_spte(spte))
979                 return spte;
980 
981         /*
982          * Making an Access Tracking PTE will result in removal of write access
983          * from the PTE. So, verify that we will be able to restore the write
984          * access in the fast page fault path later on.
985          */
986         WARN_ONCE((spte & PT_WRITABLE_MASK) &&
987                   !spte_can_locklessly_be_made_writable(spte),
988                   "kvm: Writable SPTE is not locklessly dirty-trackable\n");
989 
990         WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
991                           shadow_acc_track_saved_bits_shift),
992                   "kvm: Access Tracking saved bit locations are not zero\n");
993 
994         spte |= (spte & shadow_acc_track_saved_bits_mask) <<
995                 shadow_acc_track_saved_bits_shift;
996         spte &= ~shadow_acc_track_mask;
997 
998         return spte;
999 }
1000 
1001 /* Restore an acc-track PTE back to a regular PTE */
1002 static u64 restore_acc_track_spte(u64 spte)
1003 {
1004         u64 new_spte = spte;
1005         u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
1006                          & shadow_acc_track_saved_bits_mask;
1007 
1008         WARN_ON_ONCE(spte_ad_enabled(spte));
1009         WARN_ON_ONCE(!is_access_track_spte(spte));
1010 
1011         new_spte &= ~shadow_acc_track_mask;
1012         new_spte &= ~(shadow_acc_track_saved_bits_mask <<
1013                       shadow_acc_track_saved_bits_shift);
1014         new_spte |= saved_bits;
1015 
1016         return new_spte;
1017 }
1018 
1019 /* Returns the Accessed status of the PTE and resets it at the same time. */
1020 static bool mmu_spte_age(u64 *sptep)
1021 {
1022         u64 spte = mmu_spte_get_lockless(sptep);
1023 
1024         if (!is_accessed_spte(spte))
1025                 return false;
1026 
1027         if (spte_ad_enabled(spte)) {
1028                 clear_bit((ffs(shadow_accessed_mask) - 1),
1029                           (unsigned long *)sptep);
1030         } else {
1031                 /*
1032                  * Capture the dirty status of the page, so that it doesn't get
1033                  * lost when the SPTE is marked for access tracking.
1034                  */
1035                 if (is_writable_pte(spte))
1036                         kvm_set_pfn_dirty(spte_to_pfn(spte));
1037 
1038                 spte = mark_spte_for_access_track(spte);
1039                 mmu_spte_update_no_track(sptep, spte);
1040         }
1041 
1042         return true;
1043 }
1044 
1045 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
1046 {
1047         /*
1048          * Prevent page table teardown by making any free-er wait during
1049          * kvm_flush_remote_tlbs() IPI to all active vcpus.
1050          */
1051         local_irq_disable();
1052 
1053         /*
1054          * Make sure a following spte read is not reordered ahead of the write
1055          * to vcpu->mode.
1056          */
1057         smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
1058 }
1059 
1060 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
1061 {
1062         /*
1063          * Make sure the write to vcpu->mode is not reordered in front of
1064          * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
1065          * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
1066          */
1067         smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
1068         local_irq_enable();
1069 }
1070 
1071 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
1072                                   struct kmem_cache *base_cache, int min)
1073 {
1074         void *obj;
1075 
1076         if (cache->nobjs >= min)
1077                 return 0;
1078         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
1079                 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
1080                 if (!obj)
1081                         return cache->nobjs >= min ? 0 : -ENOMEM;
1082                 cache->objects[cache->nobjs++] = obj;
1083         }
1084         return 0;
1085 }
1086 
1087 static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
1088 {
1089         return cache->nobjs;
1090 }
1091 
1092 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
1093                                   struct kmem_cache *cache)
1094 {
1095         while (mc->nobjs)
1096                 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
1097 }
1098 
1099 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
1100                                        int min)
1101 {
1102         void *page;
1103 
1104         if (cache->nobjs >= min)
1105                 return 0;
1106         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
1107                 page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
1108                 if (!page)
1109                         return cache->nobjs >= min ? 0 : -ENOMEM;
1110                 cache->objects[cache->nobjs++] = page;
1111         }
1112         return 0;
1113 }
1114 
1115 static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
1116 {
1117         while (mc->nobjs)
1118                 free_page((unsigned long)mc->objects[--mc->nobjs]);
1119 }
1120 
1121 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
1122 {
1123         int r;
1124 
1125         r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1126                                    pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
1127         if (r)
1128                 goto out;
1129         r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
1130         if (r)
1131                 goto out;
1132         r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
1133                                    mmu_page_header_cache, 4);
1134 out:
1135         return r;
1136 }
1137 
1138 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
1139 {
1140         mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1141                                 pte_list_desc_cache);
1142         mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
1143         mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
1144                                 mmu_page_header_cache);
1145 }
1146 
1147 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
1148 {
1149         void *p;
1150 
1151         BUG_ON(!mc->nobjs);
1152         p = mc->objects[--mc->nobjs];
1153         return p;
1154 }
1155 
1156 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
1157 {
1158         return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
1159 }
1160 
1161 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
1162 {
1163         kmem_cache_free(pte_list_desc_cache, pte_list_desc);
1164 }
1165 
1166 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
1167 {
1168         if (!sp->role.direct)
1169                 return sp->gfns[index];
1170 
1171         return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
1172 }
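     /*
      * For a direct shadow page with role.level == 2, kvm_mmu_page_get_gfn()
      * returns sp->gfn + (index << 9); for level 1 it is simply
      * sp->gfn + index.
      */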
1173 
1174 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
1175 {
1176         if (!sp->role.direct) {
1177                 sp->gfns[index] = gfn;
1178                 return;
1179         }
1180 
1181         if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
1182                 pr_err_ratelimited("gfn mismatch under direct page %llx "
1183                                    "(expected %llx, got %llx)\n",
1184                                    sp->gfn,
1185                                    kvm_mmu_page_get_gfn(sp, index), gfn);
1186 }
1187 
1188 /*
1189  * Return the pointer to the large page information for a given gfn,
1190  * handling slots that are not large page aligned.
1191  */
1192 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
1193                                               struct kvm_memory_slot *slot,
1194                                               int level)
1195 {
1196         unsigned long idx;
1197 
1198         idx = gfn_to_index(gfn, slot->base_gfn, level);
1199         return &slot->arch.lpage_info[level - 2][idx];
1200 }
1201 
1202 static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
1203                                             gfn_t gfn, int count)
1204 {
1205         struct kvm_lpage_info *linfo;
1206         int i;
1207 
1208         for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1209                 linfo = lpage_info_slot(gfn, slot, i);
1210                 linfo->disallow_lpage += count;
1211                 WARN_ON(linfo->disallow_lpage < 0);
1212         }
1213 }
1214 
1215 void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1216 {
1217         update_gfn_disallow_lpage_count(slot, gfn, 1);
1218 }
1219 
1220 void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1221 {
1222         update_gfn_disallow_lpage_count(slot, gfn, -1);
1223 }
1224 
1225 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1226 {
1227         struct kvm_memslots *slots;
1228         struct kvm_memory_slot *slot;
1229         gfn_t gfn;
1230 
1231         kvm->arch.indirect_shadow_pages++;
1232         gfn = sp->gfn;
1233         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1234         slot = __gfn_to_memslot(slots, gfn);
1235 
1236         /* Non-leaf shadow pages are kept write-protected (read-only). */
1237         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1238                 return kvm_slot_page_track_add_page(kvm, slot, gfn,
1239                                                     KVM_PAGE_TRACK_WRITE);
1240 
1241         kvm_mmu_gfn_disallow_lpage(slot, gfn);
1242 }
1243 
1244 static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1245 {
1246         if (sp->lpage_disallowed)
1247                 return;
1248 
1249         ++kvm->stat.nx_lpage_splits;
1250         list_add_tail(&sp->lpage_disallowed_link,
1251                       &kvm->arch.lpage_disallowed_mmu_pages);
1252         sp->lpage_disallowed = true;
1253 }
1254 
1255 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1256 {
1257         struct kvm_memslots *slots;
1258         struct kvm_memory_slot *slot;
1259         gfn_t gfn;
1260 
1261         kvm->arch.indirect_shadow_pages--;
1262         gfn = sp->gfn;
1263         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1264         slot = __gfn_to_memslot(slots, gfn);
1265         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1266                 return kvm_slot_page_track_remove_page(kvm, slot, gfn,
1267                                                        KVM_PAGE_TRACK_WRITE);
1268 
1269         kvm_mmu_gfn_allow_lpage(slot, gfn);
1270 }
1271 
1272 static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1273 {
1274         --kvm->stat.nx_lpage_splits;
1275         sp->lpage_disallowed = false;
1276         list_del(&sp->lpage_disallowed_link);
1277 }
1278 
1279 static struct kvm_memory_slot *
1280 gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
1281                             bool no_dirty_log)
1282 {
1283         struct kvm_memory_slot *slot;
1284 
1285         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1286         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1287                 return NULL;
1288         if (no_dirty_log && slot->dirty_bitmap)
1289                 return NULL;
1290 
1291         return slot;
1292 }
1293 
1294 /*
1295  * About rmap_head encoding:
1296  *
1297  * If the bit zero of rmap_head->val is clear, then it points to the only spte
1298  * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
1299  * pte_list_desc containing more mappings.
1300  */
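     /*
      * For example, a gfn mapped by a single spte has rmap_head->val ==
      * (unsigned long)sptep, with bit zero clear because sptes are 8-byte
      * aligned; once a second spte is added, pte_list_add() switches to
      * rmap_head->val == (unsigned long)desc | 1.
      */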
1301 
1302 /*
1303  * Returns the number of pointers in the rmap chain, not counting the new one.
1304  */
1305 static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
1306                         struct kvm_rmap_head *rmap_head)
1307 {
1308         struct pte_list_desc *desc;
1309         int i, count = 0;
1310 
1311         if (!rmap_head->val) {
1312                 rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
1313                 rmap_head->val = (unsigned long)spte;
1314         } else if (!(rmap_head->val & 1)) {
1315                 rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
1316                 desc = mmu_alloc_pte_list_desc(vcpu);
1317                 desc->sptes[0] = (u64 *)rmap_head->val;
1318                 desc->sptes[1] = spte;
1319                 rmap_head->val = (unsigned long)desc | 1;
1320                 ++count;
1321         } else {
1322                 rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
1323                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1324                 while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
1325                         desc = desc->more;
1326                         count += PTE_LIST_EXT;
1327                 }
1328                 if (desc->sptes[PTE_LIST_EXT-1]) {
1329                         desc->more = mmu_alloc_pte_list_desc(vcpu);
1330                         desc = desc->more;
1331                 }
1332                 for (i = 0; desc->sptes[i]; ++i)
1333                         ++count;
1334                 desc->sptes[i] = spte;
1335         }
1336         return count;
1337 }
1338 
1339 static void
1340 pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
1341                            struct pte_list_desc *desc, int i,
1342                            struct pte_list_desc *prev_desc)
1343 {
1344         int j;
1345 
1346         for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
1347                 ;
1348         desc->sptes[i] = desc->sptes[j];
1349         desc->sptes[j] = NULL;
1350         if (j != 0)
1351                 return;
1352         if (!prev_desc && !desc->more)
1353                 rmap_head->val = 0;
1354         else
1355                 if (prev_desc)
1356                         prev_desc->more = desc->more;
1357                 else
1358                         rmap_head->val = (unsigned long)desc->more | 1;
1359         mmu_free_pte_list_desc(desc);
1360 }
1361 
1362 static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
1363 {
1364         struct pte_list_desc *desc;
1365         struct pte_list_desc *prev_desc;
1366         int i;
1367 
1368         if (!rmap_head->val) {
1369                 pr_err("%s: %p 0->BUG\n", __func__, spte);
1370                 BUG();
1371         } else if (!(rmap_head->val & 1)) {
1372                 rmap_printk("%s:  %p 1->0\n", __func__, spte);
1373                 if ((u64 *)rmap_head->val != spte) {
1374                         pr_err("%s:  %p 1->BUG\n", __func__, spte);
1375                         BUG();
1376                 }
1377                 rmap_head->val = 0;
1378         } else {
1379                 rmap_printk("%s:  %p many->many\n", __func__, spte);
1380                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1381                 prev_desc = NULL;
1382                 while (desc) {
1383                         for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
1384                                 if (desc->sptes[i] == spte) {
1385                                         pte_list_desc_remove_entry(rmap_head,
1386                                                         desc, i, prev_desc);
1387                                         return;
1388                                 }
1389                         }
1390                         prev_desc = desc;
1391                         desc = desc->more;
1392                 }
1393                 pr_err("%s: %p many->many\n", __func__, spte);
1394                 BUG();
1395         }
1396 }
1397 
1398 static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
1399 {
1400         mmu_spte_clear_track_bits(sptep);
1401         __pte_list_remove(sptep, rmap_head);
1402 }
1403 
1404 static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
1405                                            struct kvm_memory_slot *slot)
1406 {
1407         unsigned long idx;
1408 
1409         idx = gfn_to_index(gfn, slot->base_gfn, level);
1410         return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
1411 }
1412 
1413 static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
1414                                          struct kvm_mmu_page *sp)
1415 {
1416         struct kvm_memslots *slots;
1417         struct kvm_memory_slot *slot;
1418 
1419         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1420         slot = __gfn_to_memslot(slots, gfn);
1421         return __gfn_to_rmap(gfn, sp->role.level, slot);
1422 }
1423 
1424 static bool rmap_can_add(struct kvm_vcpu *vcpu)
1425 {
1426         struct kvm_mmu_memory_cache *cache;
1427 
1428         cache = &vcpu->arch.mmu_pte_list_desc_cache;
1429         return mmu_memory_cache_free_objects(cache);
1430 }
1431 
1432 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1433 {
1434         struct kvm_mmu_page *sp;
1435         struct kvm_rmap_head *rmap_head;
1436 
1437         sp = page_header(__pa(spte));
1438         kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
1439         rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
1440         return pte_list_add(vcpu, spte, rmap_head);
1441 }
1442 
1443 static void rmap_remove(struct kvm *kvm, u64 *spte)
1444 {
1445         struct kvm_mmu_page *sp;
1446         gfn_t gfn;
1447         struct kvm_rmap_head *rmap_head;
1448 
1449         sp = page_header(__pa(spte));
1450         gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
1451         rmap_head = gfn_to_rmap(kvm, gfn, sp);
1452         __pte_list_remove(spte, rmap_head);
1453 }
1454 
1455 /*
1456  * Used by the following functions to iterate through the sptes linked by a
1457  * rmap.  All fields are private and not assumed to be used outside.
1458  */
1459 struct rmap_iterator {
1460         /* private fields */
1461         struct pte_list_desc *desc;     /* holds the sptep if not NULL */
1462         int pos;                        /* index of the sptep */
1463 };
1464 
1465 /*
1466  * Iteration must be started by this function.  This should also be used after
1467  * removing/dropping sptes from the rmap link because in such cases the
1468  * information in the iterator may not be valid.
1469  *
1470  * Returns sptep if found, NULL otherwise.
1471  */
1472 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1473                            struct rmap_iterator *iter)
1474 {
1475         u64 *sptep;
1476 
1477         if (!rmap_head->val)
1478                 return NULL;
1479 
1480         if (!(rmap_head->val & 1)) {
1481                 iter->desc = NULL;
1482                 sptep = (u64 *)rmap_head->val;
1483                 goto out;
1484         }
1485 
1486         iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1487         iter->pos = 0;
1488         sptep = iter->desc->sptes[iter->pos];
1489 out:
1490         BUG_ON(!is_shadow_present_pte(*sptep));
1491         return sptep;
1492 }
1493 
1494 /*
1495  * Must be used with a valid iterator: e.g. after rmap_get_first().
1496  *
1497  * Returns sptep if found, NULL otherwise.
1498  */
1499 static u64 *rmap_get_next(struct rmap_iterator *iter)
1500 {
1501         u64 *sptep;
1502 
1503         if (iter->desc) {
1504                 if (iter->pos < PTE_LIST_EXT - 1) {
1505                         ++iter->pos;
1506                         sptep = iter->desc->sptes[iter->pos];
1507                         if (sptep)
1508                                 goto out;
1509                 }
1510 
1511                 iter->desc = iter->desc->more;
1512 
1513                 if (iter->desc) {
1514                         iter->pos = 0;
1515                         /* desc->sptes[0] cannot be NULL */
1516                         sptep = iter->desc->sptes[iter->pos];
1517                         goto out;
1518                 }
1519         }
1520 
1521         return NULL;
1522 out:
1523         BUG_ON(!is_shadow_present_pte(*sptep));
1524         return sptep;
1525 }
1526 
1527 #define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)                 \
1528         for (_spte_ = rmap_get_first(_rmap_head_, _iter_);              \
1529              _spte_; _spte_ = rmap_get_next(_iter_))
1530 
1531 static void drop_spte(struct kvm *kvm, u64 *sptep)
1532 {
1533         if (mmu_spte_clear_track_bits(sptep))
1534                 rmap_remove(kvm, sptep);
1535 }
1536 
1537 
1538 static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1539 {
1540         if (is_large_pte(*sptep)) {
1541                 WARN_ON(page_header(__pa(sptep))->role.level ==
1542                         PT_PAGE_TABLE_LEVEL);
1543                 drop_spte(kvm, sptep);
1544                 --kvm->stat.lpages;
1545                 return true;
1546         }
1547 
1548         return false;
1549 }
1550 
1551 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1552 {
1553         if (__drop_large_spte(vcpu->kvm, sptep)) {
1554                 struct kvm_mmu_page *sp = page_header(__pa(sptep));
1555 
1556                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1557                         KVM_PAGES_PER_HPAGE(sp->role.level));
1558         }
1559 }
1560 
1561 /*
1562  * Write-protect the specified @sptep.  @pt_protect indicates whether the
1563  * write-protection is done to protect the shadow page table.
1564  *
1565  * Note: the write protection needed differs between dirty logging and spte
1566  * protection:
1567  * - for dirty logging, the spte can be made writable again at any time as
1568  *   long as its dirty bitmap is properly set.
1569  * - for spte protection, the spte can become writable only after the
1570  *   shadow page is unsynced.
1571  *
1572  * Return true if the TLB needs to be flushed.
1573  */
1574 static bool spte_write_protect(u64 *sptep, bool pt_protect)
1575 {
1576         u64 spte = *sptep;
1577 
1578         if (!is_writable_pte(spte) &&
1579               !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
1580                 return false;
1581 
1582         rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1583 
1584         if (pt_protect)
1585                 spte &= ~SPTE_MMU_WRITEABLE;
1586         spte = spte & ~PT_WRITABLE_MASK;
1587 
1588         return mmu_spte_update(sptep, spte);
1589 }
1590 
1591 static bool __rmap_write_protect(struct kvm *kvm,
1592                                  struct kvm_rmap_head *rmap_head,
1593                                  bool pt_protect)
1594 {
1595         u64 *sptep;
1596         struct rmap_iterator iter;
1597         bool flush = false;
1598 
1599         for_each_rmap_spte(rmap_head, &iter, sptep)
1600                 flush |= spte_write_protect(sptep, pt_protect);
1601 
1602         return flush;
1603 }
1604 
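/*
 * Editor's note (illustration, not in the original source): the two kinds of
 * callers of __rmap_write_protect() differ only in @pt_protect.  For dirty
 * logging (pt_protect == false) only PT_WRITABLE_MASK is cleared, so the
 * spte keeps SPTE_MMU_WRITEABLE and can be made writable again locklessly on
 * the next write.  For shadow page protection (pt_protect == true) both bits
 * are cleared, so write access can only come back by unsyncing the shadow
 * page, as the comment above spte_write_protect() explains.
 */
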
1605 static bool spte_clear_dirty(u64 *sptep)
1606 {
1607         u64 spte = *sptep;
1608 
1609         rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
1610 
1611         MMU_WARN_ON(!spte_ad_enabled(spte));
1612         spte &= ~shadow_dirty_mask;
1613         return mmu_spte_update(sptep, spte);
1614 }
1615 
1616 static bool spte_wrprot_for_clear_dirty(u64 *sptep)
1617 {
1618         bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1619                                                (unsigned long *)sptep);
1620         if (was_writable && !spte_ad_enabled(*sptep))
1621                 kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1622 
1623         return was_writable;
1624 }
1625 
1626 /*
1627  * Gets the GFN ready for another round of dirty logging by clearing the
1628  *      - D bit on ad-enabled SPTEs, and
1629  *      - W bit on ad-disabled SPTEs.
1630  * Returns true iff any D or W bits were cleared.
1631  */
1632 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1633 {
1634         u64 *sptep;
1635         struct rmap_iterator iter;
1636         bool flush = false;
1637 
1638         for_each_rmap_spte(rmap_head, &iter, sptep)
1639                 if (spte_ad_need_write_protect(*sptep))
1640                         flush |= spte_wrprot_for_clear_dirty(sptep);
1641                 else
1642                         flush |= spte_clear_dirty(sptep);
1643 
1644         return flush;
1645 }
1646 
1647 static bool spte_set_dirty(u64 *sptep)
1648 {
1649         u64 spte = *sptep;
1650 
1651         rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
1652 
1653         /*
1654          * Similar to the !kvm_x86_ops->slot_disable_log_dirty case,
1655          * do not bother adding back write access to pages marked
1656          * SPTE_AD_WRPROT_ONLY_MASK.
1657          */
1658         spte |= shadow_dirty_mask;
1659 
1660         return mmu_spte_update(sptep, spte);
1661 }
1662 
1663 static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1664 {
1665         u64 *sptep;
1666         struct rmap_iterator iter;
1667         bool flush = false;
1668 
1669         for_each_rmap_spte(rmap_head, &iter, sptep)
1670                 if (spte_ad_enabled(*sptep))
1671                         flush |= spte_set_dirty(sptep);
1672 
1673         return flush;
1674 }
1675 
1676 /**
1677  * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1678  * @kvm: kvm instance
1679  * @slot: slot to protect
1680  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1681  * @mask: indicates which pages we should protect
1682  *
1683  * Used when we do not need to care about huge page mappings: e.g. during dirty
1684  * logging we do not have any such mappings.
1685  */
1686 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1687                                      struct kvm_memory_slot *slot,
1688                                      gfn_t gfn_offset, unsigned long mask)
1689 {
1690         struct kvm_rmap_head *rmap_head;
1691 
1692         while (mask) {
1693                 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1694                                           PT_PAGE_TABLE_LEVEL, slot);
1695                 __rmap_write_protect(kvm, rmap_head, false);
1696 
1697                 /* clear the first set bit */
1698                 mask &= mask - 1;
1699         }
1700 }
1701 
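/*
 * Editor's worked example (not part of the original source): the loop above
 * visits every set bit of @mask exactly once.  With mask == 0b10110 the
 * successive values of __ffs(mask) are 1, 2 and 4, i.e. the rmaps for gfns
 * slot->base_gfn + gfn_offset + {1, 2, 4} get write-protected, and
 * "mask &= mask - 1" clears the lowest set bit after each pass until mask
 * reaches zero.
 */
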
1702 /**
1703  * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
1704  * protect the page if the D-bit isn't supported.
1705  * @kvm: kvm instance
1706  * @slot: slot to clear the D-bit for
1707  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1708  * @mask: indicates which pages we should clear the D-bit for
1709  *
1710  * Used for PML to re-log the dirty GPAs after userspace queries dirty_bitmap.
1711  */
1712 void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1713                                      struct kvm_memory_slot *slot,
1714                                      gfn_t gfn_offset, unsigned long mask)
1715 {
1716         struct kvm_rmap_head *rmap_head;
1717 
1718         while (mask) {
1719                 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1720                                           PT_PAGE_TABLE_LEVEL, slot);
1721                 __rmap_clear_dirty(kvm, rmap_head);
1722 
1723                 /* clear the first set bit */
1724                 mask &= mask - 1;
1725         }
1726 }
1727 EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
1728 
1729 /**
1730  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1731  * PT level pages.
1732  *
1733  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1734  * enable dirty logging for them.
1735  *
1736  * Used when we do not need to care about huge page mappings: e.g. during dirty
1737  * logging we do not have any such mappings.
1738  */
1739 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1740                                 struct kvm_memory_slot *slot,
1741                                 gfn_t gfn_offset, unsigned long mask)
1742 {
1743         if (kvm_x86_ops->enable_log_dirty_pt_masked)
1744                 kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
1745                                 mask);
1746         else
1747                 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1748 }
1749 
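/*
 * Editor's note (illustration, not in the original source): when the vendor
 * module provides ->enable_log_dirty_pt_masked (e.g. VMX with PML), dirty
 * logging is re-armed by clearing D-bits via kvm_mmu_clear_dirty_pt_masked();
 * otherwise the pages are write-protected, so the next guest write faults
 * and the fault path marks the page dirty again.
 */
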
1750 /**
1751  * kvm_arch_write_log_dirty - emulate dirty page logging
1752  * @vcpu: Guest mode vcpu
1753  *
1754  * Emulate arch specific page modification logging for the
1755  * nested hypervisor
1756  */
1757 int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
1758 {
1759         if (kvm_x86_ops->write_log_dirty)
1760                 return kvm_x86_ops->write_log_dirty(vcpu);
1761 
1762         return 0;
1763 }
1764 
1765 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1766                                     struct kvm_memory_slot *slot, u64 gfn)
1767 {
1768         struct kvm_rmap_head *rmap_head;
1769         int i;
1770         bool write_protected = false;
1771 
1772         for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1773                 rmap_head = __gfn_to_rmap(gfn, i, slot);
1774                 write_protected |= __rmap_write_protect(kvm, rmap_head, true);
1775         }
1776 
1777         return write_protected;
1778 }
1779 
1780 static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
1781 {
1782         struct kvm_memory_slot *slot;
1783 
1784         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1785         return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
1786 }
1787 
1788 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1789 {
1790         u64 *sptep;
1791         struct rmap_iterator iter;
1792         bool flush = false;
1793 
1794         while ((sptep = rmap_get_first(rmap_head, &iter))) {
1795                 rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
1796 
1797                 pte_list_remove(rmap_head, sptep);
1798                 flush = true;
1799         }
1800 
1801         return flush;
1802 }
1803 
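/*
 * Editor's note (illustration, not in the original source): kvm_zap_rmapp()
 * deliberately restarts with a bare rmap_get_first() loop instead of using
 * for_each_rmap_spte().  pte_list_remove() frees or rewrites the descriptor
 * the iterator points into, so, as the comment above rmap_get_first()
 * requires, the walk must begin again from the head after every removal.
 */
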
1804 static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1805                            struct kvm_memory_slot *slot, gfn_t gfn, int level,
1806                            unsigned long data)
1807 {
1808         return kvm_zap_rmapp(kvm, rmap_head);
1809 }
1810 
1811 static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1812                              struct kvm_memory_slot *slot, gfn_t gfn, int level,
1813                              unsigned long data)
1814 {
1815         u64 *sptep;
1816         struct rmap_iterator iter;
1817         int need_flush = 0;
1818         u64 new_spte;
1819         pte_t *ptep = (pte_t *)data;
1820         kvm_pfn_t new_pfn;
1821 
1822         WARN_ON(pte_huge(*ptep));
1823         new_pfn = pte_pfn(*ptep);
1824 
1825 restart:
1826         for_each_rmap_spte(rmap_head, &iter, sptep) {
1827                 rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
1828                             sptep, *sptep, gfn, level);
1829 
1830                 need_flush = 1;
1831 
1832                 if (pte_write(*ptep)) {
1833                         pte_list_remove(rmap_head, sptep);
1834                         goto restart;
1835                 } else {
1836                         new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
1837                         new_spte |= (u64)new_pfn << PAGE_SHIFT;
1838 
1839                         new_spte &= ~PT_WRITABLE_MASK;
1840                         new_spte &= ~SPTE_HOST_WRITEABLE;
1841 
1842                         new_spte = mark_spte_for_access_track(new_spte);
1843 
1844                         mmu_spte_clear_track_bits(sptep);
1845                         mmu_spte_set(sptep, new_spte);
1846                 }
1847         }
1848 
1849         if (need_flush && kvm_available_flush_tlb_with_range()) {
1850                 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1851                 return 0;
1852         }
1853 
1854         return need_flush;
1855 }
1856 
1857 struct slot_rmap_walk_iterator {
1858         /* input fields. */
1859         struct kvm_memory_slot *slot;
1860         gfn_t start_gfn;
1861         gfn_t end_gfn;
1862         int start_level;
1863         int end_level;
1864 
1865         /* output fields. */
1866         gfn_t gfn;
1867         struct kvm_rmap_head *rmap;
1868         int level;
1869 
1870         /* private field. */
1871         struct kvm_rmap_head *end_rmap;
1872 };
1873 
1874 static void
1875 rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
1876 {
1877         iterator->level = level;
1878         iterator->gfn = iterator->start_gfn;
1879         iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
1880         iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
1881                                            iterator->slot);
1882 }
1883 
1884 static void
1885 slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1886                     struct kvm_memory_slot *slot, int start_level,
1887                     int end_level, gfn_t start_gfn, gfn_t end_gfn)
1888 {
1889         iterator->slot = slot;
1890         iterator->start_level = start_level;
1891         iterator->end_level = end_level;
1892         iterator->start_gfn = start_gfn;
1893         iterator->end_gfn = end_gfn;
1894 
1895         rmap_walk_init_level(iterator, iterator->start_level);
1896 }
1897 
1898 static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1899 {
1900         return !!iterator->rmap;
1901 }
1902 
1903 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1904 {
1905         if (++iterator->rmap <= iterator->end_rmap) {
1906                 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1907                 return;
1908         }
1909 
1910         if (++iterator->level > iterator->end_level) {
1911                 iterator->rmap = NULL;
1912                 return;
1913         }
1914 
1915         rmap_walk_init_level(iterator, iterator->level);
1916 }
1917 
1918 #define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,    \
1919            _start_gfn, _end_gfn, _iter_)                                \
1920         for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,         \
1921                                  _end_level_, _start_gfn, _end_gfn);    \
1922              slot_rmap_walk_okay(_iter_);                               \
1923              slot_rmap_walk_next(_iter_))
1924 
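/*
 * Editor's illustration, not part of the original mmu.c: a minimal sketch of
 * how the slot walker above is used.  The hypothetical helper scans only the
 * 4K-level rmaps of a whole memslot and reports whether any of them is
 * non-empty; kvm_handle_hva_range() below uses the same macro across all
 * hugepage levels for an hva sub-range.
 */
static bool slot_has_any_4k_rmap(struct kvm_memory_slot *memslot)
{
	struct slot_rmap_walk_iterator iter;

	for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
				 PT_PAGE_TABLE_LEVEL, memslot->base_gfn,
				 memslot->base_gfn + memslot->npages - 1,
				 &iter)
		if (iter.rmap->val)
			return true;

	return false;
}
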
1925 static int kvm_handle_hva_range(struct kvm *kvm,
1926                                 unsigned long start,
1927                                 unsigned long end,
1928                                 unsigned long data,
1929                                 int (*handler)(struct kvm *kvm,
1930                                                struct kvm_rmap_head *rmap_head,
1931                                                struct kvm_memory_slot *slot,
1932                                                gfn_t gfn,
1933                                                int level,
1934                                                unsigned long data))
1935 {
1936         struct kvm_memslots *slots;
1937         struct kvm_memory_slot *memslot;
1938         struct slot_rmap_walk_iterator iterator;
1939         int ret = 0;
1940         int i;
1941 
1942         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1943                 slots = __kvm_memslots(kvm, i);
1944                 kvm_for_each_memslot(memslot, slots) {
1945                         unsigned long hva_start, hva_end;
1946                         gfn_t gfn_start, gfn_end;
1947 
1948                         hva_start = max(start, memslot->userspace_addr);
1949                         hva_end = min(end, memslot->userspace_addr +
1950                                       (memslot->npages << PAGE_SHIFT));
1951                         if (hva_start >= hva_end)
1952                                 continue;
1953                         /*
1954                          * {gfn(page) | page intersects with [hva_start, hva_end)} =
1955                          * {gfn_start, gfn_start+1, ..., gfn_end-1}.
1956                          */
1957                         gfn_start = hva_to_gfn_memslot(hva_start, memslot);
1958                         gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
1959 
1960                         for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
1961                                                  PT_MAX_HUGEPAGE_LEVEL,
1962                                                  gfn_start, gfn_end - 1,
1963                                                  &iterator)
1964                                 ret |= handler(kvm, iterator.rmap, memslot,
1965                                                iterator.gfn, iterator.level, data);
1966                 }
1967         }
1968 
1969         return ret;
1970 }
1971 
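/*
 * Editor's worked example (not part of the original source), assuming 4K
 * pages: for a memslot with userspace_addr == 0x100000 and npages == 16, a
 * caller passing [start, end) == [0x102800, 0x104800) gets hva_start ==
 * 0x102800 and hva_end == 0x104800, hence gfn_start == base_gfn + 2 and
 * gfn_end == base_gfn + 5 (because of the "+ PAGE_SIZE - 1" rounding).  The
 * rmap walk then covers gfns base_gfn + 2 .. base_gfn + 4, i.e. exactly the
 * guest pages that intersect the hva range.
 */
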
1972 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
1973                           unsigned long data,
1974                           int (*handler)(struct kvm *kvm,
1975                                          struct kvm_rmap_head *rmap_head,
1976                                          struct kvm_memory_slot *slot,
1977                                          gfn_t gfn, int level,
1978                                          unsigned long data))
1979 {
1980         return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
1981 }
1982 
1983 int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
1984 {
1985         return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
1986 }
1987 
1988 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1989 {
1990         return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
1991 }
1992 
1993 static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1994                          struct kvm_memory_slot *slot, gfn_t gfn, int level,
1995                          unsigned long data)
1996 {
1997         u64 *sptep;
1998         struct rmap_iterator uninitialized_var(iter);
1999         int young = 0;
2000 
2001         for_each_rmap_spte(rmap_head, &iter, sptep)
2002                 young |= mmu_spte_age(sptep);
2003 
2004         trace_kvm_age_page(gfn, level, slot, young);
2005         return young;
2006 }
2007 
2008 static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
2009                               struct kvm_memory_slot *slot, gfn_t gfn,
2010                               int level, unsigned long data)
2011 {
2012         u64 *sptep;
2013         struct rmap_iterator iter;
2014 
2015         for_each_rmap_spte(rmap_head, &iter, sptep)
2016                 if (is_accessed_spte(*sptep))
2017                         return 1;
2018         return 0;
2019 }
2020 
2021 #define RMAP_RECYCLE_THRESHOLD 1000
2022 
2023 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
2024 {
2025         struct kvm_rmap_head *rmap_head;
2026         struct kvm_mmu_page *sp;
2027 
2028         sp = page_header(__pa(spte));
2029 
2030         rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
2031 
2032         kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
2033         kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
2034                         KVM_PAGES_PER_HPAGE(sp->role.level));
2035 }
2036 
2037 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
2038 {
2039         return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
2040 }
2041 
2042 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
2043 {
2044         return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
2045 }
2046 
2047 #ifdef MMU_DEBUG
2048 static int is_empty_shadow_page(u64 *spt)
2049 {
2050         u64 *pos;
2051         u64 *end;
2052 
2053         for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
2054                 if (is_shadow_present_pte(*pos)) {
2055                         printk(KERN_ERR "%s: %p %llx\n", __func__,
2056                                pos, *pos);
2057                         return 0;
2058                 }
2059         return 1;
2060 }
2061 #endif
2062 
2063 /*
2064  * This value is the sum of all of the kvm instances'
2065  * kvm->arch.n_used_mmu_pages values.  We need a global,
2066  * aggregate version in order to make the slab shrinker
2067  * faster.
2068  */
2069 static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
2070 {
2071         kvm->arch.n_used_mmu_pages += nr;
2072         percpu_counter_add(&kvm_total_used_mmu_pages, nr);
2073 }
2074 
2075 static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
2076 {
2077         MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
2078         hlist_del(&sp->hash_link);
2079         list_del(&sp->link);
2080         free_page((unsigned long)sp->spt);
2081         if (!sp->role.direct)
2082                 free_page((unsigned long)sp->gfns);
2083         kmem_cache_free(mmu_page_header_cache, sp);
2084 }
2085 
2086 static unsigned kvm_page_table_hashfn(gfn_t gfn)
2087 {
2088         return hash_64(gfn, KVM_MMU_HASH_SHIFT);
2089 }
2090 
2091 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
2092                                     struct kvm_mmu_page *sp, u64 *parent_pte)
2093 {
2094         if (!parent_pte)
2095                 return;
2096 
2097         pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
2098 }
2099 
2100 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
2101                                        u64 *parent_pte)
2102 {
2103         __pte_list_remove(parent_pte, &sp->parent_ptes);
2104 }
2105 
2106 static void drop_parent_pte(struct kvm_mmu_page *sp,
2107                             u64 *parent_pte)
2108 {
2109         mmu_page_remove_parent_pte(sp, parent_pte);
2110         mmu_spte_clear_no_track(parent_pte);
2111 }
2112 
2113 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
2114 {
2115         struct kvm_mmu_page *sp;
2116 
2117         sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
2118         sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2119         if (!direct)
2120                 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2121         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
2122 
2123         /*
2124          * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
2125          * depends on valid pages being added to the head of the list.  See
2126          * comments in kvm_zap_obsolete_pages().
2127          */
2128         sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
2129         list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
2130         kvm_mod_used_mmu_pages(vcpu->kvm, +1);
2131         return sp;
2132 }
2133 
2134 static void mark_unsync(u64 *spte);
2135 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
2136 {
2137         u64 *sptep;
2138         struct rmap_iterator iter;
2139 
2140         for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
2141                 mark_unsync(sptep);
2142         }
2143 }
2144 
2145 static void mark_unsync(u64 *spte)
2146 {
2147         struct kvm_mmu_page *sp;
2148         unsigned int index;
2149 
2150         sp = page_header(__pa(spte));
2151         index = spte - sp->spt;
2152         if (__test_and_set_bit(index, sp->unsync_child_bitmap))
2153                 return;
2154         if (sp->unsync_children++)
2155                 return;
2156         kvm_mmu_mark_parents_unsync(sp);
2157 }
2158 
2159 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
2160                                struct kvm_mmu_page *sp)
2161 {
2162         return 0;
2163 }
2164 
2165 static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
2166 {
2167 }
2168 
2169 static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
2170                                  struct kvm_mmu_page *sp, u64 *spte,
2171                                  const void *pte)
2172 {
2173         WARN_ON(1);
2174 }
2175 
2176 #define KVM_PAGE_ARRAY_NR 16
2177 
2178 struct kvm_mmu_pages {
2179         struct mmu_page_and_offset {
2180                 struct kvm_mmu_page *sp;
2181                 unsigned int idx;
2182         } page[KVM_PAGE_ARRAY_NR];
2183         unsigned int nr;
2184 };
2185 
2186 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
2187                          int idx)
2188 {
2189         int i;
2190 
2191         if (sp->unsync)
2192                 for (i = 0; i < pvec->nr; i++)
2193                         if (pvec->page[i].sp == sp)
2194                                 return 0;
2195 
2196         pvec->page[pvec->nr].sp = sp;
2197         pvec->page[pvec->nr].idx = idx;
2198         pvec->nr++;
2199         return (pvec->nr == KVM_PAGE_ARRAY_NR);
2200 }
2201 
2202 static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
2203 {
2204         --sp->unsync_children;
2205         WARN_ON((int)sp->unsync_children < 0);
2206         __clear_bit(idx, sp->unsync_child_bitmap);
2207 }
2208 
2209 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
2210                            struct kvm_mmu_pages *pvec)
2211 {
2212         int i, ret, nr_unsync_leaf = 0;
2213 
2214         for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
2215                 struct kvm_mmu_page *child;
2216                 u64 ent = sp->spt[i];
2217 
2218                 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
2219                         clear_unsync_child_bit(sp, i);
2220                         continue;
2221                 }
2222 
2223                 child = page_header(ent & PT64_BASE_ADDR_MASK);
2224 
2225                 if (child->unsync_children) {
2226                         if (mmu_pages_add(pvec, child, i))
2227                                 return -ENOSPC;
2228 
2229                         ret = __mmu_unsync_walk(child, pvec);
2230                         if (!ret) {
2231                                 clear_unsync_child_bit(sp, i);
2232                                 continue;
2233                         } else if (ret > 0) {
2234                                 nr_unsync_leaf += ret;
2235                         } else
2236                                 return ret;
2237                 } else if (child->unsync) {
2238                         nr_unsync_leaf++;
2239                         if (mmu_pages_add(pvec, child, i))
2240                                 return -ENOSPC;
2241                 } else
2242                         clear_unsync_child_bit(sp, i);
2243         }
2244 
2245         return nr_unsync_leaf;
2246 }
2247 
2248 #define INVALID_INDEX (-1)
2249 
2250 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
2251                            struct kvm_mmu_pages *pvec)
2252 {
2253         pvec->nr = 0;
2254         if (!sp->unsync_children)
2255                 return 0;
2256 
2257         mmu_pages_add(pvec, sp, INVALID_INDEX);
2258         return __mmu_unsync_walk(sp, pvec);
2259 }
2260 
2261 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2262 {
2263         WARN_ON(!sp->unsync);
2264         trace_kvm_mmu_sync_page(sp);
2265         sp->unsync = 0;
2266         --kvm->stat.mmu_unsync;
2267 }
2268 
2269 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2270                                      struct list_head *invalid_list);
2271 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2272                                     struct list_head *invalid_list);
2273 
2274 
2275 #define for_each_valid_sp(_kvm, _sp, _gfn)                              \
2276         hlist_for_each_entry(_sp,                                       \
2277           &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
2278                 if (is_obsolete_sp((_kvm), (_sp))) {                    \
2279                 } else
2280 
2281 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)                 \
2282         for_each_valid_sp(_kvm, _sp, _gfn)                              \
2283                 if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
2284 
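/*
 * Editor's note (illustration, not in the original source): the empty
 * "if (...) { } else" in the two macros above is a filtering idiom.  It
 * skips obsolete (or gfn/role-mismatched) shadow pages while still letting
 * the macro expand to a single statement, so callers attach their loop body
 * directly, e.g.:
 *
 *	for_each_gfn_indirect_valid_sp(kvm, sp, gfn)
 *		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 *
 * much like kvm_mmu_unprotect_page() does further down.
 */
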
2285 static inline bool is_ept_sp(struct kvm_mmu_page *sp)
2286 {
2287         return sp->role.cr0_wp && sp->role.smap_andnot_wp;
2288 }
2289 
2290 /* @sp->gfn should be write-protected at the call site */
2291 static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2292                             struct list_head *invalid_list)
2293 {
2294         if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
2295             vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
2296                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
2297                 return false;
2298         }
2299 
2300         return true;
2301 }
2302 
2303 static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
2304                                         struct list_head *invalid_list,
2305                                         bool remote_flush)
2306 {
2307         if (!remote_flush && list_empty(invalid_list))
2308                 return false;
2309 
2310         if (!list_empty(invalid_list))
2311                 kvm_mmu_commit_zap_page(kvm, invalid_list);
2312         else
2313                 kvm_flush_remote_tlbs(kvm);
2314         return true;
2315 }
2316 
2317 static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
2318                                  struct list_head *invalid_list,
2319                                  bool remote_flush, bool local_flush)
2320 {
2321         if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
2322                 return;
2323 
2324         if (local_flush)
2325                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2326 }
2327 
2328 #ifdef CONFIG_KVM_MMU_AUDIT
2329 #include "mmu_audit.c"
2330 #else
2331 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
2332 static void mmu_audit_disable(void) { }
2333 #endif
2334 
2335 static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
2336 {
2337         return sp->role.invalid ||
2338                unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
2339 }
2340 
2341 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2342                          struct list_head *invalid_list)
2343 {
2344         kvm_unlink_unsync_page(vcpu->kvm, sp);
2345         return __kvm_sync_page(vcpu, sp, invalid_list);
2346 }
2347 
2348 /* @gfn should be write-protected at the call site */
2349 static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
2350                            struct list_head *invalid_list)
2351 {
2352         struct kvm_mmu_page *s;
2353         bool ret = false;
2354 
2355         for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
2356                 if (!s->unsync)
2357                         continue;
2358 
2359                 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
2360                 ret |= kvm_sync_page(vcpu, s, invalid_list);
2361         }
2362 
2363         return ret;
2364 }
2365 
2366 struct mmu_page_path {
2367         struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
2368         unsigned int idx[PT64_ROOT_MAX_LEVEL];
2369 };
2370 
2371 #define for_each_sp(pvec, sp, parents, i)                       \
2372                 for (i = mmu_pages_first(&pvec, &parents);      \
2373                         i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
2374                         i = mmu_pages_next(&pvec, &parents, i))
2375 
2376 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
2377                           struct mmu_page_path *parents,
2378                           int i)
2379 {
2380         int n;
2381 
2382         for (n = i+1; n < pvec->nr; n++) {
2383                 struct kvm_mmu_page *sp = pvec->page[n].sp;
2384                 unsigned idx = pvec->page[n].idx;
2385                 int level = sp->role.level;
2386 
2387                 parents->idx[level-1] = idx;
2388                 if (level == PT_PAGE_TABLE_LEVEL)
2389                         break;
2390 
2391                 parents->parent[level-2] = sp;
2392         }
2393 
2394         return n;
2395 }
2396 
2397 static int mmu_pages_first(struct kvm_mmu_pages *pvec,
2398                            struct mmu_page_path *parents)
2399 {
2400         struct kvm_mmu_page *sp;
2401         int level;
2402 
2403         if (pvec->nr == 0)
2404                 return 0;
2405 
2406         WARN_ON(pvec->page[0].idx != INVALID_INDEX);
2407 
2408         sp = pvec->page[0].sp;
2409         level = sp->role.level;
2410         WARN_ON(level == PT_PAGE_TABLE_LEVEL);
2411 
2412         parents->parent[level-2] = sp;
2413 
2414         /* Also set up a sentinel.  Further entries in pvec are all
2415          * children of sp, so this element is never overwritten.
2416          */
2417         parents->parent[level-1] = NULL;
2418         return mmu_pages_next(pvec, parents, 0);
2419 }
2420 
2421 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
2422 {
2423         struct kvm_mmu_page *sp;
2424         unsigned int level = 0;
2425 
2426         do {
2427                 unsigned int idx = parents->idx[level];
2428                 sp = parents->parent[level];
2429                 if (!sp)
2430                         return;
2431 
2432                 WARN_ON(idx == INVALID_INDEX);
2433                 clear_unsync_child_bit(sp, idx);
2434                 level++;
2435         } while (!sp->unsync_children);
2436 }
2437 
2438 static void mmu_sync_children(struct kvm_vcpu *vcpu,
2439                               struct kvm_mmu_page *parent)
2440 {
2441         int i;
2442         struct kvm_mmu_page *sp;
2443         struct mmu_page_path parents;
2444         struct kvm_mmu_pages pages;
2445         LIST_HEAD(invalid_list);
2446         bool flush = false;
2447 
2448         while (mmu_unsync_walk(parent, &pages)) {
2449                 bool protected = false;
2450 
2451                 for_each_sp(pages, sp, parents, i)
2452                         protected |= rmap_write_protect(vcpu, sp->gfn);
2453 
2454                 if (protected) {
2455                         kvm_flush_remote_tlbs(vcpu->kvm);
2456                         flush = false;
2457                 }
2458 
2459                 for_each_sp(pages, sp, parents, i) {
2460                         flush |= kvm_sync_page(vcpu, sp, &invalid_list);
2461                         mmu_pages_clear_parents(&parents);
2462                 }
2463                 if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
2464                         kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2465                         cond_resched_lock(&vcpu->kvm->mmu_lock);
2466                         flush = false;
2467                 }
2468         }
2469 
2470         kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2471 }
2472 
2473 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2474 {
2475         atomic_set(&sp->write_flooding_count, 0);
2476 }
2477 
2478 static void clear_sp_write_flooding_count(u64 *spte)
2479 {
2480         struct kvm_mmu_page *sp = page_header(__pa(spte));
2481 
2482         __clear_sp_write_flooding_count(sp);
2483 }
2484 
2485 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2486                                              gfn_t gfn,
2487                                              gva_t gaddr,
2488                                              unsigned level,
2489                                              int direct,
2490                                              unsigned int access)
2491 {
2492         union kvm_mmu_page_role role;
2493         unsigned quadrant;
2494         struct kvm_mmu_page *sp;
2495         bool need_sync = false;
2496         bool flush = false;
2497         int collisions = 0;
2498         LIST_HEAD(invalid_list);
2499 
2500         role = vcpu->arch.mmu->mmu_role.base;
2501         role.level = level;
2502         role.direct = direct;
2503         if (role.direct)
2504                 role.gpte_is_8_bytes = true;
2505         role.access = access;
2506         if (!vcpu->arch.mmu->direct_map
2507             && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
2508                 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
2509                 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
2510                 role.quadrant = quadrant;
2511         }
2512         for_each_valid_sp(vcpu->kvm, sp, gfn) {
2513                 if (sp->gfn != gfn) {
2514                         collisions++;
2515                         continue;
2516                 }
2517 
2518                 if (!need_sync && sp->unsync)
2519                         need_sync = true;
2520 
2521                 if (sp->role.word != role.word)
2522                         continue;
2523 
2524                 if (sp->unsync) {
2525                         /* The page is good, but __kvm_sync_page might still end
2526                          * up zapping it.  If so, break in order to rebuild it.
2527                          */
2528                         if (!__kvm_sync_page(vcpu, sp, &invalid_list))
2529                                 break;
2530 
2531                         WARN_ON(!list_empty(&invalid_list));
2532                         kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2533                 }
2534 
2535                 if (sp->unsync_children)
2536                         kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
2537 
2538                 __clear_sp_write_flooding_count(sp);
2539                 trace_kvm_mmu_get_page(sp, false);
2540                 goto out;
2541         }
2542 
2543         ++vcpu->kvm->stat.mmu_cache_miss;
2544 
2545         sp = kvm_mmu_alloc_page(vcpu, direct);
2546 
2547         sp->gfn = gfn;
2548         sp->role = role;
2549         hlist_add_head(&sp->hash_link,
2550                 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
2551         if (!direct) {
2552                 /*
2553                  * We should do write protection before syncing pages,
2554                  * otherwise the content of the synced shadow page may
2555                  * be inconsistent with the guest page table.
2556                  */
2557                 account_shadowed(vcpu->kvm, sp);
2558                 if (level == PT_PAGE_TABLE_LEVEL &&
2559                       rmap_write_protect(vcpu, gfn))
2560                         kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
2561 
2562                 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
2563                         flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
2564         }
2565         clear_page(sp->spt);
2566         trace_kvm_mmu_get_page(sp, true);
2567 
2568         kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2569 out:
2570         if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
2571                 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
2572         return sp;
2573 }
2574 
2575 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2576                                         struct kvm_vcpu *vcpu, hpa_t root,
2577                                         u64 addr)
2578 {
2579         iterator->addr = addr;
2580         iterator->shadow_addr = root;
2581         iterator->level = vcpu->arch.mmu->shadow_root_level;
2582 
2583         if (iterator->level == PT64_ROOT_4LEVEL &&
2584             vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
2585             !vcpu->arch.mmu->direct_map)
2586                 --iterator->level;
2587 
2588         if (iterator->level == PT32E_ROOT_LEVEL) {
2589                 /*
2590                  * prev_root is currently only used for 64-bit hosts. So only
2591                  * the active root_hpa is valid here.
2592                  */
2593                 BUG_ON(root != vcpu->arch.mmu->root_hpa);
2594 
2595                 iterator->shadow_addr
2596                         = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2597                 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
2598                 --iterator->level;
2599                 if (!iterator->shadow_addr)
2600                         iterator->level = 0;
2601         }
2602 }
2603 
2604 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2605                              struct kvm_vcpu *vcpu, u64 addr)
2606 {
2607         shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
2608                                     addr);
2609 }
2610 
2611 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2612 {
2613         if (iterator->level < PT_PAGE_TABLE_LEVEL)
2614                 return false;
2615 
2616         iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
2617         iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2618         return true;
2619 }
2620 
2621 static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2622                                u64 spte)
2623 {
2624         if (is_last_spte(spte, iterator->level)) {
2625                 iterator->level = 0;
2626                 return;
2627         }
2628 
2629         iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
2630         --iterator->level;
2631 }
2632 
2633 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2634 {
2635         __shadow_walk_next(iterator, *iterator->sptep);
2636 }
2637 
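/*
 * Editor's illustration, not part of the original mmu.c: the three helpers
 * above are composed into a top-down walk of the shadow page tables
 * (elsewhere in this file the same shape is wrapped in a
 * for_each_shadow_entry()-style macro).  The hypothetical helper below,
 * which assumes mmu_lock or a lockless-walk section is held, returns the
 * lowest spte reached while translating @addr:
 */
static u64 *lowest_sptep_for_addr(struct kvm_vcpu *vcpu, u64 addr)
{
	struct kvm_shadow_walk_iterator it;
	u64 *sptep = NULL;

	for (shadow_walk_init(&it, vcpu, addr);
	     shadow_walk_okay(&it);
	     shadow_walk_next(&it)) {
		sptep = it.sptep;
		/* Stop before descending through a non-present entry. */
		if (!is_shadow_present_pte(*it.sptep))
			break;
	}

	return sptep;
}
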
2638 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2639                              struct kvm_mmu_page *sp)
2640 {
2641         u64 spte;
2642 
2643         BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2644 
2645         spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
2646                shadow_user_mask | shadow_x_mask | shadow_me_mask;
2647 
2648         if (sp_ad_disabled(sp))
2649                 spte |= SPTE_AD_DISABLED_MASK;
2650         else
2651                 spte |= shadow_accessed_mask;
2652 
2653         mmu_spte_set(sptep, spte);
2654 
2655         mmu_page_add_parent_pte(vcpu, sp, sptep);
2656 
2657         if (sp->unsync_children || sp->unsync)
2658                 mark_unsync(sptep);
2659 }
2660 
2661 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2662                                    unsigned direct_access)
2663 {
2664         if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2665                 struct kvm_mmu_page *child;
2666 
2667                 /*
2668                  * For a direct sp, if the guest pte's dirty bit
2669                  * changed from clean to dirty, it would corrupt the
2670                  * sp's access: i.e. allow writes through a read-only
2671                  * sp, so we should update the spte at this point to
2672                  * get a new sp with the correct access.
2673                  */
2674                 child = page_header(*sptep & PT64_BASE_ADDR_MASK);
2675                 if (child->role.access == direct_access)
2676                         return;
2677 
2678                 drop_parent_pte(child, sptep);
2679                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2680         }
2681 }
2682 
2683 static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2684                              u64 *spte)
2685 {
2686         u64 pte;
2687         struct kvm_mmu_page *child;
2688 
2689         pte = *spte;
2690         if (is_shadow_present_pte(pte)) {
2691                 if (is_last_spte(pte, sp->role.level)) {
2692                         drop_spte(kvm, spte);
2693                         if (is_large_pte(pte))
2694                                 --kvm->stat.lpages;
2695                 } else {
2696                         child = page_header(pte & PT64_BASE_ADDR_MASK);
2697                         drop_parent_pte(child, spte);
2698                 }
2699                 return true;
2700         }
2701 
2702         if (is_mmio_spte(pte))
2703                 mmu_spte_clear_no_track(spte);
2704 
2705         return false;
2706 }
2707 
2708 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
2709                                          struct kvm_mmu_page *sp)
2710 {
2711         unsigned i;
2712 
2713         for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2714                 mmu_page_zap_pte(kvm, sp, sp->spt + i);
2715 }
2716 
2717 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
2718 {
2719         u64 *sptep;
2720         struct rmap_iterator iter;
2721 
2722         while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2723                 drop_parent_pte(sp, sptep);
2724 }
2725 
2726 static int mmu_zap_unsync_children(struct kvm *kvm,
2727                                    struct kvm_mmu_page *parent,
2728                                    struct list_head *invalid_list)
2729 {
2730         int i, zapped = 0;
2731         struct mmu_page_path parents;
2732         struct kvm_mmu_pages pages;
2733 
2734         if (parent->role.level == PT_PAGE_TABLE_LEVEL)
2735                 return 0;
2736 
2737         while (mmu_unsync_walk(parent, &pages)) {
2738                 struct kvm_mmu_page *sp;
2739 
2740                 for_each_sp(pages, sp, parents, i) {
2741                         kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2742                         mmu_pages_clear_parents(&parents);
2743                         zapped++;
2744                 }
2745         }
2746 
2747         return zapped;
2748 }
2749 
2750 static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2751                                        struct kvm_mmu_page *sp,
2752                                        struct list_head *invalid_list,
2753                                        int *nr_zapped)
2754 {
2755         bool list_unstable;
2756 
2757         trace_kvm_mmu_prepare_zap_page(sp);
2758         ++kvm->stat.mmu_shadow_zapped;
2759         *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2760         kvm_mmu_page_unlink_children(kvm, sp);
2761         kvm_mmu_unlink_parents(kvm, sp);
2762 
2763         /* Zapping children means active_mmu_pages has become unstable. */
2764         list_unstable = *nr_zapped;
2765 
2766         if (!sp->role.invalid && !sp->role.direct)
2767                 unaccount_shadowed(kvm, sp);
2768 
2769         if (sp->unsync)
2770                 kvm_unlink_unsync_page(kvm, sp);
2771         if (!sp->root_count) {
2772                 /* Count self */
2773                 (*nr_zapped)++;
2774                 list_move(&sp->link, invalid_list);
2775                 kvm_mod_used_mmu_pages(kvm, -1);
2776         } else {
2777                 list_move(&sp->link, &kvm->arch.active_mmu_pages);
2778 
2779                 /*
2780                  * Obsolete pages cannot be used on any vCPUs, see the comment
2781                  * in kvm_mmu_zap_all_fast().  Note, is_obsolete_sp() also
2782                  * treats invalid shadow pages as being obsolete.
2783                  */
2784                 if (!is_obsolete_sp(kvm, sp))
2785                         kvm_reload_remote_mmus(kvm);
2786         }
2787 
2788         if (sp->lpage_disallowed)
2789                 unaccount_huge_nx_page(kvm, sp);
2790 
2791         sp->role.invalid = 1;
2792         return list_unstable;
2793 }
2794 
2795 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2796                                      struct list_head *invalid_list)
2797 {
2798         int nr_zapped;
2799 
2800         __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2801         return nr_zapped;
2802 }
2803 
2804 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2805                                     struct list_head *invalid_list)
2806 {
2807         struct kvm_mmu_page *sp, *nsp;
2808 
2809         if (list_empty(invalid_list))
2810                 return;
2811 
2812         /*
2813          * We need to make sure everyone sees our modifications to
2814          * the page tables and sees changes to vcpu->mode here. The barrier
2815          * in kvm_flush_remote_tlbs() achieves this. This pairs
2816          * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2817          *
2818          * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2819          * guest mode and/or lockless shadow page table walks.
2820          */
2821         kvm_flush_remote_tlbs(kvm);
2822 
2823         list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2824                 WARN_ON(!sp->role.invalid || sp->root_count);
2825                 kvm_mmu_free_page(sp);
2826         }
2827 }
2828 
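/*
 * Editor's note (illustration, not in the original source): zapping is split
 * into a prepare/commit pair so that any number of shadow pages can first be
 * queued on @invalid_list under mmu_lock and then torn down with a single
 * remote TLB flush in kvm_mmu_commit_zap_page() above, instead of flushing
 * once per zapped page.
 */
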
2829 static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
2830                                         struct list_head *invalid_list)
2831 {
2832         struct kvm_mmu_page *sp;
2833 
2834         if (list_empty(&kvm->arch.active_mmu_pages))
2835                 return false;
2836 
2837         sp = list_last_entry(&kvm->arch.active_mmu_pages,
2838                              struct kvm_mmu_page, link);
2839         return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2840 }
2841 
2842 static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
2843 {
2844         LIST_HEAD(invalid_list);
2845 
2846         if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
2847                 return 0;
2848 
2849         while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
2850                 if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
2851                         break;
2852 
2853                 ++vcpu->kvm->stat.mmu_recycled;
2854         }
2855         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2856 
2857         if (!kvm_mmu_available_pages(vcpu->kvm))
2858                 return -ENOSPC;
2859         return 0;
2860 }
2861 
2862 /*
2863  * Change the number of mmu pages allocated to the VM.
2864  * Note: if goal_nr_mmu_pages is too small, you will get a deadlock.
2865  */
2866 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2867 {
2868         LIST_HEAD(invalid_list);
2869 
2870         spin_lock(&kvm->mmu_lock);
2871 
2872         if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2873                 /* Need to free some mmu pages to achieve the goal. */
2874                 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
2875                         if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
2876                                 break;
2877 
2878                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2879                 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2880         }
2881 
2882         kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2883 
2884         spin_unlock(&kvm->mmu_lock);
2885 }
2886 
2887 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2888 {
2889         struct kvm_mmu_page *sp;
2890         LIST_HEAD(invalid_list);
2891         int r;
2892 
2893         pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2894         r = 0;
2895         spin_lock(&kvm->mmu_lock);
2896         for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2897                 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2898                          sp->role.word);
2899                 r = 1;
2900                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2901         }
2902         kvm_mmu_commit_zap_page(kvm, &invalid_list);
2903         spin_unlock(&kvm->mmu_lock);
2904 
2905         return r;
2906 }
2907 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
2908 
2909 static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
2910 {
2911         trace_kvm_mmu_unsync_page(sp);
2912         ++vcpu->kvm->stat.mmu_unsync;
2913         sp->unsync = 1;
2914 
2915         kvm_mmu_mark_parents_unsync(sp);
2916 }
2917 
2918 static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2919                                    bool can_unsync)
2920 {
2921         struct kvm_mmu_page *sp;
2922 
2923         if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
2924                 return true;
2925 
2926         for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
2927                 if (!can_unsync)
2928                         return true;
2929 
2930                 if (sp->unsync)
2931                         continue;
2932 
2933                 WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
2934                 kvm_unsync_page(vcpu, sp);
2935         }
2936 
2937         /*
2938          * We need to ensure that the marking of unsync pages is visible
2939          * before the SPTE is updated to allow writes because
2940          * kvm_mmu_sync_roots() checks the unsync flags without holding
2941          * the MMU lock and so can race with this. If the SPTE was updated
2942          * before the page had been marked as unsync-ed, something like the
2943          * following could happen:
2944          *
2945          * CPU 1                    CPU 2
2946          * ---------------------------------------------------------------------
2947          * 1.2 Host updates SPTE
2948          *     to be writable
2949          *                      2.1 Guest writes a GPTE for GVA X.
2950          *                          (GPTE being in the guest page table shadowed
2951          *                           by the SP from CPU 1.)
2952          *                          This reads SPTE during the page table walk.
2953          *                          Since SPTE.W is read as 1, there is no
2954          *                          fault.
2955          *
2956          *                      2.2 Guest issues TLB flush.
2957          *                          That causes a VM Exit.
2958          *
2959          *                      2.3 kvm_mmu_sync_roots() reads sp->unsync.
2960          *                          Since it is false, it just returns.
2961          *
2962          *                      2.4 Guest accesses GVA X.
2963          *                          Since the mapping in the SP was not updated,
2964          *                          the old mapping for GVA X incorrectly
2965          *                          gets used.
2966          * 1.1 Host marks SP
2967          *     as unsync
2968          *     (sp->unsync = true)
2969          *
2970          * The write barrier below ensures that 1.1 happens before 1.2 and thus
2971          * the situation in 2.4 does not arise. The implicit barrier in 2.2
2972          * pairs with this write barrier.
2973          */
2974         smp_wmb();
2975 
2976         return false;
2977 }
2978 
2979 static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
2980 {
2981         if (pfn_valid(pfn))
2982                 return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
2983                         /*
2984                          * Some reserved pages, such as those from NVDIMM
2985                          * DAX devices, are not for MMIO, and can be mapped
2986                          * with cached memory type for better performance.
2987                          * However, the above check misclassifies those pages
2988                          * as MMIO, and results in KVM mapping them with UC
2989                          * memory type, which would hurt performance.
2990                          * Therefore, we check the host memory type in addition
2991                          * and only treat UC/UC-/WC pages as MMIO.
2992                          */
2993                         (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
2994 
2995         return !e820__mapped_raw_any(pfn_to_hpa(pfn),
2996                                      pfn_to_hpa(pfn + 1) - 1,
2997                                      E820_TYPE_RAM);
2998 }
2999 
3000 /* Bits which may be returned by set_spte() */
3001 #define SET_SPTE_WRITE_PROTECTED_PT     BIT(0)
3002 #define SET_SPTE_NEED_REMOTE_TLB_FLUSH  BIT(1)
3003 
3004 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
3005                     unsigned int pte_access, int level,
3006                     gfn_t gfn, kvm_pfn_t pfn, bool speculative,
3007                     bool can_unsync, bool host_writable)
3008 {
3009         u64 spte = 0;
3010         int ret = 0;
3011         struct kvm_mmu_page *sp;
3012 
3013         if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
3014                 return 0;
3015 
3016         sp = page_header(__pa(sptep));
3017         if (sp_ad_disabled(sp))
3018                 spte |= SPTE_AD_DISABLED_MASK;
3019         else if (kvm_vcpu_ad_need_write_protect(vcpu))
3020                 spte |= SPTE_AD_WRPROT_ONLY_MASK;
3021 
3022         /*
3023          * For the EPT case, shadow_present_mask is 0 if hardware
3024          * supports exec-only page table entries.  In that case,
3025          * ACC_USER_MASK and shadow_user_mask are used to represent
3026          * read access.  See FNAME(gpte_access) in paging_tmpl.h.
3027          */
3028         spte |= shadow_present_mask;
3029         if (!speculative)
3030                 spte |= spte_shadow_accessed_mask(spte);
3031 
3032         if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
3033             is_nx_huge_page_enabled()) {
3034                 pte_access &= ~ACC_EXEC_MASK;
3035         }
3036 
3037         if (pte_access & ACC_EXEC_MASK)
3038                 spte |= shadow_x_mask;
3039         else
3040                 spte |= shadow_nx_mask;
3041 
3042         if (pte_access & ACC_USER_MASK)
3043                 spte |= shadow_user_mask;
3044 
3045         if (level > PT_PAGE_TABLE_LEVEL)
3046                 spte |= PT_PAGE_SIZE_MASK;
3047         if (tdp_enabled)
3048                 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
3049                         kvm_is_mmio_pfn(pfn));
3050 
3051         if (host_writable)
3052                 spte |= SPTE_HOST_WRITEABLE;
3053         else
3054                 pte_access &= ~ACC_WRITE_MASK;
3055 
3056         if (!kvm_is_mmio_pfn(pfn))
3057                 spte |= shadow_me_mask;
3058 
3059         spte |= (u64)pfn << PAGE_SHIFT;
3060 
3061         if (pte_access & ACC_WRITE_MASK) {
3062                 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
3063 
3064                 /*
3065                  * Optimization: for pte sync, if spte was writable the hash
3066                  * lookup is unnecessary (and expensive). Write protection
3067                  * is the responsibility of mmu_get_page / kvm_sync_page.
3068                  * The same reasoning applies to dirty page accounting.
3069                  */
3070                 if (!can_unsync && is_writable_pte(*sptep))
3071                         goto set_pte;
3072 
3073                 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
3074                         pgprintk("%s: found shadow page for %llx, marking ro\n",
3075                                  __func__, gfn);
3076                         ret |= SET_SPTE_WRITE_PROTECTED_PT;
3077                         pte_access &= ~ACC_WRITE_MASK;
3078                         spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
3079                 }
3080         }
3081 
3082         if (pte_access & ACC_WRITE_MASK) {
3083                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3084                 spte |= spte_shadow_dirty_mask(spte);
3085         }
3086 
3087         if (speculative)
3088                 spte = mark_spte_for_access_track(spte);
3089 
3090 set_pte:
3091         if (mmu_spte_update(sptep, spte))
3092                 ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
3093         return ret;
3094 }
3095 
3096 static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
3097                         unsigned int pte_access, int write_fault, int level,
3098                         gfn_t gfn, kvm_pfn_t pfn, bool speculative,
3099                         bool host_writable)
3100 {
3101         int was_rmapped = 0;
3102         int rmap_count;
3103         int set_spte_ret;
3104         int ret = RET_PF_RETRY;
3105         bool flush = false;
3106 
3107         pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
3108                  *sptep, write_fault, gfn);
3109 
3110         if (is_shadow_present_pte(*sptep)) {
3111                 /*
3112                  * If we overwrite a PTE page pointer with a 2MB PMD, unlink
3113                  * the parent of the now unreachable PTE.
3114                  */
3115                 if (level > PT_PAGE_TABLE_LEVEL &&
3116                     !is_large_pte(*sptep)) {
3117                         struct kvm_mmu_page *child;
3118                         u64 pte = *sptep;
3119 
3120                         child = page_header(pte & PT64_BASE_ADDR_MASK);
3121                         drop_parent_pte(child, sptep);
3122                         flush = true;
3123                 } else if (pfn != spte_to_pfn(*sptep)) {
3124                         pgprintk("hfn old %llx new %llx\n",
3125                                  spte_to_pfn(*sptep), pfn);
3126                         drop_spte(vcpu->kvm, sptep);
3127                         flush = true;
3128                 } else
3129                         was_rmapped = 1;
3130         }
3131 
3132         set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
3133                                 speculative, true, host_writable);
3134         if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
3135                 if (write_fault)
3136                         ret = RET_PF_EMULATE;
3137                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3138         }
3139 
3140         if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
3141                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
3142                                 KVM_PAGES_PER_HPAGE(level));
3143 
3144         if (unlikely(is_mmio_spte(*sptep)))
3145                 ret = RET_PF_EMULATE;
3146 
3147         pgprintk("%s: setting spte %llx\n", __func__, *sptep);
3148         trace_kvm_mmu_set_spte(level, gfn, sptep);
3149         if (!was_rmapped && is_large_pte(*sptep))
3150                 ++vcpu->kvm->stat.lpages;
3151 
3152         if (is_shadow_present_pte(*sptep)) {
3153                 if (!was_rmapped) {
3154                         rmap_count = rmap_add(vcpu, sptep, gfn);
3155                         if (rmap_count > RMAP_RECYCLE_THRESHOLD)
3156                                 rmap_recycle(vcpu, sptep, gfn);
3157                 }
3158         }
3159 
3160         return ret;
3161 }
3162 
3163 static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
3164                                      bool no_dirty_log)
3165 {
3166         struct kvm_memory_slot *slot;
3167 
3168         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
3169         if (!slot)
3170                 return KVM_PFN_ERR_FAULT;
3171 
3172         return gfn_to_pfn_memslot_atomic(slot, gfn);
3173 }
3174 
3175 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
3176                                     struct kvm_mmu_page *sp,
3177                                     u64 *start, u64 *end)
3178 {
3179         struct page *pages[PTE_PREFETCH_NUM];
3180         struct kvm_memory_slot *slot;
3181         unsigned int access = sp->role.access;
3182         int i, ret;
3183         gfn_t gfn;
3184 
3185         gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
3186         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
3187         if (!slot)
3188                 return -1;
3189 
3190         ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
3191         if (ret <= 0)
3192                 return -1;
3193 
3194         for (i = 0; i < ret; i++, gfn++, start++) {
3195                 mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
3196                              page_to_pfn(pages[i]), true, true);
3197                 put_page(pages[i]);
3198         }
3199 
3200         return 0;
3201 }
3202 
3203 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
3204                                   struct kvm_mmu_page *sp, u64 *sptep)
3205 {
3206         u64 *spte, *start = NULL;
3207         int i;
3208 
3209         WARN_ON(!sp->role.direct);
3210 
3211         i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
3212         spte = sp->spt + i;
3213 
3214         for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
3215                 if (is_shadow_present_pte(*spte) || spte == sptep) {
3216                         if (!start)
3217                                 continue;
3218                         if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
3219                                 break;
3220                         start = NULL;
3221                 } else if (!start)
3222                         start = spte;
3223         }
3224 }
3225 
3226 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
3227 {
3228         struct kvm_mmu_page *sp;
3229 
3230         sp = page_header(__pa(sptep));
3231 
3232         /*
3233          * Without accessed bits, there's no way to distinguish between
3234          * actually accessed translations and prefetched ones, so disable pte
3235          * prefetch if accessed bits aren't available.
3236          */
3237         if (sp_ad_disabled(sp))
3238                 return;
3239 
3240         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3241                 return;
3242 
3243         __direct_pte_prefetch(vcpu, sp, sptep);
3244 }
3245 
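/*
 * [Editor's illustration, not part of mmu.c] __direct_pte_prefetch() rounds
 * the faulting SPTE's index down to the start of a PTE_PREFETCH_NUM-sized
 * window with "& ~(PTE_PREFETCH_NUM - 1)".  A standalone demo of that
 * arithmetic, assuming the window size of 8 used earlier in this file.
 */
#include <stdio.h>

#define EX_PTE_PREFETCH_NUM 8u

int main(void)
{
        for (unsigned int idx = 0; idx < 20; idx++)
                printf("spte index %2u -> prefetch window [%2u, %2u]\n",
                       idx,
                       idx & ~(EX_PTE_PREFETCH_NUM - 1),
                       (idx & ~(EX_PTE_PREFETCH_NUM - 1)) + EX_PTE_PREFETCH_NUM - 1);
        return 0;
}
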
3246 static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
3247                                   kvm_pfn_t pfn, struct kvm_memory_slot *slot)
3248 {
3249         unsigned long hva;
3250         pte_t *pte;
3251         int level;
3252 
3253         BUILD_BUG_ON(PT_PAGE_TABLE_LEVEL != (int)PG_LEVEL_4K ||
3254                      PT_DIRECTORY_LEVEL != (int)PG_LEVEL_2M ||
3255                      PT_PDPE_LEVEL != (int)PG_LEVEL_1G);
3256 
3257         if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
3258                 return PT_PAGE_TABLE_LEVEL;
3259 
3260         /*
3261          * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
3262          * is not solely for performance, it's also necessary to avoid the
3263          * "writable" check in __gfn_to_hva_many(), which will always fail on
3264          * read-only memslots due to gfn_to_hva() assuming writes.  Earlier
3265          * page fault steps have already verified the guest isn't writing a
3266          * read-only memslot.
3267          */
3268         hva = __gfn_to_hva_memslot(slot, gfn);
3269 
3270         pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level);
3271         if (unlikely(!pte))
3272                 return PT_PAGE_TABLE_LEVEL;
3273 
3274         return level;
3275 }
3276 
3277 static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
3278                                    int max_level, kvm_pfn_t *pfnp)
3279 {
3280         struct kvm_memory_slot *slot;
3281         struct kvm_lpage_info *linfo;
3282         kvm_pfn_t pfn = *pfnp;
3283         kvm_pfn_t mask;
3284         int level;
3285 
3286         if (unlikely(max_level == PT_PAGE_TABLE_LEVEL))
3287                 return PT_PAGE_TABLE_LEVEL;
3288 
3289         if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn))
3290                 return PT_PAGE_TABLE_LEVEL;
3291 
3292         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true);
3293         if (!slot)
3294                 return PT_PAGE_TABLE_LEVEL;
3295 
3296         max_level = min(max_level, kvm_x86_ops->get_lpage_level());
3297         for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) {
3298                 linfo = lpage_info_slot(gfn, slot, max_level);
3299                 if (!linfo->disallow_lpage)
3300                         break;
3301         }
3302 
3303         if (max_level == PT_PAGE_TABLE_LEVEL)
3304                 return PT_PAGE_TABLE_LEVEL;
3305 
3306         level = host_pfn_mapping_level(vcpu, gfn, pfn, slot);
3307         if (level == PT_PAGE_TABLE_LEVEL)
3308                 return level;
3309 
3310         level = min(level, max_level);
3311 
3312         /*
3313          * mmu_notifier_retry() was successful and mmu_lock is held, so
3314          * the pmd can't be split from under us.
3315          */
3316         mask = KVM_PAGES_PER_HPAGE(level) - 1;
3317         VM_BUG_ON((gfn & mask) != (pfn & mask));
3318         *pfnp = pfn & ~mask;
3319 
3320         return level;
3321 }
3322 
3323 static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
3324                                        gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
3325 {
3326         int level = *levelp;
3327         u64 spte = *it.sptep;
3328 
3329         if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
3330             is_nx_huge_page_enabled() &&
3331             is_shadow_present_pte(spte) &&
3332             !is_large_pte(spte)) {
3333                 /*
3334                  * A small SPTE exists for this pfn, but FNAME(fetch)
3335                  * and __direct_map would like to create a large PTE
3336                  * instead: just force them to go down another level,
3337                  * folding the next 9 bits of the address back into
3338                  * pfn for them.
3339                  */
3340                 u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
3341                 *pfnp |= gfn & page_mask;
3342                 (*levelp)--;
3343         }
3344 }
3345 
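/*
 * [Editor's illustration, not part of mmu.c] The page_mask computed above
 * selects exactly the 9 guest-frame-number bits that distinguish the small
 * pages within one huge page at the next level down.  A standalone sketch of
 * the arithmetic, assuming the usual x86 geometry of 512 entries per level.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t ex_pages_per_hpage(int level)
{
        return 1ULL << ((level - 1) * 9);   /* 1, 512, 262144 for levels 1..3 */
}

int main(void)
{
        for (int level = 2; level <= 3; level++) {
                uint64_t mask = ex_pages_per_hpage(level) -
                                ex_pages_per_hpage(level - 1);

                printf("level %d: gfn bits folded back into pfn = 0x%llx\n",
                       level, (unsigned long long)mask);
        }
        return 0;   /* prints 0x1ff for level 2 and 0x3fe00 for level 3 */
}
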
3346 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
3347                         int map_writable, int max_level, kvm_pfn_t pfn,
3348                         bool prefault, bool account_disallowed_nx_lpage)
3349 {
3350         struct kvm_shadow_walk_iterator it;
3351         struct kvm_mmu_page *sp;
3352         int level, ret;
3353         gfn_t gfn = gpa >> PAGE_SHIFT;
3354         gfn_t base_gfn = gfn;
3355 
3356         if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
3357                 return RET_PF_RETRY;
3358 
3359         level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn);
3360 
3361         trace_kvm_mmu_spte_requested(gpa, level, pfn);
3362         for_each_shadow_entry(vcpu, gpa, it) {
3363                 /*
3364                  * We cannot overwrite existing page tables with an NX
3365                  * large page, as the leaf could be executable.
3366                  */
3367                 disallowed_hugepage_adjust(it, gfn, &pfn, &level);
3368 
3369                 base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
3370                 if (it.level == level)
3371                         break;
3372 
3373                 drop_large_spte(vcpu, it.sptep);
3374                 if (!is_shadow_present_pte(*it.sptep)) {
3375                         sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
3376                                               it.level - 1, true, ACC_ALL);
3377 
3378                         link_shadow_page(vcpu, it.sptep, sp);
3379                         if (account_disallowed_nx_lpage)
3380                                 account_huge_nx_page(vcpu->kvm, sp);
3381                 }
3382         }
3383 
3384         ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
3385                            write, level, base_gfn, pfn, prefault,
3386                            map_writable);
3387         direct_pte_prefetch(vcpu, it.sptep);
3388         ++vcpu->stat.pf_fixed;
3389         return ret;
3390 }
3391 
3392 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
3393 {
3394         send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
3395 }
3396 
3397 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
3398 {
3399         /*
3400          * Do not cache the MMIO info caused by writing the read-only gfn
3401          * into the spte; otherwise a read access on the read-only gfn would
3402          * also cause an MMIO page fault and be treated as MMIO access.
3403          */
3404         if (pfn == KVM_PFN_ERR_RO_FAULT)
3405                 return RET_PF_EMULATE;
3406 
3407         if (pfn == KVM_PFN_ERR_HWPOISON) {
3408                 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
3409                 return RET_PF_RETRY;
3410         }
3411 
3412         return -EFAULT;
3413 }
3414 
3415 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
3416                                 kvm_pfn_t pfn, unsigned int access,
3417                                 int *ret_val)
3418 {
3419         /* The pfn is invalid, report the error! */
3420         if (unlikely(is_error_pfn(pfn))) {
3421                 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
3422                 return true;
3423         }
3424 
3425         if (unlikely(is_noslot_pfn(pfn)))
3426                 vcpu_cache_mmio_info(vcpu, gva, gfn,
3427                                      access & shadow_mmio_access_mask);
3428 
3429         return false;
3430 }
3431 
3432 static bool page_fault_can_be_fast(u32 error_code)
3433 {
3434         /*
3435          * Do not fix an MMIO spte with an invalid generation number; it
3436          * needs to be updated by the slow page fault path.
3437          */
3438         if (unlikely(error_code & PFERR_RSVD_MASK))
3439                 return false;
3440 
3441         /* See if the page fault is due to an NX violation */
3442         if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
3443                       == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
3444                 return false;
3445 
3446         /*
3447          * #PF can be fast if:
3448          * 1. The shadow page table entry is not present, which could mean that
3449          *    the fault is potentially caused by access tracking (if enabled).
3450          * 2. The shadow page table entry is present and the fault
3451          *    is caused by write-protect; that means we just need to change
3452          *    the W bit of the spte, which can be done outside of mmu_lock.
3453          *
3454          * However, if access tracking is disabled we know that a non-present
3455          * page must be a genuine page fault where we have to create a new SPTE.
3456          * So, if access tracking is disabled, we return true only for write
3457          * accesses to a present page.
3458          */
3459 
3460         return shadow_acc_track_mask != 0 ||
3461                ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
3462                 == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
3463 }
3464 
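/*
 * [Editor's illustration, not part of mmu.c] The same decision as
 * page_fault_can_be_fast() above, restated over the architectural #PF
 * error-code bits (P = bit 0, W = bit 1, RSVD = bit 3, I/D = bit 4), with
 * access tracking passed in as a plain flag.  Names are hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>

#define EX_PFERR_PRESENT (1u << 0)
#define EX_PFERR_WRITE   (1u << 1)
#define EX_PFERR_RSVD    (1u << 3)
#define EX_PFERR_FETCH   (1u << 4)

static bool ex_page_fault_can_be_fast(uint32_t err, bool acc_track_enabled)
{
        /* MMIO sptes with a stale generation must go down the slow path. */
        if (err & EX_PFERR_RSVD)
                return false;

        /* NX violations (fetch of a present page) are never fast. */
        if ((err & (EX_PFERR_FETCH | EX_PFERR_PRESENT)) ==
            (EX_PFERR_FETCH | EX_PFERR_PRESENT))
                return false;

        /* Otherwise: any fault when access tracking may be involved, or a
         * write-protection fault on a present page. */
        return acc_track_enabled ||
               ((err & (EX_PFERR_WRITE | EX_PFERR_PRESENT)) ==
                (EX_PFERR_WRITE | EX_PFERR_PRESENT));
}
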
3465 /*
3466  * Returns true if the SPTE was fixed successfully. Otherwise,
3467  * someone else modified the SPTE from its original value.
3468  */
3469 static bool
3470 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
3471                         u64 *sptep, u64 old_spte, u64 new_spte)
3472 {
3473         gfn_t gfn;
3474 
3475         WARN_ON(!sp->role.direct);
3476 
3477         /*
3478          * Theoretically we could also set dirty bit (and flush TLB) here in
3479          * order to eliminate unnecessary PML logging. See comments in
3480          * set_spte. But fast_page_fault is very unlikely to happen with PML
3481          * enabled, so we do not do this. This might result in the same GPA
3482          * being logged in the PML buffer again when the write really happens,
3483          * and eventually in mark_page_dirty being called twice. But that is
3484          * harmless. This also avoids the TLB flush needed after setting the
3485          * dirty bit, so non-PML cases won't be impacted.
3486          *
3487          * Compare with set_spte where instead shadow_dirty_mask is set.
3488          */
3489         if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
3490                 return false;
3491 
3492         if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
3493                 /*
3494                  * The gfn of a direct spte is stable since it is
3495                  * calculated from sp->gfn.
3496                  */
3497                 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
3498                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3499         }
3500 
3501         return true;
3502 }
3503 
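/*
 * [Editor's illustration, not part of mmu.c] The core of the lockless fix in
 * fast_pf_fix_direct_spte() is "install new_spte only if the SPTE still holds
 * the value we based new_spte on".  A minimal C11 sketch of that pattern,
 * with compare-and-exchange standing in for cmpxchg64(); names are
 * hypothetical.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static bool ex_fix_spte(_Atomic uint64_t *sptep, uint64_t old_spte,
                        uint64_t new_spte)
{
        /*
         * Fails (leaving the SPTE as some other CPU wrote it) if the entry
         * changed since we read old_spte; the caller then retries or falls
         * back to the slow page fault path.
         */
        return atomic_compare_exchange_strong(sptep, &old_spte, new_spte);
}
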
3504 static bool is_access_allowed(u32 fault_err_code, u64 spte)
3505 {
3506         if (fault_err_code & PFERR_FETCH_MASK)
3507                 return is_executable_pte(spte);
3508 
3509         if (fault_err_code & PFERR_WRITE_MASK)
3510                 return is_writable_pte(spte);
3511 
3512         /* Fault was on Read access */
3513         return spte & PT_PRESENT_MASK;
3514 }
3515 
3516 /*
3517  * Return value:
3518  * - true: let the vcpu access the same address again.
3519  * - false: let the real page fault path fix it.
3520  */
3521 static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
3522                             u32 error_code)
3523 {
3524         struct kvm_shadow_walk_iterator iterator;
3525         struct kvm_mmu_page *sp;
3526         bool fault_handled = false;
3527         u64 spte = 0ull;
3528         uint retry_count = 0;
3529 
3530         if (!page_fault_can_be_fast(error_code))
3531                 return false;
3532 
3533         walk_shadow_page_lockless_begin(vcpu);
3534 
3535         do {
3536                 u64 new_spte;
3537 
3538                 for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
3539                         if (!is_shadow_present_pte(spte))
3540                                 break;
3541 
3542                 sp = page_header(__pa(iterator.sptep));
3543                 if (!is_last_spte(spte, sp->role.level))
3544                         break;
3545 
3546                 /*
3547                  * Check whether the memory access that caused the fault would
3548                  * still cause it if it were to be performed right now. If not,
3549                  * then this is a spurious fault caused by a lazily flushed TLB,
3550                  * or some other CPU has already fixed the PTE after the
3551                  * current CPU took the fault.
3552                  *
3553                  * Need not check the access of upper level table entries since
3554                  * they are always ACC_ALL.
3555                  */
3556                 if (is_access_allowed(error_code, spte)) {
3557                         fault_handled = true;
3558                         break;
3559                 }
3560 
3561                 new_spte = spte;
3562 
3563                 if (is_access_track_spte(spte))
3564                         new_spte = restore_acc_track_spte(new_spte);
3565 
3566                 /*
3567                  * Currently, to simplify the code, write-protection can
3568                  * be removed in the fast path only if the SPTE was
3569                  * write-protected for dirty-logging or access tracking.
3570                  */
3571                 if ((error_code & PFERR_WRITE_MASK) &&
3572                     spte_can_locklessly_be_made_writable(spte))
3573                 {
3574                         new_spte |= PT_WRITABLE_MASK;
3575 
3576                         /*
3577                          * Do not fix write permission on the large spte.  Since
3578                          * we only mark the first page dirty in the dirty bitmap in
3579                          * fast_pf_fix_direct_spte(), the other pages would be missed
3580                          * if its slot has dirty logging enabled.
3581                          *
3582                          * Instead, we let the slow page fault path create a
3583                          * normal spte to fix the access.
3584                          *
3585                          * See the comments in kvm_arch_commit_memory_region().
3586                          */
3587                         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3588                                 break;
3589                 }
3590 
3591                 /* Verify that the fault can be handled in the fast path */
3592                 if (new_spte == spte ||
3593                     !is_access_allowed(error_code, new_spte))
3594                         break;
3595 
3596                 /*
3597                  * Currently, fast page fault only works for direct mapping
3598                  * since the gfn is not stable for indirect shadow pages. See
3599                  * Documentation/virt/kvm/locking.txt for more detail.
3600                  */
3601                 fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
3602                                                         iterator.sptep, spte,
3603                                                         new_spte);
3604                 if (fault_handled)
3605                         break;
3606 
3607                 if (++retry_count > 4) {
3608                         printk_once(KERN_WARNING
3609                                 "kvm: Fast #PF retrying more than 4 times.\n");
3610                         break;
3611                 }
3612 
3613         } while (true);
3614 
3615         trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
3616                               spte, fault_handled);
3617         walk_shadow_page_lockless_end(vcpu);
3618 
3619         return fault_handled;
3620 }
3621 
3622 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3623                                struct list_head *invalid_list)
3624 {
3625         struct kvm_mmu_page *sp;
3626 
3627         if (!VALID_PAGE(*root_hpa))
3628                 return;
3629 
3630         sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
3631         --sp->root_count;
3632         if (!sp->root_count && sp->role.invalid)
3633                 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3634 
3635         *root_hpa = INVALID_PAGE;
3636 }
3637 
3638 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
3639 void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
3640                         ulong roots_to_free)
3641 {
3642         int i;
3643         LIST_HEAD(invalid_list);
3644         bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
3645 
3646         BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3647 
3648         /* Before acquiring the MMU lock, see if we need to do any real work. */
3649         if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
3650                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3651                         if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3652                             VALID_PAGE(mmu->prev_roots[i].hpa))
3653                                 break;
3654 
3655                 if (i == KVM_MMU_NUM_PREV_ROOTS)
3656                         return;
3657         }
3658 
3659         spin_lock(&vcpu->kvm->mmu_lock);
3660 
3661         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3662                 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3663                         mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
3664                                            &invalid_list);
3665 
3666         if (free_active_root) {
3667                 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
3668                     (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
3669                         mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
3670                                            &invalid_list);
3671                 } else {
3672                         for (i = 0; i < 4; ++i)
3673                                 if (mmu->pae_root[i] != 0)
3674                                         mmu_free_root_page(vcpu->kvm,
3675                                                            &mmu->pae_root[i],
3676                                                            &invalid_list);
3677                         mmu->root_hpa = INVALID_PAGE;
3678                 }
3679                 mmu->root_cr3 = 0;
3680         }
3681 
3682         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3683         spin_unlock(&vcpu->kvm->mmu_lock);
3684 }
3685 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3686 
3687 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3688 {
3689         int ret = 0;
3690 
3691         if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
3692                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3693                 ret = 1;
3694         }
3695 
3696         return ret;
3697 }
3698 
3699 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3700 {
3701         struct kvm_mmu_page *sp;
3702         unsigned i;
3703 
3704         if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
3705                 spin_lock(&vcpu->kvm->mmu_lock);
3706                 if (make_mmu_pages_available(vcpu) < 0) {
3707                         spin_unlock(&vcpu->kvm->mmu_lock);
3708                         return -ENOSPC;
3709                 }
3710                 sp = kvm_mmu_get_page(vcpu, 0, 0,
3711                                 vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL);
3712                 ++sp->root_count;
3713                 spin_unlock(&vcpu->kvm->mmu_lock);
3714                 vcpu->arch.mmu->root_hpa = __pa(sp->spt);
3715         } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) {
3716                 for (i = 0; i < 4; ++i) {
3717                         hpa_t root = vcpu->arch.mmu->pae_root[i];
3718 
3719                         MMU_WARN_ON(VALID_PAGE(root));
3720                         spin_lock(&vcpu->kvm->mmu_lock);
3721                         if (make_mmu_pages_available(vcpu) < 0) {
3722                                 spin_unlock(&vcpu->kvm->mmu_lock);
3723                                 return -ENOSPC;
3724                         }
3725                         sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
3726                                         i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
3727                         root = __pa(sp->spt);
3728                         ++sp->root_count;
3729                         spin_unlock(&vcpu->kvm->mmu_lock);
3730                         vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
3731                 }
3732                 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3733         } else
3734                 BUG();
3735         vcpu->arch.mmu->root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3736 
3737         return 0;
3738 }
3739 
3740 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3741 {
3742         struct kvm_mmu_page *sp;
3743         u64 pdptr, pm_mask;
3744         gfn_t root_gfn, root_cr3;
3745         int i;
3746 
3747         root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3748         root_gfn = root_cr3 >> PAGE_SHIFT;
3749 
3750         if (mmu_check_root(vcpu, root_gfn))
3751                 return 1;
3752 
3753         /*
3754          * Do we shadow a long mode page table? If so we need to
3755          * write-protect the guest's page table root.
3756          */
3757         if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3758                 hpa_t root = vcpu->arch.mmu->root_hpa;
3759 
3760                 MMU_WARN_ON(VALID_PAGE(root));
3761 
3762                 spin_lock(&vcpu->kvm->mmu_lock);
3763                 if (make_mmu_pages_available(vcpu) < 0) {
3764                         spin_unlock(&vcpu->kvm->mmu_lock);
3765                         return -ENOSPC;
3766                 }
3767                 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
3768                                 vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL);
3769                 root = __pa(sp->spt);
3770                 ++sp->root_count;
3771                 spin_unlock(&vcpu->kvm->mmu_lock);
3772                 vcpu->arch.mmu->root_hpa = root;
3773                 goto set_root_cr3;
3774         }
3775 
3776         /*
3777          * We shadow a 32 bit page table. This may be a legacy 2-level
3778          * or a PAE 3-level page table. In either case we need to be aware that
3779          * the shadow page table may be a PAE or a long mode page table.
3780          */
3781         pm_mask = PT_PRESENT_MASK;
3782         if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
3783                 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3784 
3785         for (i = 0; i < 4; ++i) {
3786                 hpa_t root = vcpu->arch.mmu->pae_root[i];
3787 
3788                 MMU_WARN_ON(VALID_PAGE(root));
3789                 if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
3790                         pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
3791                         if (!(pdptr & PT_PRESENT_MASK)) {
3792                                 vcpu->arch.mmu->pae_root[i] = 0;
3793                                 continue;
3794                         }
3795                         root_gfn = pdptr >> PAGE_SHIFT;
3796                         if (mmu_check_root(vcpu, root_gfn))
3797                                 return 1;
3798                 }
3799                 spin_lock(&vcpu->kvm->mmu_lock);
3800                 if (make_mmu_pages_available(vcpu) < 0) {
3801                         spin_unlock(&vcpu->kvm->mmu_lock);
3802                         return -ENOSPC;
3803                 }
3804                 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
3805                                       0, ACC_ALL);
3806                 root = __pa(sp->spt);
3807                 ++sp->root_count;
3808                 spin_unlock(&vcpu->kvm->mmu_lock);
3809 
3810                 vcpu->arch.mmu->pae_root[i] = root | pm_mask;
3811         }
3812         vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3813 
3814         /*
3815          * If we shadow a 32 bit page table with a long mode page
3816          * table we enter this path.
3817          */
3818         if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
3819                 if (vcpu->arch.mmu->lm_root == NULL) {
3820                         /*
3821                          * The additional page necessary for this is only
3822                          * allocated on demand.
3823                          */
3824 
3825                         u64 *lm_root;
3826 
3827                         lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3828                         if (lm_root == NULL)
3829                                 return 1;
3830 
3831                         lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
3832 
3833                         vcpu->arch.mmu->lm_root = lm_root;
3834                 }
3835 
3836                 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
3837         }
3838 
3839 set_root_cr3:
3840         vcpu->arch.mmu->root_cr3 = root_cr3;
3841 
3842         return 0;
3843 }
3844 
3845 static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
3846 {
3847         if (vcpu->arch.mmu->direct_map)
3848                 return mmu_alloc_direct_roots(vcpu);
3849         else
3850                 return mmu_alloc_shadow_roots(vcpu);
3851 }
3852 
3853 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3854 {
3855         int i;
3856         struct kvm_mmu_page *sp;
3857 
3858         if (vcpu->arch.mmu->direct_map)
3859                 return;
3860 
3861         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3862                 return;
3863 
3864         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3865 
3866         if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3867                 hpa_t root = vcpu->arch.mmu->root_hpa;
3868                 sp = page_header(root);
3869 
3870                 /*
3871                  * Even if another CPU was marking the SP as unsync-ed
3872                  * simultaneously, any guest page table changes are not
3873                  * guaranteed to be visible anyway until this VCPU issues a TLB
3874                  * flush strictly after those changes are made. We only need to
3875                  * ensure that the other CPU sets these flags before any actual
3876                  * changes to the page tables are made. The comments in
3877                  * mmu_need_write_protect() describe what could go wrong if this
3878                  * requirement isn't satisfied.
3879                  */
3880                 if (!smp_load_acquire(&sp->unsync) &&
3881                     !smp_load_acquire(&sp->unsync_children))
3882                         return;
3883 
3884                 spin_lock(&vcpu->kvm->mmu_lock);
3885                 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3886 
3887                 mmu_sync_children(vcpu, sp);
3888 
3889                 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3890                 spin_unlock(&vcpu->kvm->mmu_lock);
3891                 return;
3892         }
3893 
3894         spin_lock(&vcpu->kvm->mmu_lock);
3895         kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3896 
3897         for (i = 0; i < 4; ++i) {
3898                 hpa_t root = vcpu->arch.mmu->pae_root[i];
3899 
3900                 if (root && VALID_PAGE(root)) {
3901                         root &= PT64_BASE_ADDR_MASK;
3902                         sp = page_header(root);
3903                         mmu_sync_children(vcpu, sp);
3904                 }
3905         }
3906 
3907         kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3908         spin_unlock(&vcpu->kvm->mmu_lock);
3909 }
3910 EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
3911 
3912 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
3913                                   u32 access, struct x86_exception *exception)
3914 {
3915         if (exception)
3916                 exception->error_code = 0;
3917         return vaddr;
3918 }
3919 
3920 static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
3921                                          u32 access,
3922                                          struct x86_exception *exception)
3923 {
3924         if (exception)
3925                 exception->error_code = 0;
3926         return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
3927 }
3928 
3929 static bool
3930 __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
3931 {
3932         int bit7 = (pte >> 7) & 1;
3933 
3934         return pte & rsvd_check->rsvd_bits_mask[bit7][level-1];
3935 }
3936 
3937 static bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte)
3938 {
3939         return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
3940 }
3941 
3942 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3943 {
3944         /*
3945          * A nested guest cannot use the MMIO cache if it is using nested
3946          * page tables, because cr2 is a nGPA while the cache stores GPAs.
3947          */
3948         if (mmu_is_nested(vcpu))
3949                 return false;
3950 
3951         if (direct)
3952                 return vcpu_match_mmio_gpa(vcpu, addr);
3953 
3954         return vcpu_match_mmio_gva(vcpu, addr);
3955 }
3956 
3957 /* return true if reserved bit is detected on spte. */
3958 static bool
3959 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
3960 {
3961         struct kvm_shadow_walk_iterator iterator;
3962         u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
3963         struct rsvd_bits_validate *rsvd_check;
3964         int root, leaf;
3965         bool reserved = false;
3966 
3967         rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
3968 
3969         walk_shadow_page_lockless_begin(vcpu);
3970 
3971         for (shadow_walk_init(&iterator, vcpu, addr),
3972                  leaf = root = iterator.level;
3973              shadow_walk_okay(&iterator);
3974              __shadow_walk_next(&iterator, spte)) {
3975                 spte = mmu_spte_get_lockless(iterator.sptep);
3976 
3977                 sptes[leaf - 1] = spte;
3978                 leaf--;
3979 
3980                 if (!is_shadow_present_pte(spte))
3981                         break;
3982 
3983                 /*
3984                  * Use a bitwise-OR instead of a logical-OR to aggregate the
3985                  * reserved bit and EPT's invalid memtype/XWR checks to avoid
3986                  * adding a Jcc in the loop.
3987                  */
3988                 reserved |= __is_bad_mt_xwr(rsvd_check, spte) |
3989                             __is_rsvd_bits_set(rsvd_check, spte, iterator.level);
3990         }
3991 
3992         walk_shadow_page_lockless_end(vcpu);
3993 
3994         if (reserved) {
3995                 pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
3996                        __func__, addr);
3997                 while (root > leaf) {
3998                         pr_err("------ spte 0x%llx level %d.\n",
3999                                sptes[root - 1], root);
4000                         root--;
4001                 }
4002         }
4003 
4004         *sptep = spte;
4005         return reserved;
4006 }
4007 
4008 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4009 {
4010         u64 spte;
4011         bool reserved;
4012 
4013         if (mmio_info_in_cache(vcpu, addr, direct))
4014                 return RET_PF_EMULATE;
4015 
4016         reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
4017         if (WARN_ON(reserved))
4018                 return -EINVAL;
4019 
4020         if (is_mmio_spte(spte)) {
4021                 gfn_t gfn = get_mmio_spte_gfn(spte);
4022                 unsigned int access = get_mmio_spte_access(spte);
4023 
4024                 if (!check_mmio_spte(vcpu, spte))
4025                         return RET_PF_INVALID;
4026 
4027                 if (direct)
4028                         addr = 0;
4029 
4030                 trace_handle_mmio_page_fault(addr, gfn, access);
4031                 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
4032                 return RET_PF_EMULATE;
4033         }
4034 
4035         /*
4036          * If the page table is zapped by other CPUs, let the CPU fault again on
4037          * the address.
4038          */
4039         return RET_PF_RETRY;
4040 }
4041 
4042 static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
4043                                          u32 error_code, gfn_t gfn)
4044 {
4045         if (unlikely(error_code & PFERR_RSVD_MASK))
4046                 return false;
4047 
4048         if (!(error_code & PFERR_PRESENT_MASK) ||
4049               !(error_code & PFERR_WRITE_MASK))
4050                 return false;
4051 
4052         /*
4053          * The guest is writing a page which is write-tracked, which cannot
4054          * be fixed by the page fault handler.
4055          */
4056         if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
4057                 return true;
4058 
4059         return false;
4060 }
4061 
4062 static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
4063 {
4064         struct kvm_shadow_walk_iterator iterator;
4065         u64 spte;
4066 
4067         walk_shadow_page_lockless_begin(vcpu);
4068         for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
4069                 clear_sp_write_flooding_count(iterator.sptep);
4070                 if (!is_shadow_present_pte(spte))
4071                         break;
4072         }
4073         walk_shadow_page_lockless_end(vcpu);
4074 }
4075 
4076 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
4077                                    gfn_t gfn)
4078 {
4079         struct kvm_arch_async_pf arch;
4080 
4081         arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
4082         arch.gfn = gfn;
4083         arch.direct_map = vcpu->arch.mmu->direct_map;
4084         arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
4085 
4086         return kvm_setup_async_pf(vcpu, cr2_or_gpa,
4087                                   kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
4088 }
4089 
4090 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
4091                          gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
4092                          bool *writable)
4093 {
4094         struct kvm_memory_slot *slot;
4095         bool async;
4096 
4097         /*
4098          * Don't expose private memslots to L2.
4099          */
4100         if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
4101                 *pfn = KVM_PFN_NOSLOT;
4102                 return false;
4103         }
4104 
4105         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
4106         async = false;
4107         *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
4108         if (!async)
4109                 return false; /* *pfn has correct page already */
4110 
4111         if (!prefault && kvm_can_do_async_pf(vcpu)) {
4112                 trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
4113                 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
4114                         trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
4115                         kvm_make_request(KVM_REQ_APF_HALT, vcpu);
4116                         return true;
4117                 } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
4118                         return true;
4119         }
4120 
4121         *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
4122         return false;
4123 }
4124 
4125 static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
4126                              bool prefault, int max_level, bool is_tdp)
4127 {
4128         bool write = error_code & PFERR_WRITE_MASK;
4129         bool exec = error_code & PFERR_FETCH_MASK;
4130         bool lpage_disallowed = exec && is_nx_huge_page_enabled();
4131         bool map_writable;
4132 
4133         gfn_t gfn = gpa >> PAGE_SHIFT;
4134         unsigned long mmu_seq;
4135         kvm_pfn_t pfn;
4136         int r;
4137 
4138         if (page_fault_handle_page_track(vcpu, error_code, gfn))
4139                 return RET_PF_EMULATE;
4140 
4141         r = mmu_topup_memory_caches(vcpu);
4142         if (r)
4143                 return r;
4144 
4145         if (lpage_disallowed)
4146                 max_level = PT_PAGE_TABLE_LEVEL;
4147 
4148         if (fast_page_fault(vcpu, gpa, error_code))
4149                 return RET_PF_RETRY;
4150 
4151         mmu_seq = vcpu->kvm->mmu_notifier_seq;
4152         smp_rmb();
4153 
4154         if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
4155                 return RET_PF_RETRY;
4156 
4157         if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
4158                 return r;
4159 
4160         r = RET_PF_RETRY;
4161         spin_lock(&vcpu->kvm->mmu_lock);
4162         if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
4163                 goto out_unlock;
4164         if (make_mmu_pages_available(vcpu) < 0)
4165                 goto out_unlock;
4166         r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn,
4167                          prefault, is_tdp && lpage_disallowed);
4168 
4169 out_unlock:
4170         spin_unlock(&vcpu->kvm->mmu_lock);
4171         kvm_release_pfn_clean(pfn);
4172         return r;
4173 }
4174 
4175 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
4176                                 u32 error_code, bool prefault)
4177 {
4178         pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
4179 
4180         /* This path builds a PAE page table, so 2MB is the largest page we can map. */
4181         return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault,
4182                                  PT_DIRECTORY_LEVEL, false);
4183 }
4184 
4185 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4186                                 u64 fault_address, char *insn, int insn_len)
4187 {
4188         int r = 1;
4189 
4190 #ifndef CONFIG_X86_64
4191         /* A 64-bit CR2 should be impossible on 32-bit KVM. */
4192         if (WARN_ON_ONCE(fault_address >> 32))
4193                 return -EFAULT;
4194 #endif
4195 
4196         vcpu->arch.l1tf_flush_l1d = true;
4197         switch (vcpu->arch.apf.host_apf_reason) {
4198         default:
4199                 trace_kvm_page_fault(fault_address, error_code);
4200 
4201                 if (kvm_event_needs_reinjection(vcpu))
4202                         kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4203                 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4204                                 insn_len);
4205                 break;
4206         case KVM_PV_REASON_PAGE_NOT_PRESENT:
4207                 vcpu->arch.apf.host_apf_reason = 0;
4208                 local_irq_disable();
4209                 kvm_async_pf_task_wait(fault_address, 0);
4210                 local_irq_enable();
4211                 break;
4212         case KVM_PV_REASON_PAGE_READY:
4213                 vcpu->arch.apf.host_apf_reason = 0;
4214                 local_irq_disable();
4215                 kvm_async_pf_task_wake(fault_address);
4216                 local_irq_enable();
4217                 break;
4218         }
4219         return r;
4220 }
4221 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4222 
4223 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
4224                        bool prefault)
4225 {
4226         int max_level;
4227 
4228         for (max_level = PT_MAX_HUGEPAGE_LEVEL;
4229              max_level > PT_PAGE_TABLE_LEVEL;
4230              max_level--) {
4231                 int page_num = KVM_PAGES_PER_HPAGE(max_level);
4232                 gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1);
4233 
4234                 if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
4235                         break;
4236         }
4237 
4238         return direct_page_fault(vcpu, gpa, error_code, prefault,
4239                                  max_level, true);
4240 }
4241 
4242 static void nonpaging_init_context(struct kvm_vcpu *vcpu,
4243                                    struct kvm_mmu *context)
4244 {
4245         context->page_fault = nonpaging_page_fault;
4246         context->gva_to_gpa = nonpaging_gva_to_gpa;
4247         context->sync_page = nonpaging_sync_page;
4248         context->invlpg = nonpaging_invlpg;
4249         context->update_pte = nonpaging_update_pte;
4250         context->root_level = 0;
4251         context->shadow_root_level = PT32E_ROOT_LEVEL;
4252         context->direct_map = true;
4253         context->nx = false;
4254 }
4255 
4256 /*
4257  * Find out if a previously cached root matching the new CR3/role is available.
4258  * The current root is also inserted into the cache.
4259  * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
4260  * returned.
4261  * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
4262  * false is returned. This root should now be freed by the caller.
4263  */
4264 static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4265                                   union kvm_mmu_page_role new_role)
4266 {
4267         uint i;
4268         struct kvm_mmu_root_info root;
4269         struct kvm_mmu *mmu = vcpu->arch.mmu;
4270 
4271         root.cr3 = mmu->root_cr3;
4272         root.hpa = mmu->root_hpa;
4273 
4274         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4275                 swap(root, mmu->prev_roots[i]);
4276 
4277                 if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
4278                     page_header(root.hpa) != NULL &&
4279                     new_role.word == page_header(root.hpa)->role.word)
4280                         break;
4281         }
4282 
4283         mmu->root_hpa = root.hpa;
4284         mmu->root_cr3 = root.cr3;
4285 
4286         return i < KVM_MMU_NUM_PREV_ROOTS;
4287 }
4288 
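/*
 * [Editor's illustration, not part of mmu.c] cached_root_available() above
 * rotates the current root through the prev_roots[] array with swap(): the
 * active root is pushed in at slot 0, everything shifts toward the LRU end,
 * and the loop stops early if a slot matching the new CR3 pops out.  A
 * standalone sketch of that rotation over plain integers; the names and the
 * array size are hypothetical, and the role check is omitted for brevity.
 */
#include <stdbool.h>
#include <stdint.h>

#define EX_NUM_PREV_ROOTS 3

static bool ex_cached_root_available(uint64_t new_cr3, uint64_t *current_root,
                                     uint64_t prev_roots[EX_NUM_PREV_ROOTS])
{
        uint64_t root = *current_root;
        int i;

        for (i = 0; i < EX_NUM_PREV_ROOTS; i++) {
                uint64_t tmp = prev_roots[i];   /* swap(root, prev_roots[i]) */

                prev_roots[i] = root;
                root = tmp;
                if (root == new_cr3)
                        break;
        }

        /* Either the matching cached root or the evicted LRU root ends up as
         * the current one; the caller frees it in the latter case. */
        *current_root = root;
        return i < EX_NUM_PREV_ROOTS;
}
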
4289 static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4290                             union kvm_mmu_page_role new_role,
4291                             bool skip_tlb_flush)
4292 {
4293         struct kvm_mmu *mmu = vcpu->arch.mmu;
4294 
4295         /*
4296          * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
4297          * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
4298          * later if necessary.
4299          */
4300         if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
4301             mmu->root_level >= PT64_ROOT_4LEVEL) {
4302                 if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
4303                         return false;
4304 
4305                 if (cached_root_available(vcpu, new_cr3, new_role)) {
4306                         /*
4307                          * It is possible that the cached previous root page is
4308                          * obsolete because of a change in the MMU generation
4309                          * number. However, changing the generation number is
4310                          * accompanied by KVM_REQ_MMU_RELOAD, which will free
4311                          * the root set here and allocate a new one.
4312                          */
4313                         kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
4314                         if (!skip_tlb_flush) {
4315                                 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4316                                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
4317                         }
4318 
4319                         /*
4320                          * The last MMIO access's GVA and GPA are cached in the
4321                          * VCPU. When switching to a new CR3, that GVA->GPA
4322                          * mapping may no longer be valid. So clear any cached
4323                          * MMIO info even when we don't need to sync the shadow
4324                          * page tables.
4325                          */
4326                         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4327 
4328                         __clear_sp_write_flooding_count(
4329                                 page_header(mmu->root_hpa));
4330 
4331                         return true;
4332                 }
4333         }
4334 
4335         return false;
4336 }
4337 
4338 static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4339                               union kvm_mmu_page_role new_role,
4340                               bool skip_tlb_flush)
4341 {
4342         if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
4343                 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu,
4344                                    KVM_MMU_ROOT_CURRENT);
4345 }
4346 
4347 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
4348 {
4349         __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
4350                           skip_tlb_flush);
4351 }
4352 EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
4353 
4354 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4355 {
4356         return kvm_read_cr3(vcpu);
4357 }
4358 
4359 static void inject_page_fault(struct kvm_vcpu *vcpu,
4360                               struct x86_exception *fault)
4361 {
4362         vcpu->arch.mmu->inject_page_fault(vcpu, fault);
4363 }
4364 
4365 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4366                            unsigned int access, int *nr_present)
4367 {
4368         if (unlikely(is_mmio_spte(*sptep))) {
4369                 if (gfn != get_mmio_spte_gfn(*sptep)) {
4370                         mmu_spte_clear_no_track(sptep);
4371                         return true;
4372                 }
4373 
4374                 (*nr_present)++;
4375                 mark_mmio_spte(vcpu, sptep, gfn, access);
4376                 return true;
4377         }
4378 
4379         return false;
4380 }
4381 
4382 static inline bool is_last_gpte(struct kvm_mmu *mmu,
4383                                 unsigned level, unsigned gpte)
4384 {
4385         /*
4386          * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
4387          * If it is clear, there are no large pages at this level, so clear
4388          * PT_PAGE_SIZE_MASK in gpte if that is the case.
4389          */
4390         gpte &= level - mmu->last_nonleaf_level;
4391 
4392         /*
4393          * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
4394          * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
4395          * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
4396          */
4397         gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
4398 
4399         return gpte & PT_PAGE_SIZE_MASK;
4400 }
4401 
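/*
 * [Editor's illustration, not part of mmu.c] The bit trick in is_last_gpte()
 * above works because an unsigned subtraction that "goes negative" wraps
 * around and sets every bit, including bit 7 (PT_PAGE_SIZE_MASK), while a
 * small non-negative result leaves bit 7 clear.  A standalone walk-through,
 * assuming bit 7 is the PS bit and PT_PAGE_TABLE_LEVEL is 1; the
 * last_nonleaf_level value used below is only an example.
 */
#include <stdbool.h>
#include <stdio.h>

#define EX_PT_PAGE_SIZE_MASK (1u << 7)

static bool ex_is_last_gpte(unsigned int level, unsigned int last_nonleaf_level,
                            unsigned int gpte)
{
        gpte &= level - last_nonleaf_level; /* keeps PS only if level < last_nonleaf_level */
        gpte |= level - 1 - 1;              /* forces PS at level 1: 4K always terminates  */
        return gpte & EX_PT_PAGE_SIZE_MASK;
}

int main(void)
{
        for (unsigned int level = 1; level <= 4; level++)
                printf("level %u, PS bit set in gpte -> last gpte? %d\n",
                       level, ex_is_last_gpte(level, 4, EX_PT_PAGE_SIZE_MASK));
        return 0;   /* prints 1 for levels 1-3 and 0 for level 4 */
}
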
4402 #define PTTYPE_EPT 18 /* arbitrary */
4403 #define PTTYPE PTTYPE_EPT
4404 #include "paging_tmpl.h"
4405 #undef PTTYPE
4406 
4407 #define PTTYPE 64
4408 #include "paging_tmpl.h"
4409 #undef PTTYPE
4410 
4411 #define PTTYPE 32
4412 #include "paging_tmpl.h"
4413 #undef PTTYPE
4414 
4415 static void
4416 __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4417                         struct rsvd_bits_validate *rsvd_check,
4418                         int maxphyaddr, int level, bool nx, bool gbpages,
4419                         bool pse, bool amd)
4420 {
4421         u64 exb_bit_rsvd = 0;
4422         u64 gbpages_bit_rsvd = 0;
4423         u64 nonleaf_bit8_rsvd = 0;
4424 
4425         rsvd_check->bad_mt_xwr = 0;
4426 
4427         if (!nx)
4428                 exb_bit_rsvd = rsvd_bits(63, 63);
4429         if (!gbpages)
4430                 gbpages_bit_rsvd = rsvd_bits(7, 7);
4431 
4432         /*
4433          * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
4434          * leaf entries) on AMD CPUs only.
4435          */
4436         if (amd)
4437                 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4438 
4439         switch (level) {
4440         case PT32_ROOT_LEVEL:
4441                 /* no rsvd bits for 2 level 4K page table entries */
4442                 rsvd_check->rsvd_bits_mask[0][1] = 0;
4443                 rsvd_check->rsvd_bits_mask[0][0] = 0;
4444                 rsvd_check->rsvd_bits_mask[1][0] =
4445                         rsvd_check->rsvd_bits_mask[0][0];
4446 
4447                 if (!pse) {
4448                         rsvd_check->rsvd_bits_mask[1][1] = 0;
4449                         break;
4450                 }
4451 
4452                 if (is_cpuid_PSE36())
4453                         /* 36-bit PSE 4MB page */
4454                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4455                 else
4456                         /* 32-bit PSE 4MB page */
4457                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4458                 break;
4459         case PT32E_ROOT_LEVEL:
4460                 rsvd_check->rsvd_bits_mask[0][2] =
4461                         rsvd_bits(maxphyaddr, 63) |
4462                         rsvd_bits(5, 8) | rsvd_bits(1, 2);      /* PDPTE */
4463                 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4464                         rsvd_bits(maxphyaddr, 62);      /* PDE */
4465                 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4466                         rsvd_bits(maxphyaddr, 62);      /* PTE */
4467                 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4468                         rsvd_bits(maxphyaddr, 62) |
4469                         rsvd_bits(13, 20);              /* large page */
4470                 rsvd_check->rsvd_bits_mask[1][0] =
4471                         rsvd_check->rsvd_bits_mask[0][0];
4472                 break;
4473         case PT64_ROOT_5LEVEL:
4474                 rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
4475                         nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4476                         rsvd_bits(maxphyaddr, 51);
4477                 rsvd_check->rsvd_bits_mask[1][4] =
4478                         rsvd_check->rsvd_bits_mask[0][4];
4479                 /* fall through */
4480         case PT64_ROOT_4LEVEL:
4481                 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
4482                         nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4483                         rsvd_bits(maxphyaddr, 51);
4484                 rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
4485                         nonleaf_bit8_rsvd | gbpages_bit_rsvd |
4486                         rsvd_bits(maxphyaddr, 51);
4487                 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4488                         rsvd_bits(maxphyaddr, 51);
4489                 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4490                         rsvd_bits(maxphyaddr, 51);
4491                 rsvd_check->rsvd_bits_mask[1][3] =
4492                         rsvd_check->rsvd_bits_mask[0][3];
4493                 rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
4494                         gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
4495                         rsvd_bits(13, 29);
4496                 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4497                         rsvd_bits(maxphyaddr, 51) |
4498                         rsvd_bits(13, 20);              /* large page */
4499                 rsvd_check->rsvd_bits_mask[1][0] =
4500                         rsvd_check->rsvd_bits_mask[0][0];
4501                 break;
4502         }
4503 }
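
     /*
      * A worked example of the resulting tables, assuming rsvd_bits(s, e)
      * builds a mask of bits s..e inclusive and that the first index is
      * selected by the gpte's bit 7 (PS) while the second is level - 1.
      * For a 4-level guest with MAXPHYADDR = 40, NX enabled (so
      * exb_bit_rsvd = 0), no 1GB pages, on an Intel CPU:
      *
      *   [0][3] = [1][3] (PML4E):  bits 51:40 and bit 7 reserved, so a
      *                             PML4E may never claim to map a huge page
      *   [1][2] (1GB PDPTE):       bit 7, bits 51:40 and bits 29:13; with
      *                             !gbpages the PS bit itself is reserved,
      *                             so 1GB mappings always fault
      *   [1][1] (2MB PDE):         bits 51:40 and bits 20:13, since the 2MB
      *                             frame address only starts at bit 21
      *   [0][0] (PTE):             bits 51:40 only
      */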
4504 
4505 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4506                                   struct kvm_mmu *context)
4507 {
4508         __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
4509                                 cpuid_maxphyaddr(vcpu), context->root_level,
4510                                 context->nx,
4511                                 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4512                                 is_pse(vcpu), guest_cpuid_is_amd(vcpu));
4513 }
4514 
4515 static void
4516 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4517                             int maxphyaddr, bool execonly)
4518 {
4519         u64 bad_mt_xwr;
4520 
4521         rsvd_check->rsvd_bits_mask[0][4] =
4522                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4523         rsvd_check->rsvd_bits_mask[0][3] =
4524                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4525         rsvd_check->rsvd_bits_mask[0][2] =
4526                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4527         rsvd_check->rsvd_bits_mask[0][1] =
4528                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4529         rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
4530 
4531         /* large page */
4532         rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4533         rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4534         rsvd_check->rsvd_bits_mask[1][2] =
4535                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
4536         rsvd_check->rsvd_bits_mask[1][1] =
4537                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
4538         rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4539 
4540         bad_mt_xwr = 0xFFull << (2 * 8);        /* bits 3..5 must not be 2 */
4541         bad_mt_xwr |= 0xFFull << (3 * 8);       /* bits 3..5 must not be 3 */
4542         bad_mt_xwr |= 0xFFull << (7 * 8);       /* bits 3..5 must not be 7 */
4543         bad_mt_xwr |= REPEAT_BYTE(1ull << 2);   /* bits 0..2 must not be 010 */
4544         bad_mt_xwr |= REPEAT_BYTE(1ull << 6);   /* bits 0..2 must not be 110 */
4545         if (!execonly) {
4546                 /* bits 0..2 must not be 100 unless VMX capabilities allow it */
4547                 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4548         }
4549         rsvd_check->bad_mt_xwr = bad_mt_xwr;
4550 }
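
     /*
      * A worked view of bad_mt_xwr, assuming the reserved-bit check uses the
      * low 6 bits of the EPT entry (memory type in bits 5:3, XWR in bits 2:0)
      * as a bit index into it:
      *
      *   write-only leaf, WB memtype (6):    index = (6 << 3) | 2 = 50, set
      *       by REPEAT_BYTE(1ull << 2) -> rejected
      *   execute-only leaf, WB memtype (6):  index = (6 << 3) | 4 = 52, set
      *       only when !execonly -> rejected unless the VMX capability
      *       allows execute-only mappings
      *   any leaf with memory type 2:        indices 16..23, all set by
      *       0xFFull << (2 * 8) -> the reserved memory type always faults
      */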
4551 
4552 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4553                 struct kvm_mmu *context, bool execonly)
4554 {
4555         __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4556                                     cpuid_maxphyaddr(vcpu), execonly);
4557 }
4558 
4559 /*
4560  * The page table on the host is the shadow page table for the page
4561  * table in the guest or an AMD nested guest: its MMU features
4562  * completely follow the features of the guest.
4563  */
4564 void
4565 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
4566 {
4567         bool uses_nx = context->nx ||
4568                 context->mmu_role.base.smep_andnot_wp;
4569         struct rsvd_bits_validate *shadow_zero_check;
4570         int i;
4571 
4572         /*
4573          * Passing "true" to the last argument is okay; it adds a check
4574          * on bit 8 of the SPTEs which KVM doesn't use anyway.
4575          */
4576         shadow_zero_check = &context->shadow_zero_check;
4577         __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4578                                 shadow_phys_bits,
4579                                 context->shadow_root_level, uses_nx,
4580                                 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4581                                 is_pse(vcpu), true);
4582 
4583         if (!shadow_me_mask)
4584                 return;
4585 
4586         for (i = context->shadow_root_level; --i >= 0;) {
4587                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4588                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4589         }
4590 
4591 }
4592 EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
4593 
4594 static inline bool boot_cpu_is_amd(void)
4595 {
4596         WARN_ON_ONCE(!tdp_enabled);
4597         return shadow_x_mask == 0;
4598 }
4599 
4600 /*
4601  * The direct page table on the host uses as many MMU features as
4602  * possible; however, KVM currently does not do execution-protection.
4603  */
4604 static void
4605 reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4606                                 struct kvm_mmu *context)
4607 {
4608         struct rsvd_bits_validate *shadow_zero_check;
4609         int i;
4610 
4611         shadow_zero_check = &context->shadow_zero_check;
4612 
4613         if (boot_cpu_is_amd())
4614                 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4615                                         shadow_phys_bits,
4616                                         context->shadow_root_level, false,
4617                                         boot_cpu_has(X86_FEATURE_GBPAGES),
4618                                         true, true);
4619         else
4620                 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4621                                             shadow_phys_bits,
4622                                             false);
4623 
4624         if (!shadow_me_mask)
4625                 return;
4626 
4627         for (i = context->shadow_root_level; --i >= 0;) {
4628                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4629                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4630         }
4631 }
4632 
4633 /*
4634  * Same as the comment for reset_shadow_zero_bits_mask(), except this
4635  * is the shadow page table for an Intel nested guest.
4636  */
4637 static void
4638 reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4639                                 struct kvm_mmu *context, bool execonly)
4640 {
4641         __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4642                                     shadow_phys_bits, execonly);
4643 }
4644 
4645 #define BYTE_MASK(access) \
4646         ((1 & (access) ? 2 : 0) | \
4647          (2 & (access) ? 4 : 0) | \
4648          (3 & (access) ? 8 : 0) | \
4649          (4 & (access) ? 16 : 0) | \
4650          (5 & (access) ? 32 : 0) | \
4651          (6 & (access) ? 64 : 0) | \
4652          (7 & (access) ? 128 : 0))
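
     /*
      * A worked expansion, assuming ACC_EXEC_MASK, ACC_WRITE_MASK and
      * ACC_USER_MASK are bits 0, 1 and 2 of a pte access value (as in the
      * mmu headers):
      *
      *   BYTE_MASK(ACC_EXEC_MASK)  = 0xaa   (bit i set iff i & 1)
      *   BYTE_MASK(ACC_WRITE_MASK) = 0xcc   (bit i set iff i & 2)
      *   BYTE_MASK(ACC_USER_MASK)  = 0xf0   (bit i set iff i & 4)
      *
      * i.e. bit i of BYTE_MASK(m) is set iff access combination i (a UWX
      * value from 0 to 7) includes the right m, which is exactly the shape
      * needed for the ~x/~w/~u fault masks built in
      * update_permission_bitmask() below.
      */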
4653 
4654 
4655 static void update_permission_bitmask(struct kvm_vcpu *vcpu,
4656                                       struct kvm_mmu *mmu, bool ept)
4657 {
4658         unsigned byte;
4659 
4660         const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4661         const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4662         const u8 u = BYTE_MASK(ACC_USER_MASK);
4663 
4664         bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
4665         bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
4666         bool cr0_wp = is_write_protection(vcpu);
4667 
4668         for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4669                 unsigned pfec = byte << 1;
4670 
4671                 /*
4672                  * Each "*f" variable has a 1 bit for each UWX value
4673                  * that causes a fault with the given PFEC.
4674                  */
4675 
4676                 /* Faults from writes to non-writable pages */
4677                 u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
4678                 /* Faults from user mode accesses to supervisor pages */
4679                 u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
4680                 /* Faults from fetches of non-executable pages */
4681                 u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
4682                 /* Faults from kernel mode fetches of user pages */
4683                 u8 smepf = 0;
4684                 /* Faults from kernel mode accesses of user pages */
4685                 u8 smapf = 0;
4686 
4687                 if (!ept) {
4688                         /* Faults from kernel mode accesses to user pages */
4689                         u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4690 
4691                         /* Not really needed: !nx will cause pte.nx to fault */
4692                         if (!mmu->nx)
4693                                 ff = 0;
4694 
4695                         /* Allow supervisor writes if !cr0.wp */
4696                         if (!cr0_wp)
4697                                 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4698 
4699                         /* Disallow supervisor fetches of user code if cr4.smep */
4700                         if (cr4_smep)
4701                                 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4702 
4703                         /*
4704                          * SMAP:kernel-mode data accesses from user-mode
4705                          * mappings should fault. A fault is considered
4706                          * as a SMAP violation if all of the following
4707                          * conditions are true:
4708                          *   - X86_CR4_SMAP is set in CR4
4709                          *   - A user page is accessed
4710                          *   - The access is not a fetch
4711                          *   - Page fault in kernel mode
4712                          *   - CPL = 3 or X86_EFLAGS_AC is clear
4713                          *
4714                          * Here, we cover the first four conditions.
4715                          * The last one is computed dynamically in permission_fault();
4716                          * PFERR_RSVD_MASK bit will be set in PFEC if the access is
4717                          * *not* subject to SMAP restrictions.
4718                          */
4719                         if (cr4_smap)
4720                                 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4721                 }
4722 
4723                 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
4724         }
4725 }
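
     /*
      * A worked example of how the table is consumed, assuming the check in
      * permission_fault() is roughly
      *
      *     fault = (mmu->permissions[pfec >> 1] >> pte_access) & 1;
      *
      * plus the SMAP/PKU adjustments made there.  Take a user-mode write,
      * i.e. PFEC.W and PFEC.U set, so byte = 3.  SMEP/SMAP do not apply to
      * user accesses, hence:
      *
      *     wf = ~w = 0x33, uf = ~u = 0x0f, ff = smepf = smapf = 0
      *     permissions[3] = 0x3f
      *
      * A read-only, executable user page has pte_access = UWX = 101b = 5,
      * and bit 5 of 0x3f is set -> fault.  A writable user page has
      * pte_access = 7, and bit 7 of 0x3f is clear -> access allowed.
      */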
4726 
4727 /*
4728  * PKU is an additional mechanism by which paging controls access to
4729  * user-mode addresses based on the value in the PKRU register.  Protection
4730  * key violations are reported through a bit in the page fault error code.
4731  * Unlike other bits of the error code, the PK bit is not known at the
4732  * call site of e.g. gva_to_gpa; it must be computed directly in
4733  * permission_fault based on two bits of PKRU, on some machine state (CR4,
4734  * CR0, EFER, CPL), and on other bits of the error code and the page tables.
4735  *
4736  * In particular, the following conditions come from the error code, the
4737  * page tables and the machine state:
4738  * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
4739  * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
4740  * - PK is always zero if U=0 in the page tables
4741  * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
4742  *
4743  * The PKRU bitmask caches the result of these four conditions.  The error
4744  * code (minus the P bit) and the page table's U bit form an index into the
4745  * PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
4746  * with the two bits of the PKRU register corresponding to the protection key.
4747  * For the first three conditions above the bits will be 00, thus masking
4748  * away both AD and WD.  For all reads, or if the last condition holds, only
4749  * WD is masked away.
4750  */
4751 static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
4752                                 bool ept)
4753 {
4754         unsigned bit;
4755         bool wp;
4756 
4757         if (ept) {
4758                 mmu->pkru_mask = 0;
4759                 return;
4760         }
4761 
4762         /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
4763         if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
4764                 mmu->pkru_mask = 0;
4765                 return;
4766         }
4767 
4768         wp = is_write_protection(vcpu);
4769 
4770         for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4771                 unsigned pfec, pkey_bits;
4772                 bool check_pkey, check_write, ff, uf, wf, pte_user;
4773 
4774                 pfec = bit << 1;
4775                 ff = pfec & PFERR_FETCH_MASK;
4776                 uf = pfec & PFERR_USER_MASK;
4777                 wf = pfec & PFERR_WRITE_MASK;
4778 
4779                 /* PFEC.RSVD is replaced by ACC_USER_MASK. */
4780                 pte_user = pfec & PFERR_RSVD_MASK;
4781 
4782                 /*
4783                  * Only need to check the access which is not an
4784                  * instruction fetch and is to a user page.
4785                  */
4786                 check_pkey = (!ff && pte_user);
4787                 /*
4788                  * write access is controlled by PKRU if it is a
4789                  * user access or CR0.WP = 1.
4790                  */
4791                 check_write = check_pkey && wf && (uf || wp);
4792 
4793                 /* PKRU.AD stops both read and write access. */
4794                 pkey_bits = !!check_pkey;
4795                 /* PKRU.WD stops write access. */
4796                 pkey_bits |= (!!check_write) << 1;
4797 
4798                 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
4799         }
4800 }
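
     /*
      * A worked example, using only the PFEC bit positions (W = bit 1,
      * RSVD = bit 3; per the comment above, RSVD stands in for "U=1 in the
      * page tables" here).  A supervisor write to a user page with
      * CR0.WP = 1 gives pfec = PFERR_WRITE_MASK | PFERR_RSVD_MASK = 0xa,
      * i.e. bit = 5, and the update targets pkru_mask bits 11:10.  Both
      * check_pkey and check_write are true, so those bits become 11b: both
      * PKRU.AD and PKRU.WD of the page's protection key can block the
      * access.  With CR0.WP = 0 the same case stores 01b, i.e. only PKRU.AD
      * is honoured, matching the last bullet of the big PKU comment.
      */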
4801 
4802 static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
4803 {
4804         unsigned root_level = mmu->root_level;
4805 
4806         mmu->last_nonleaf_level = root_level;
4807         if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
4808                 mmu->last_nonleaf_level++;
4809 }
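
     /*
      * For example: a 32-bit guest with CR4.PSE = 1 has root_level =
      * PT32_ROOT_LEVEL = 2 and thus last_nonleaf_level = 3, so
      * is_last_gpte() honours bit 7 of its PDEs (4MB pages).  Without PSE
      * the value stays 2 and a PDE's bit 7 is ignored, as architected.
      */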
4810 
4811 static void paging64_init_context_common(struct kvm_vcpu *vcpu,
4812                                          struct kvm_mmu *context,
4813                                          int level)
4814 {
4815         context->nx = is_nx(vcpu);
4816         context->root_level = level;
4817 
4818         reset_rsvds_bits_mask(vcpu, context);
4819         update_permission_bitmask(vcpu, context, false);
4820         update_pkru_bitmask(vcpu, context, false);
4821         update_last_nonleaf_level(vcpu, context);
4822 
4823         MMU_WARN_ON(!is_pae(vcpu));
4824         context->page_fault = paging64_page_fault;
4825         context->gva_to_gpa = paging64_gva_to_gpa;
4826         context->sync_page = paging64_sync_page;
4827         context->invlpg = paging64_invlpg;
4828         context->update_pte = paging64_update_pte;
4829         context->shadow_root_level = level;
4830         context->direct_map = false;
4831 }
4832 
4833 static void paging64_init_context(struct kvm_vcpu *vcpu,
4834                                   struct kvm_mmu *context)
4835 {
4836         int root_level = is_la57_mode(vcpu) ?
4837                          PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4838 
4839         paging64_init_context_common(vcpu, context, root_level);
4840 }
4841 
4842 static void paging32_init_context(struct kvm_vcpu *vcpu,
4843                                   struct kvm_mmu *context)
4844 {
4845         context->nx = false;
4846         context->root_level = PT32_ROOT_LEVEL;
4847 
4848         reset_rsvds_bits_mask(vcpu, context);
4849         update_permission_bitmask(vcpu, context, false);
4850         update_pkru_bitmask(vcpu, context, false);
4851         update_last_nonleaf_level(vcpu, context);
4852 
4853         context->page_fault = paging32_page_fault;
4854         context->gva_to_gpa = paging32_gva_to_gpa;
4855         context->sync_page = paging32_sync_page;
4856         context->invlpg = paging32_invlpg;
4857         context->update_pte = paging32_update_pte;
4858         context->shadow_root_level = PT32E_ROOT_LEVEL;
4859         context->direct_map = false;
4860 }
4861 
4862 static void paging32E_init_context(struct kvm_vcpu *vcpu,
4863                                    struct kvm_mmu *context)
4864 {
4865         paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
4866 }
4867 
4868 static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
4869 {
4870         union kvm_mmu_extended_role ext = {0};
4871 
4872         ext.cr0_pg = !!is_paging(vcpu);
4873         ext.cr4_pae = !!is_pae(vcpu);
4874         ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
4875         ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
4876         ext.cr4_pse = !!is_pse(vcpu);
4877         ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
4878         ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
4879         ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
4880 
4881         ext.valid = 1;
4882 
4883         return ext;
4884 }
4885 
4886 static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
4887                                                    bool base_only)
4888 {
4889         union kvm_mmu_role role = {0};
4890 
4891         role.base.access = ACC_ALL;
4892         role.base.nxe = !!is_nx(vcpu);
4893         role.base.cr0_wp = is_write_protection(vcpu);
4894         role.base.smm = is_smm(vcpu);
4895         role.base.guest_mode = is_guest_mode(vcpu);
4896 
4897         if (base_only)
4898                 return role;
4899 
4900         role.ext = kvm_calc_mmu_role_ext(vcpu);
4901 
4902         return role;
4903 }
4904 
4905 static union kvm_mmu_role
4906 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
4907 {
4908         union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
4909 
4910         role.base.ad_disabled = (shadow_accessed_mask == 0);
4911         role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
4912         role.base.direct = true;
4913         role.base.gpte_is_8_bytes = true;
4914 
4915         return role;
4916 }
4917 
4918 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
4919 {
4920         struct kvm_mmu *context = vcpu->arch.mmu;
4921         union kvm_mmu_role new_role =
4922                 kvm_calc_tdp_mmu_root_page_role(vcpu, false);
4923 
4924         new_role.base.word &= mmu_base_role_mask.word;
4925         if (new_role.as_u64 == context->mmu_role.as_u64)
4926                 return;
4927 
4928         context->mmu_role.as_u64 = new_role.as_u64;
4929         context->page_fault = kvm_tdp_page_fault;
4930         context->sync_page = nonpaging_sync_page;
4931         context->invlpg = nonpaging_invlpg;
4932         context->update_pte = nonpaging_update_pte;
4933         context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
4934         context->direct_map = true;
4935         context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
4936         context->get_cr3 = get_cr3;
4937         context->get_pdptr = kvm_pdptr_read;
4938         context->inject_page_fault = kvm_inject_page_fault;
4939 
4940         if (!is_paging(vcpu)) {
4941                 context->nx = false;
4942                 context->gva_to_gpa = nonpaging_gva_to_gpa;
4943                 context->root_level = 0;
4944         } else if (is_long_mode(vcpu)) {
4945                 context->nx = is_nx(vcpu);
4946                 context->root_level = is_la57_mode(vcpu) ?
4947                                 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4948                 reset_rsvds_bits_mask(vcpu, context);
4949                 context->gva_to_gpa = paging64_gva_to_gpa;
4950         } else if (is_pae(vcpu)) {
4951                 context->nx = is_nx(vcpu);
4952                 context->root_level = PT32E_ROOT_LEVEL;
4953                 reset_rsvds_bits_mask(vcpu, context);
4954                 context->gva_to_gpa = paging64_gva_to_gpa;
4955         } else {
4956                 context->nx = false;
4957                 context->root_level = PT32_ROOT_LEVEL;
4958                 reset_rsvds_bits_mask(vcpu, context);
4959                 context->gva_to_gpa = paging32_gva_to_gpa;
4960         }
4961 
4962         update_permission_bitmask(vcpu, context, false);
4963         update_pkru_bitmask(vcpu, context, false);
4964         update_last_nonleaf_level(vcpu, context);
4965         reset_tdp_shadow_zero_bits_mask(vcpu, context);
4966 }
4967 
4968 static union kvm_mmu_role
4969 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
4970 {
4971         union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
4972 
4973         role.base.smep_andnot_wp = role.ext.cr4_smep &&
4974                 !is_write_protection(vcpu);
4975         role.base.smap_andnot_wp = role.ext.cr4_smap &&
4976                 !is_write_protection(vcpu);
4977         role.base.direct = !is_paging(vcpu);
4978         role.base.gpte_is_8_bytes = !!is_pae(vcpu);
4979 
4980         if (!is_long_mode(vcpu))
4981                 role.base.level = PT32E_ROOT_LEVEL;
4982         else if (is_la57_mode(vcpu))
4983                 role.base.level = PT64_ROOT_5LEVEL;
4984         else
4985                 role.base.level = PT64_ROOT_4LEVEL;
4986 
4987         return role;
4988 }
4989 
4990 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
4991 {
4992         struct kvm_mmu *context = vcpu->arch.mmu;
4993         union kvm_mmu_role new_role =
4994                 kvm_calc_shadow_mmu_root_page_role(vcpu, false);
4995 
4996         new_role.base.word &= mmu_base_role_mask.word;
4997         if (new_role.as_u64 == context->mmu_role.as_u64)
4998                 return;
4999 
5000         if (!is_paging(vcpu))
5001                 nonpaging_init_context(vcpu, context);
5002         else if (is_long_mode(vcpu))
5003                 paging64_init_context(vcpu, context);
5004         else if (is_pae(vcpu))
5005                 paging32E_init_context(vcpu, context);
5006         else
5007                 paging32_init_context(vcpu, context);
5008 
5009         context->mmu_role.as_u64 = new_role.as_u64;
5010         reset_shadow_zero_bits_mask(vcpu, context);
5011 }
5012 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
5013 
5014 static union kvm_mmu_role
5015 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
5016                                    bool execonly)
5017 {
5018         union kvm_mmu_role role = {0};
5019 
5020         /* SMM flag is inherited from root_mmu */
5021         role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
5022 
5023         role.base.level = PT64_ROOT_4LEVEL;
5024         role.base.gpte_is_8_bytes = true;
5025         role.base.direct = false;
5026         role.base.ad_disabled = !accessed_dirty;
5027         role.base.guest_mode = true;
5028         role.base.access = ACC_ALL;
5029 
5030         /*
5031          * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
5032          * SMAP variation to denote shadow EPT entries.
5033          */
5034         role.base.cr0_wp = true;
5035         role.base.smap_andnot_wp = true;
5036 
5037         role.ext = kvm_calc_mmu_role_ext(vcpu);
5038         role.ext.execonly = execonly;
5039 
5040         return role;
5041 }
5042 
5043 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
5044                              bool accessed_dirty, gpa_t new_eptp)
5045 {
5046         struct kvm_mmu *context = vcpu->arch.mmu;
5047         union kvm_mmu_role new_role =
5048                 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
5049                                                    execonly);
5050 
5051         __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);
5052 
5053         new_role.base.word &= mmu_base_role_mask.word;
5054         if (new_role.as_u64 == context->mmu_role.as_u64)
5055                 return;
5056 
5057         context->shadow_root_level = PT64_ROOT_4LEVEL;
5058 
5059         context->nx = true;
5060         context->ept_ad = accessed_dirty;
5061         context->page_fault = ept_page_fault;
5062         context->gva_to_gpa = ept_gva_to_gpa;
5063         context->sync_page = ept_sync_page;
5064         context->invlpg = ept_invlpg;
5065         context->update_pte = ept_update_pte;
5066         context->root_level = PT64_ROOT_4LEVEL;
5067         context->direct_map = false;
5068         context->mmu_role.as_u64 = new_role.as_u64;
5069 
5070         update_permission_bitmask(vcpu, context, true);
5071         update_pkru_bitmask(vcpu, context, true);
5072         update_last_nonleaf_level(vcpu, context);
5073         reset_rsvds_bits_mask_ept(vcpu, context, execonly);
5074         reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
5075 }
5076 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
5077 
5078 static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
5079 {
5080         struct kvm_mmu *context = vcpu->arch.mmu;
5081 
5082         kvm_init_shadow_mmu(vcpu);
5083         context->set_cr3           = kvm_x86_ops->set_cr3;
5084         context->get_cr3           = get_cr3;
5085         context->get_pdptr         = kvm_pdptr_read;
5086         context->inject_page_fault = kvm_inject_page_fault;
5087 }
5088 
5089 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
5090 {
5091         union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
5092         struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5093 
5094         new_role.base.word &= mmu_base_role_mask.word;
5095         if (new_role.as_u64 == g_context->mmu_role.as_u64)
5096                 return;
5097 
5098         g_context->mmu_role.as_u64 = new_role.as_u64;
5099         g_context->get_cr3           = get_cr3;
5100         g_context->get_pdptr         = kvm_pdptr_read;
5101         g_context->inject_page_fault = kvm_inject_page_fault;
5102 
5103         /*
5104          * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
5105          * L1's nested page tables (e.g. EPT12). The nested translation
5106          * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
5107          * L2's page tables as the first level of translation and L1's
5108          * nested page tables as the second level of translation. Basically
5109          * the gva_to_gpa functions between mmu and nested_mmu are swapped.
5110          */
5111         if (!is_paging(vcpu)) {
5112                 g_context->nx = false;
5113                 g_context->root_level = 0;
5114                 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
5115         } else if (is_long_mode(vcpu)) {
5116                 g_context->nx = is_nx(vcpu);
5117                 g_context->root_level = is_la57_mode(vcpu) ?
5118                                         PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
5119                 reset_rsvds_bits_mask(vcpu, g_context);
5120                 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5121         } else if (is_pae(vcpu)) {
5122                 g_context->nx = is_nx(vcpu);
5123                 g_context->root_level = PT32E_ROOT_LEVEL;
5124                 reset_rsvds_bits_mask(vcpu, g_context);
5125                 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5126         } else {
5127                 g_context->nx = false;
5128                 g_context->root_level = PT32_ROOT_LEVEL;
5129                 reset_rsvds_bits_mask(vcpu, g_context);
5130                 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
5131         }
5132 
5133         update_permission_bitmask(vcpu, g_context, false);
5134         update_pkru_bitmask(vcpu, g_context, false);
5135         update_last_nonleaf_level(vcpu, g_context);
5136 }
5137 
5138 void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
5139 {
5140         if (reset_roots) {
5141                 uint i;
5142 
5143                 vcpu->arch.mmu->root_hpa = INVALID_PAGE;
5144 
5145                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5146                         vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5147         }
5148 
5149         if (mmu_is_nested(vcpu))
5150                 init_kvm_nested_mmu(vcpu);
5151         else if (tdp_enabled)
5152                 init_kvm_tdp_mmu(vcpu);
5153         else
5154                 init_kvm_softmmu(vcpu);
5155 }
5156 EXPORT_SYMBOL_GPL(kvm_init_mmu);
5157 
5158 static union kvm_mmu_page_role
5159 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
5160 {
5161         union kvm_mmu_role role;
5162 
5163         if (tdp_enabled)
5164                 role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
5165         else
5166                 role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
5167 
5168         return role.base;
5169 }
5170 
5171 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
5172 {
5173         kvm_mmu_unload(vcpu);
5174         kvm_init_mmu(vcpu, true);
5175 }
5176 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5177 
5178 int kvm_mmu_load(struct kvm_vcpu *vcpu)
5179 {
5180         int r;
5181 
5182         r = mmu_topup_memory_caches(vcpu);
5183         if (r)
5184                 goto out;
5185         r = mmu_alloc_roots(vcpu);
5186         kvm_mmu_sync_roots(vcpu);
5187         if (r)
5188                 goto out;
5189         kvm_mmu_load_cr3(vcpu);
5190         kvm_x86_ops->tlb_flush(vcpu, true);
5191 out:
5192         return r;
5193 }
5194 EXPORT_SYMBOL_GPL(kvm_mmu_load);
5195 
5196 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5197 {
5198         kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5199         WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
5200         kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5201         WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
5202 }
5203 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
5204 
5205 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
5206                                   struct kvm_mmu_page *sp, u64 *spte,
5207                                   const void *new)
5208 {
5209         if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
5210                 ++vcpu->kvm->stat.mmu_pde_zapped;
5211                 return;
5212         }
5213 
5214         ++vcpu->kvm->stat.mmu_pte_updated;
5215         vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
5216 }
5217 
5218 static bool need_remote_flush(u64 old, u64 new)
5219 {
5220         if (!is_shadow_present_pte(old))
5221                 return false;
5222         if (!is_shadow_present_pte(new))
5223                 return true;
5224         if ((old ^ new) & PT64_BASE_ADDR_MASK)
5225                 return true;
5226         old ^= shadow_nx_mask;
5227         new ^= shadow_nx_mask;
5228         return (old & ~new & PT64_PERM_MASK) != 0;
5229 }
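
     /*
      * For example: making a writable spte read-only, or setting NX on a
      * previously executable spte, removes permissions that other vCPUs may
      * still have cached, so old & ~new & PT64_PERM_MASK is non-zero and a
      * remote flush is requested.  Merely granting permissions, or touching
      * bits outside PT64_PERM_MASK (e.g. accessed/dirty), needs no flush.
      * The XOR with shadow_nx_mask is what makes "setting NX" count as a
      * permission removal rather than a permission grant.
      */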
5230 
5231 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5232                                     int *bytes)
5233 {
5234         u64 gentry = 0;
5235         int r;
5236 
5237         /*
5238          * Assume that the pte write is on a page table of the same type
5239          * as the current vcpu paging mode, since we update the sptes only
5240          * when they have the same mode.
5241          */
5242         if (is_pae(vcpu) && *bytes == 4) {
5243                 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
5244                 *gpa &= ~(gpa_t)7;
5245                 *bytes = 8;
5246         }
5247 
5248         if (*bytes == 4 || *bytes == 8) {
5249                 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5250                 if (r)
5251                         gentry = 0;
5252         }
5253 
5254         return gentry;
5255 }
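
     /*
      * For example: a PAE guest updates the low half of a gpte with a
      * 4-byte write at gpa 0x1004.  The gpa is aligned down to 0x1000 and
      * 8 bytes are read atomically, so the full current gpte (old high half
      * plus new low half) is what the pte-write path gets to work with.
      */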
5256 
5257 /*
5258  * If we're seeing too many writes to a page, it may no longer be a page table,
5259  * or we may be forking, in which case it is better to unmap the page.
5260  */
5261 static bool detect_write_flooding(struct kvm_mmu_page *sp)
5262 {
5263         /*
5264          * Skip write-flooding detection for an sp whose level is 1, because
5265          * it can become unsync and then the guest page is not write-protected.
5266          */
5267         if (sp->role.level == PT_PAGE_TABLE_LEVEL)
5268                 return false;
5269 
5270         atomic_inc(&sp->write_flooding_count);
5271         return atomic_read(&sp->write_flooding_count) >= 3;
5272 }
5273 
5274 /*
5275  * Misaligned accesses are too much trouble to fix up; also, they usually
5276  * indicate a page is not used as a page table.
5277  */
5278 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5279                                     int bytes)
5280 {
5281         unsigned offset, pte_size, misaligned;
5282 
5283         pgprintk("misaligned: gpa %llx bytes %d role %x\n",
5284                  gpa, bytes, sp->role.word);
5285 
5286         offset = offset_in_page(gpa);
5287         pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
5288 
5289         /*
5290          * Sometimes the OS only writes the last byte to update status
5291          * bits; for example, Linux uses an andb instruction in clear_bit().
5292          */
5293         if (!(offset & (pte_size - 1)) && bytes == 1)
5294                 return false;
5295 
5296         misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5297         misaligned |= bytes < 4;
5298 
5299         return misaligned;
5300 }
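
     /*
      * For example, with 8-byte gptes: a 4-byte write at offset 0x4 only
      * touches the high half of one gpte, (0x4 ^ 0x7) & ~7 == 0, so it is
      * not misaligned; a 4-byte write at offset 0x6 straddles two gptes,
      * (0x6 ^ 0x9) & ~7 == 8, so it is.  Writes shorter than 4 bytes are
      * always treated as misaligned, except for the aligned single-byte
      * status-bit update handled just above.
      */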
5301 
5302 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5303 {
5304         unsigned page_offset, quadrant;
5305         u64 *spte;
5306         int level;
5307 
5308         page_offset = offset_in_page(gpa);
5309         level = sp->role.level;
5310         *nspte = 1;
5311         if (!sp->role.gpte_is_8_bytes) {
5312                 page_offset <<= 1;      /* 32->64 */
5313                 /*
5314                  * A 32-bit pde maps 4MB while the shadow pdes map
5315                  * only 2MB.  So we need to double the offset again
5316                  * and zap two pdes instead of one.
5317                  */
5318                 if (level == PT32_ROOT_LEVEL) {
5319                         page_offset &= ~7; /* kill rounding error */
5320                         page_offset <<= 1;
5321                         *nspte = 2;
5322                 }
5323                 quadrant = page_offset >> PAGE_SHIFT;
5324                 page_offset &= ~PAGE_MASK;
5325                 if (quadrant != sp->role.quadrant)
5326                         return NULL;
5327         }
5328 
5329         spte = &sp->spt[page_offset / sizeof(*spte)];
5330         return spte;
5331 }
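
     /*
      * A worked example for a non-PAE 32-bit guest (gpte_is_8_bytes == 0):
      * the guest writes gpde number 5, i.e. offset 0x14 of its page
      * directory, and sp is a root page (level == PT32_ROOT_LEVEL).
      * page_offset becomes 0x28 and then 0x50 with *nspte = 2: the 4MB gpde
      * is shadowed by the two 2MB sptes at indexes 10 and 11.  quadrant is
      * 0x50 >> PAGE_SHIFT = 0, so only the shadow page with
      * role.quadrant == 0 reports sptes for this write.
      */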
5332 
5333 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
5334                               const u8 *new, int bytes,
5335                               struct kvm_page_track_notifier_node *node)
5336 {
5337         gfn_t gfn = gpa >> PAGE_SHIFT;
5338         struct kvm_mmu_page *sp;
5339         LIST_HEAD(invalid_list);
5340         u64 entry, gentry, *spte;
5341         int npte;
5342         bool remote_flush, local_flush;
5343 
5344         /*
5345          * If we don't have indirect shadow pages, it means no page is
5346          * write-protected, so we can simply exit.
5347          */
5348         if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5349                 return;
5350 
5351         remote_flush = local_flush = false;
5352 
5353         pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5354 
5355         /*
5356          * No need to care whether the memory allocation is successful
5357          * or not, since pte prefetch is skipped if there are not
5358          * enough objects in the cache.
5359          */
5360         mmu_topup_memory_caches(vcpu);
5361 
5362         spin_lock(&vcpu->kvm->mmu_lock);
5363 
5364         gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5365 
5366         ++vcpu->kvm->stat.mmu_pte_write;
5367         kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
5368 
5369         for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
5370                 if (detect_write_misaligned(sp, gpa, bytes) ||
5371                       detect_write_flooding(sp)) {
5372                         kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5373                         ++vcpu->kvm->stat.mmu_flooded;
5374                         continue;
5375                 }
5376 
5377                 spte = get_written_sptes(sp, gpa, &npte);
5378                 if (!spte)
5379                         continue;
5380 
5381                 local_flush = true;
5382                 while (npte--) {
5383                         u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
5384 
5385                         entry = *spte;
5386                         mmu_page_zap_pte(vcpu->kvm, sp, spte);
5387                         if (gentry &&
5388                               !((sp->role.word ^ base_role)
5389                               & mmu_base_role_mask.word) && rmap_can_add(vcpu))
5390                                 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
5391                         if (need_remote_flush(entry, *spte))
5392                                 remote_flush = true;
5393                         ++spte;
5394                 }
5395         }
5396         kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
5397         kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
5398         spin_unlock(&vcpu->kvm->mmu_lock);
5399 }
5400 
5401 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
5402 {
5403         gpa_t gpa;
5404         int r;
5405 
5406         if (vcpu->arch.mmu->direct_map)
5407                 return 0;
5408 
5409         gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
5410 
5411         r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
5412 
5413         return r;
5414 }
5415 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
5416 
5417 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
5418                        void *insn, int insn_len)
5419 {
5420         int r, emulation_type = 0;
5421         bool direct = vcpu->arch.mmu->direct_map;
5422 
5423         if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
5424                 return RET_PF_RETRY;
5425 
5426         /* With shadow page tables, fault_address contains a GVA or nGPA.  */
5427         if (vcpu->arch.mmu->direct_map) {
5428                 vcpu->arch.gpa_available = true;
5429                 vcpu->arch.gpa_val = cr2_or_gpa;
5430         }
5431 
5432         r = RET_PF_INVALID;
5433         if (unlikely(error_code & PFERR_RSVD_MASK)) {
5434                 r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
5435                 if (r == RET_PF_EMULATE)
5436                         goto emulate;
5437         }
5438 
5439         if (r == RET_PF_INVALID) {
5440                 r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
5441                                           lower_32_bits(error_code), false);
5442                 WARN_ON(r == RET_PF_INVALID);
5443         }
5444 
5445         if (r == RET_PF_RETRY)
5446                 return 1;
5447         if (r < 0)
5448                 return r;
5449 
5450         /*
5451          * Before emulating the instruction, check if the error code
5452          * was due to a RO violation while translating the guest page.
5453          * This can occur when using nested virtualization with nested
5454          * paging in both guests. If true, we simply unprotect the page
5455          * and resume the guest.
5456          */
5457         if (vcpu->arch.mmu->direct_map &&
5458             (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5459                 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
5460                 return 1;
5461         }
5462 
5463         /*
5464          * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
5465          * optimistically try to just unprotect the page and let the processor
5466          * re-execute the instruction that caused the page fault.  Do not allow
5467          * retrying MMIO emulation, as it's not only pointless but could also
5468          * cause us to enter an infinite loop because the processor will keep
5469          * faulting on the non-existent MMIO address.  Retrying an instruction
5470          * from a nested guest is also pointless and dangerous as we are only
5471          * explicitly shadowing L1's page tables, i.e. unprotecting something
5472          * for L1 isn't going to magically fix whatever issue caused L2 to fail.
5473          */
5474         if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
5475                 emulation_type = EMULTYPE_ALLOW_RETRY;
5476 emulate:
5477         /*
5478          * On AMD platforms, under certain conditions insn_len may be zero on #NPF.
5479          * This can happen if a guest gets a page-fault on data access but the HW
5480          * table walker is not able to read the instruction page (e.g. the instruction
5481          * page is not present in memory). In those cases we simply restart the
5482          * guest, with the exception of AMD Erratum 1096 which is unrecoverable.
5483          */
5484         if (unlikely(insn && !insn_len)) {
5485                 if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu))
5486                         return 1;
5487         }
5488 
5489         return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
5490                                        insn_len);
5491 }
5492 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5493 
5494 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5495 {
5496         struct kvm_mmu *mmu = vcpu->arch.mmu;
5497         int i;
5498 
5499         /* INVLPG on a non-canonical address is a NOP according to the SDM.  */
5500         if (is_noncanonical_address(gva, vcpu))
5501                 return;
5502 
5503         mmu->invlpg(vcpu, gva, mmu->root_hpa);
5504 
5505         /*
5506          * INVLPG is required to invalidate any global mappings for the VA,
5507          * irrespective of PCID. Since it would take roughly the same amount
5508          * of work to determine whether any of the prev_root mappings of the VA
5509          * is marked global as to just sync it blindly, we might as well
5510          * just always sync it.
5511          *
5512          * Mappings not reachable via the current cr3 or the prev_roots will be
5513          * synced when switching to that cr3, so nothing needs to be done here
5514          * for them.
5515          */
5516         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5517                 if (VALID_PAGE(mmu->prev_roots[i].hpa))
5518                         mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5519 
5520         kvm_x86_ops->tlb_flush_gva(vcpu, gva);
5521         ++vcpu->stat.invlpg;
5522 }
5523 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5524 
5525 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5526 {
5527         struct kvm_mmu *mmu = vcpu->arch.mmu;
5528         bool tlb_flush = false;
5529         uint i;
5530 
5531         if (pcid == kvm_get_active_pcid(vcpu)) {
5532                 mmu->invlpg(vcpu, gva, mmu->root_hpa);
5533                 tlb_flush = true;
5534         }
5535 
5536         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5537                 if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
5538                     pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
5539                         mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5540                         tlb_flush = true;
5541                 }
5542         }
5543 
5544         if (tlb_flush)
5545                 kvm_x86_ops->tlb_flush_gva(vcpu, gva);
5546 
5547         ++vcpu->stat.invlpg;
5548 
5549         /*
5550          * Mappings not reachable via the current cr3 or the prev_roots will be
5551          * synced when switching to that cr3, so nothing needs to be done here
5552          * for them.
5553          */
5554 }
5555 EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
5556 
5557 void kvm_enable_tdp(void)
5558 {
5559         tdp_enabled = true;
5560 }
5561 EXPORT_SYMBOL_GPL(kvm_enable_tdp);
5562 
5563 void kvm_disable_tdp(void)
5564 {
5565         tdp_enabled = false;
5566 }
5567 EXPORT_SYMBOL_GPL(kvm_disable_tdp);
5568 
5569 
5570 /* The return value indicates whether a tlb flush on all vcpus is needed. */
5571 typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
5572 
5573 /* The caller should hold mmu-lock before calling this function. */
5574 static __always_inline bool
5575 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
5576                         slot_level_handler fn, int start_level, int end_level,
5577                         gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
5578 {
5579         struct slot_rmap_walk_iterator iterator;
5580         bool flush = false;
5581 
5582         for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5583                         end_gfn, &iterator) {
5584                 if (iterator.rmap)
5585                         flush |= fn(kvm, iterator.rmap);
5586 
5587                 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
5588                         if (flush && lock_flush_tlb) {
5589                                 kvm_flush_remote_tlbs_with_address(kvm,
5590                                                 start_gfn,
5591                                                 iterator.gfn - start_gfn + 1);
5592                                 flush = false;
5593                         }
5594                         cond_resched_lock(&kvm->mmu_lock);
5595                 }
5596         }
5597 
5598         if (flush && lock_flush_tlb) {
5599                 kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
5600                                                    end_gfn - start_gfn + 1);
5601                 flush = false;
5602         }
5603 
5604         return flush;
5605 }
5606 
5607 static __always_inline bool
5608 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5609                   slot_level_handler fn, int start_level, int end_level,
5610                   bool lock_flush_tlb)
5611 {
5612         return slot_handle_level_range(kvm, memslot, fn, start_level,
5613                         end_level, memslot->base_gfn,
5614                         memslot->base_gfn + memslot->npages - 1,
5615                         lock_flush_tlb);
5616 }
5617 
5618 static __always_inline bool
5619 slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5620                       slot_level_handler fn, bool lock_flush_tlb)
5621 {
5622         return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5623                                  PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5624 }
5625 
5626 static __always_inline bool
5627 slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5628                         slot_level_handler fn, bool lock_flush_tlb)
5629 {
5630         return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
5631                                  PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5632 }
5633 
5634 static __always_inline bool
5635 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
5636                  slot_level_handler fn, bool lock_flush_tlb)
5637 {
5638         return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5639                                  PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
5640 }
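
     /*
      * A minimal usage sketch of the helpers above (the handler below is
      * hypothetical, not taken from this file): with kvm->mmu_lock held,
      * visit every last-level rmap of a memslot and let the helper batch
      * TLB flushes and lock breaks.
      *
      *     static bool ex_touch_rmap(struct kvm *kvm,
      *                               struct kvm_rmap_head *rmap_head)
      *     {
      *             // Nothing is modified, so never request a TLB flush.
      *             return false;
      *     }
      *
      *     // with kvm->mmu_lock held:
      *     flush = slot_handle_leaf(kvm, memslot, ex_touch_rmap, false);
      *
      * Passing lock_flush_tlb == true instead lets slot_handle_level_range()
      * do the TLB flushing itself before it drops mmu_lock to reschedule.
      */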
5641 
5642 static void free_mmu_pages(struct kvm_mmu *mmu)
5643 {
5644         free_page((unsigned long)mmu->pae_root);
5645         free_page((unsigned long)mmu->lm_root);
5646 }
5647 
5648 static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
5649 {
5650         struct page *page;
5651         int i;
5652 
5653         /*
5654          * When using PAE paging, the four PDPTEs are treated as 'root' pages,
5655          * while the PDP table is a per-vCPU construct that's allocated at MMU
5656          * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
5657          * x86_64.  Therefore we need to allocate the PDP table in the first
5658          * 4GB of memory, which happens to fit the DMA32 zone.  Except for
5659          * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
5660          * skip allocating the PDP table.
5661          */
5662         if (tdp_enabled && kvm_x86_ops->get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
5663                 return 0;
5664 
5665         page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5666         if (!page)
5667                 return -ENOMEM;
5668 
5669         mmu->pae_root = page_address(page);
5670         for (i = 0; i < 4; ++i)
5671                 mmu->pae_root[i] = INVALID_PAGE;
5672 
5673         return 0;
5674 }
5675 
5676 int kvm_mmu_create(struct kvm_vcpu *vcpu)
5677 {
5678         uint i;
5679         int ret;
5680 
5681         vcpu->arch.mmu = &vcpu->arch.root_mmu;
5682         vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
5683 
5684         vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
5685         vcpu->arch.root_mmu.root_cr3 = 0;
5686         vcpu->arch.root_mmu.translate_gpa = translate_gpa;
5687         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5688                 vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5689 
5690         vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
5691         vcpu->arch.guest_mmu.root_cr3 = 0;
5692         vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
5693         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5694                 vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5695 
5696         vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
5697 
5698         ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu);
5699         if (ret)
5700                 return ret;
5701 
5702         ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu);
5703         if (ret)
5704                 goto fail_allocate_root;
5705 
5706         return ret;
5707  fail_allocate_root:
5708         free_mmu_pages(&vcpu->arch.guest_mmu);
5709         return ret;
5710 }
5711 
5712 #define BATCH_ZAP_PAGES 10
5713 static void kvm_zap_obsolete_pages(struct kvm *kvm)
5714 {
5715         struct kvm_mmu_page *sp, *node;
5716         int nr_zapped, batch = 0;
5717 
5718 restart:
5719         list_for_each_entry_safe_reverse(sp, node,
5720               &kvm->arch.active_mmu_pages, link) {
5721                 /*
5722                  * No obsolete valid page exists before a newly created page
5723                  * since active_mmu_pages is a FIFO list.