
TOMOYO Linux Cross Reference
Linux/arch/x86/kvm/x86.c


  1 // SPDX-License-Identifier: GPL-2.0-only
  2 /*
  3  * Kernel-based Virtual Machine driver for Linux
  4  *
  5  * derived from drivers/kvm/kvm_main.c
  6  *
  7  * Copyright (C) 2006 Qumranet, Inc.
  8  * Copyright (C) 2008 Qumranet, Inc.
  9  * Copyright IBM Corporation, 2008
 10  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 11  *
 12  * Authors:
 13  *   Avi Kivity   <avi@qumranet.com>
 14  *   Yaniv Kamay  <yaniv@qumranet.com>
 15  *   Amit Shah    <amit.shah@qumranet.com>
 16  *   Ben-Ami Yassour <benami@il.ibm.com>
 17  */
 18 
 19 #include <linux/kvm_host.h>
 20 #include "irq.h"
 21 #include "ioapic.h"
 22 #include "mmu.h"
 23 #include "i8254.h"
 24 #include "tss.h"
 25 #include "kvm_cache_regs.h"
 26 #include "kvm_emulate.h"
 27 #include "x86.h"
 28 #include "cpuid.h"
 29 #include "pmu.h"
 30 #include "hyperv.h"
 31 #include "lapic.h"
 32 #include "xen.h"
 33 
 34 #include <linux/clocksource.h>
 35 #include <linux/interrupt.h>
 36 #include <linux/kvm.h>
 37 #include <linux/fs.h>
 38 #include <linux/vmalloc.h>
 39 #include <linux/export.h>
 40 #include <linux/moduleparam.h>
 41 #include <linux/mman.h>
 42 #include <linux/highmem.h>
 43 #include <linux/iommu.h>
 44 #include <linux/intel-iommu.h>
 45 #include <linux/cpufreq.h>
 46 #include <linux/user-return-notifier.h>
 47 #include <linux/srcu.h>
 48 #include <linux/slab.h>
 49 #include <linux/perf_event.h>
 50 #include <linux/uaccess.h>
 51 #include <linux/hash.h>
 52 #include <linux/pci.h>
 53 #include <linux/timekeeper_internal.h>
 54 #include <linux/pvclock_gtod.h>
 55 #include <linux/kvm_irqfd.h>
 56 #include <linux/irqbypass.h>
 57 #include <linux/sched/stat.h>
 58 #include <linux/sched/isolation.h>
 59 #include <linux/mem_encrypt.h>
 60 #include <linux/entry-kvm.h>
 61 
 62 #include <trace/events/kvm.h>
 63 
 64 #include <asm/debugreg.h>
 65 #include <asm/msr.h>
 66 #include <asm/desc.h>
 67 #include <asm/mce.h>
 68 #include <linux/kernel_stat.h>
 69 #include <asm/fpu/internal.h> /* Ugh! */
 70 #include <asm/pvclock.h>
 71 #include <asm/div64.h>
 72 #include <asm/irq_remapping.h>
 73 #include <asm/mshyperv.h>
 74 #include <asm/hypervisor.h>
 75 #include <asm/tlbflush.h>
 76 #include <asm/intel_pt.h>
 77 #include <asm/emulate_prefix.h>
 78 #include <clocksource/hyperv_timer.h>
 79 
 80 #define CREATE_TRACE_POINTS
 81 #include "trace.h"
 82 
 83 #define MAX_IO_MSRS 256
 84 #define KVM_MAX_MCE_BANKS 32
 85 u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
 86 EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
 87 
 88 #define emul_to_vcpu(ctxt) \
 89         ((struct kvm_vcpu *)(ctxt)->vcpu)
 90 
 91 /* EFER defaults:
 92  * - enable syscall per default because it's emulated by KVM
 93  * - enable LME and LMA per default on 64 bit KVM
 94  */
 95 #ifdef CONFIG_X86_64
 96 static
 97 u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
 98 #else
 99 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
100 #endif
101 
102 static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
103 
104 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
105                                     KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
106 
107 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
108 static void process_nmi(struct kvm_vcpu *vcpu);
109 static void process_smi(struct kvm_vcpu *vcpu);
110 static void enter_smm(struct kvm_vcpu *vcpu);
111 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
112 static void store_regs(struct kvm_vcpu *vcpu);
113 static int sync_regs(struct kvm_vcpu *vcpu);
114 
115 struct kvm_x86_ops kvm_x86_ops __read_mostly;
116 EXPORT_SYMBOL_GPL(kvm_x86_ops);
117 
118 #define KVM_X86_OP(func)                                             \
119         DEFINE_STATIC_CALL_NULL(kvm_x86_##func,                      \
120                                 *(((struct kvm_x86_ops *)0)->func));
121 #define KVM_X86_OP_NULL KVM_X86_OP
122 #include <asm/kvm-x86-ops.h>
123 EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
124 EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
125 EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current);
126 
127 static bool __read_mostly ignore_msrs = 0;
128 module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
129 
130 bool __read_mostly report_ignored_msrs = true;
131 module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
132 EXPORT_SYMBOL_GPL(report_ignored_msrs);
133 
134 unsigned int min_timer_period_us = 200;
135 module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
136 
137 static bool __read_mostly kvmclock_periodic_sync = true;
138 module_param(kvmclock_periodic_sync, bool, S_IRUGO);
139 
140 bool __read_mostly kvm_has_tsc_control;
141 EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
142 u32  __read_mostly kvm_max_guest_tsc_khz;
143 EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
144 u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
145 EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
146 u64  __read_mostly kvm_max_tsc_scaling_ratio;
147 EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
148 u64 __read_mostly kvm_default_tsc_scaling_ratio;
149 EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
150 bool __read_mostly kvm_has_bus_lock_exit;
151 EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit);
152 
153 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
154 static u32 __read_mostly tsc_tolerance_ppm = 250;
155 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
156 
157 /*
158  * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
159  * adaptive tuning starting from default advancement of 1000ns.  '0' disables
160  * advancement entirely.  Any other value is used as-is and disables adaptive
161  * tuning, i.e. allows privileged userspace to set an exact advancement time.
162  */
163 static int __read_mostly lapic_timer_advance_ns = -1;
164 module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
165 
166 static bool __read_mostly vector_hashing = true;
167 module_param(vector_hashing, bool, S_IRUGO);
168 
169 bool __read_mostly enable_vmware_backdoor = false;
170 module_param(enable_vmware_backdoor, bool, S_IRUGO);
171 EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
172 
173 static bool __read_mostly force_emulation_prefix = false;
174 module_param(force_emulation_prefix, bool, S_IRUGO);
175 
176 int __read_mostly pi_inject_timer = -1;
177 module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
178 
179 /*
180  * Restoring the host value for MSRs that are only consumed when running in
181  * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
182  * returns to userspace, i.e. the kernel can run with the guest's value.
183  */
184 #define KVM_MAX_NR_USER_RETURN_MSRS 16
185 
186 struct kvm_user_return_msrs_global {
187         int nr;
188         u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
189 };
190 
191 struct kvm_user_return_msrs {
192         struct user_return_notifier urn;
193         bool registered;
194         struct kvm_user_return_msr_values {
195                 u64 host;
196                 u64 curr;
197         } values[KVM_MAX_NR_USER_RETURN_MSRS];
198 };
199 
200 static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
201 static struct kvm_user_return_msrs __percpu *user_return_msrs;
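/*
 * Illustrative flow of this machinery (summary, hedged): vendor code
 * typically defines a slot once via kvm_define_user_return_msr(), loads
 * guest values with kvm_set_user_return_msr() before entering the guest,
 * and the user-return notifier (kvm_on_user_return()) then restores the
 * host values lazily, on the next return to userspace rather than on
 * every VM-exit.
 */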
202 
203 #define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
204                                 | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
205                                 | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
206                                 | XFEATURE_MASK_PKRU)
207 
208 u64 __read_mostly host_efer;
209 EXPORT_SYMBOL_GPL(host_efer);
210 
211 bool __read_mostly allow_smaller_maxphyaddr = 0;
212 EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
213 
214 u64 __read_mostly host_xss;
215 EXPORT_SYMBOL_GPL(host_xss);
216 u64 __read_mostly supported_xss;
217 EXPORT_SYMBOL_GPL(supported_xss);
218 
219 struct kvm_stats_debugfs_item debugfs_entries[] = {
220         VCPU_STAT("pf_fixed", pf_fixed),
221         VCPU_STAT("pf_guest", pf_guest),
222         VCPU_STAT("tlb_flush", tlb_flush),
223         VCPU_STAT("invlpg", invlpg),
224         VCPU_STAT("exits", exits),
225         VCPU_STAT("io_exits", io_exits),
226         VCPU_STAT("mmio_exits", mmio_exits),
227         VCPU_STAT("signal_exits", signal_exits),
228         VCPU_STAT("irq_window", irq_window_exits),
229         VCPU_STAT("nmi_window", nmi_window_exits),
230         VCPU_STAT("halt_exits", halt_exits),
231         VCPU_STAT("halt_successful_poll", halt_successful_poll),
232         VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
233         VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
234         VCPU_STAT("halt_wakeup", halt_wakeup),
235         VCPU_STAT("hypercalls", hypercalls),
236         VCPU_STAT("request_irq", request_irq_exits),
237         VCPU_STAT("irq_exits", irq_exits),
238         VCPU_STAT("host_state_reload", host_state_reload),
239         VCPU_STAT("fpu_reload", fpu_reload),
240         VCPU_STAT("insn_emulation", insn_emulation),
241         VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
242         VCPU_STAT("irq_injections", irq_injections),
243         VCPU_STAT("nmi_injections", nmi_injections),
244         VCPU_STAT("req_event", req_event),
245         VCPU_STAT("l1d_flush", l1d_flush),
246         VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
247         VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
248         VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
249         VM_STAT("mmu_pte_write", mmu_pte_write),
250         VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
251         VM_STAT("mmu_flooded", mmu_flooded),
252         VM_STAT("mmu_recycled", mmu_recycled),
253         VM_STAT("mmu_cache_miss", mmu_cache_miss),
254         VM_STAT("mmu_unsync", mmu_unsync),
255         VM_STAT("remote_tlb_flush", remote_tlb_flush),
256         VM_STAT("largepages", lpages, .mode = 0444),
257         VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
258         VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
259         { NULL }
260 };
261 
262 u64 __read_mostly host_xcr0;
263 u64 __read_mostly supported_xcr0;
264 EXPORT_SYMBOL_GPL(supported_xcr0);
265 
266 static struct kmem_cache *x86_fpu_cache;
267 
268 static struct kmem_cache *x86_emulator_cache;
269 
270 /*
271  * When called, it means the previous get/set msr reached an invalid msr.
272  * Return true if we want to ignore/silence this failed msr access.
273  */
274 static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write)
275 {
276         const char *op = write ? "wrmsr" : "rdmsr";
277 
278         if (ignore_msrs) {
279                 if (report_ignored_msrs)
280                         kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
281                                       op, msr, data);
282                 /* Mask the error */
283                 return true;
284         } else {
285                 kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
286                                       op, msr, data);
287                 return false;
288         }
289 }
290 
291 static struct kmem_cache *kvm_alloc_emulator_cache(void)
292 {
293         unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
294         unsigned int size = sizeof(struct x86_emulate_ctxt);
295 
296         return kmem_cache_create_usercopy("x86_emulator", size,
297                                           __alignof__(struct x86_emulate_ctxt),
298                                           SLAB_ACCOUNT, useroffset,
299                                           size - useroffset, NULL);
300 }
301 
302 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
303 
304 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
305 {
306         int i;
307         for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
308                 vcpu->arch.apf.gfns[i] = ~0;
309 }
310 
311 static void kvm_on_user_return(struct user_return_notifier *urn)
312 {
313         unsigned slot;
314         struct kvm_user_return_msrs *msrs
315                 = container_of(urn, struct kvm_user_return_msrs, urn);
316         struct kvm_user_return_msr_values *values;
317         unsigned long flags;
318 
319         /*
320          * Disabling irqs at this point since the following code could be
321          * interrupted and executed through kvm_arch_hardware_disable()
322          */
323         local_irq_save(flags);
324         if (msrs->registered) {
325                 msrs->registered = false;
326                 user_return_notifier_unregister(urn);
327         }
328         local_irq_restore(flags);
329         for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
330                 values = &msrs->values[slot];
331                 if (values->host != values->curr) {
332                         wrmsrl(user_return_msrs_global.msrs[slot], values->host);
333                         values->curr = values->host;
334                 }
335         }
336 }
337 
338 int kvm_probe_user_return_msr(u32 msr)
339 {
340         u64 val;
341         int ret;
342 
343         preempt_disable();
344         ret = rdmsrl_safe(msr, &val);
345         if (ret)
346                 goto out;
347         ret = wrmsrl_safe(msr, val);
348 out:
349         preempt_enable();
350         return ret;
351 }
352 EXPORT_SYMBOL_GPL(kvm_probe_user_return_msr);
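/*
 * Note that the probe above runs the rdmsr/wrmsr pair with preemption
 * disabled so both accesses hit the same physical CPU, and it writes back
 * the value it just read, leaving the host MSR unchanged while confirming
 * that the MSR exists and is writable.
 */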
353 
354 void kvm_define_user_return_msr(unsigned slot, u32 msr)
355 {
356         BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
357         user_return_msrs_global.msrs[slot] = msr;
358         if (slot >= user_return_msrs_global.nr)
359                 user_return_msrs_global.nr = slot + 1;
360 }
361 EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
362 
363 static void kvm_user_return_msr_cpu_online(void)
364 {
365         unsigned int cpu = smp_processor_id();
366         struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
367         u64 value;
368         int i;
369 
370         for (i = 0; i < user_return_msrs_global.nr; ++i) {
371                 rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
372                 msrs->values[i].host = value;
373                 msrs->values[i].curr = value;
374         }
375 }
376 
377 int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
378 {
379         unsigned int cpu = smp_processor_id();
380         struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
381         int err;
382 
383         value = (value & mask) | (msrs->values[slot].host & ~mask);
384         if (value == msrs->values[slot].curr)
385                 return 0;
386         err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
387         if (err)
388                 return 1;
389 
390         msrs->values[slot].curr = value;
391         if (!msrs->registered) {
392                 msrs->urn.on_user_return = kvm_on_user_return;
393                 user_return_notifier_register(&msrs->urn);
394                 msrs->registered = true;
395         }
396         return 0;
397 }
398 EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
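/*
 * The "mask" argument selects which bits of "value" come from the caller;
 * bits outside the mask keep the host's value.  A mask of -1ull therefore
 * replaces the whole MSR, while a narrower mask updates only selected
 * feature bits.
 */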
399 
400 static void drop_user_return_notifiers(void)
401 {
402         unsigned int cpu = smp_processor_id();
403         struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
404 
405         if (msrs->registered)
406                 kvm_on_user_return(&msrs->urn);
407 }
408 
409 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
410 {
411         return vcpu->arch.apic_base;
412 }
413 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
414 
415 enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
416 {
417         return kvm_apic_mode(kvm_get_apic_base(vcpu));
418 }
419 EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
420 
421 int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
422 {
423         enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
424         enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
425         u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff |
426                 (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
427 
428         if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
429                 return 1;
430         if (!msr_info->host_initiated) {
431                 if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
432                         return 1;
433                 if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
434                         return 1;
435         }
436 
437         kvm_lapic_set_base(vcpu, msr_info->data);
438         kvm_recalculate_apic_map(vcpu->kvm);
439         return 0;
440 }
441 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
442 
443 asmlinkage __visible noinstr void kvm_spurious_fault(void)
444 {
445         /* Fault while not rebooting.  We want the trace. */
446         BUG_ON(!kvm_rebooting);
447 }
448 EXPORT_SYMBOL_GPL(kvm_spurious_fault);
449 
450 #define EXCPT_BENIGN            0
451 #define EXCPT_CONTRIBUTORY      1
452 #define EXCPT_PF                2
453 
454 static int exception_class(int vector)
455 {
456         switch (vector) {
457         case PF_VECTOR:
458                 return EXCPT_PF;
459         case DE_VECTOR:
460         case TS_VECTOR:
461         case NP_VECTOR:
462         case SS_VECTOR:
463         case GP_VECTOR:
464                 return EXCPT_CONTRIBUTORY;
465         default:
466                 break;
467         }
468         return EXCPT_BENIGN;
469 }
470 
471 #define EXCPT_FAULT             0
472 #define EXCPT_TRAP              1
473 #define EXCPT_ABORT             2
474 #define EXCPT_INTERRUPT         3
475 
476 static int exception_type(int vector)
477 {
478         unsigned int mask;
479 
480         if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
481                 return EXCPT_INTERRUPT;
482 
483         mask = 1 << vector;
484 
485         /* #DB is trap, as instruction watchpoints are handled elsewhere */
486         if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
487                 return EXCPT_TRAP;
488 
489         if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
490                 return EXCPT_ABORT;
491 
492         /* Reserved exceptions will result in fault */
493         return EXCPT_FAULT;
494 }
495 
496 void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
497 {
498         unsigned nr = vcpu->arch.exception.nr;
499         bool has_payload = vcpu->arch.exception.has_payload;
500         unsigned long payload = vcpu->arch.exception.payload;
501 
502         if (!has_payload)
503                 return;
504 
505         switch (nr) {
506         case DB_VECTOR:
507                 /*
508                  * "Certain debug exceptions may clear bit 0-3.  The
509                  * remaining contents of the DR6 register are never
510                  * cleared by the processor".
511                  */
512                 vcpu->arch.dr6 &= ~DR_TRAP_BITS;
513                 /*
514                  * In order to reflect the #DB exception payload in guest
515                  * dr6, three components need to be considered: active low
516                  * bit, FIXED_1 bits and active high bits (e.g. DR6_BD,
517                  * DR6_BS and DR6_BT)
518                  * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits.
519                  * In the target guest dr6:
520                  * FIXED_1 bits should always be set.
521                  * Active low bits should be cleared if 1-setting in payload.
522                  * Active high bits should be set if 1-setting in payload.
523                  *
524                  * Note, the payload is compatible with the pending debug
525                  * exceptions/exit qualification under VMX, in which the
526                  * active_low bits are encoded as active high in the payload.
527                  * So they need to be flipped for DR6.
528                  */
529                 vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
530                 vcpu->arch.dr6 |= payload;
531                 vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW;
532 
533                 /*
534                  * The #DB payload is defined as compatible with the 'pending
535                  * debug exceptions' field under VMX, not DR6. While bit 12 is
536                  * defined in the 'pending debug exceptions' field (enabled
537                  * breakpoint), it is reserved and must be zero in DR6.
538                  */
539                 vcpu->arch.dr6 &= ~BIT(12);
540                 break;
541         case PF_VECTOR:
542                 vcpu->arch.cr2 = payload;
543                 break;
544         }
545 
546         vcpu->arch.exception.has_payload = false;
547         vcpu->arch.exception.payload = 0;
548 }
549 EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
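/*
 * Illustrative walk-through of the #DB case above, assuming the usual x86
 * definitions (DR6_ACTIVE_LOW = DR6_FIXED_1 | DR6_RTM = 0xffff0ff0,
 * DR6_BS = bit 14, DR6_RTM = bit 16).  For a single-step payload of DR6_BS:
 *
 *   dr6 &= ~DR_TRAP_BITS;             B0-B3 cleared
 *   dr6 |= DR6_ACTIVE_LOW;            FIXED_1 and RTM forced to 1
 *   dr6 |= payload;                   BS set
 *   dr6 ^= payload & DR6_ACTIVE_LOW;  no-op, BS is active high
 *
 * giving 0xffff4ff0.  If the payload instead flags an RTM #DB (bit 16 set,
 * active high in the payload encoding), the final XOR flips DR6[16] back
 * to 0, matching DR6's active-low convention for that bit.
 */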
550 
551 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
552                 unsigned nr, bool has_error, u32 error_code,
553                 bool has_payload, unsigned long payload, bool reinject)
554 {
555         u32 prev_nr;
556         int class1, class2;
557 
558         kvm_make_request(KVM_REQ_EVENT, vcpu);
559 
560         if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
561         queue:
562                 if (has_error && !is_protmode(vcpu))
563                         has_error = false;
564                 if (reinject) {
565                         /*
566                          * On vmentry, vcpu->arch.exception.pending is only
567                          * true if an event injection was blocked by
568                          * nested_run_pending.  In that case, however,
569                          * vcpu_enter_guest requests an immediate exit,
570                          * and the guest shouldn't proceed far enough to
571                          * need reinjection.
572                          */
573                         WARN_ON_ONCE(vcpu->arch.exception.pending);
574                         vcpu->arch.exception.injected = true;
575                         if (WARN_ON_ONCE(has_payload)) {
576                                 /*
577                                  * A reinjected event has already
578                                  * delivered its payload.
579                                  */
580                                 has_payload = false;
581                                 payload = 0;
582                         }
583                 } else {
584                         vcpu->arch.exception.pending = true;
585                         vcpu->arch.exception.injected = false;
586                 }
587                 vcpu->arch.exception.has_error_code = has_error;
588                 vcpu->arch.exception.nr = nr;
589                 vcpu->arch.exception.error_code = error_code;
590                 vcpu->arch.exception.has_payload = has_payload;
591                 vcpu->arch.exception.payload = payload;
592                 if (!is_guest_mode(vcpu))
593                         kvm_deliver_exception_payload(vcpu);
594                 return;
595         }
596 
597         /* An exception is already pending/injected; check for escalation to #DF. */
598         prev_nr = vcpu->arch.exception.nr;
599         if (prev_nr == DF_VECTOR) {
600                 /* triple fault -> shutdown */
601                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
602                 return;
603         }
604         class1 = exception_class(prev_nr);
605         class2 = exception_class(nr);
606         if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
607                 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
608                 /*
609                  * Generate double fault per SDM Table 5-5.  Set
610                  * exception.pending = true so that the double fault
611                  * can trigger a nested vmexit.
612                  */
613                 vcpu->arch.exception.pending = true;
614                 vcpu->arch.exception.injected = false;
615                 vcpu->arch.exception.has_error_code = true;
616                 vcpu->arch.exception.nr = DF_VECTOR;
617                 vcpu->arch.exception.error_code = 0;
618                 vcpu->arch.exception.has_payload = false;
619                 vcpu->arch.exception.payload = 0;
620         } else
621                 /* replace previous exception with a new one in the hope
622                    that instruction re-execution will regenerate the lost
623                    exception */
624                 goto queue;
625 }
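/*
 * Illustrative cases for the merging logic above: a second contributory
 * exception (#DE, #TS, #NP, #SS, #GP) while one is already pending
 * escalates to #DF, as does any contributory exception or #PF while a #PF
 * is pending; a benign exception such as #DB simply takes the "queue" path
 * and replaces the pending one.
 */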
626 
627 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
628 {
629         kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
630 }
631 EXPORT_SYMBOL_GPL(kvm_queue_exception);
632 
633 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
634 {
635         kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
636 }
637 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
638 
639 void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
640                            unsigned long payload)
641 {
642         kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
643 }
644 EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
645 
646 static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
647                                     u32 error_code, unsigned long payload)
648 {
649         kvm_multiple_exception(vcpu, nr, true, error_code,
650                                true, payload, false);
651 }
652 
653 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
654 {
655         if (err)
656                 kvm_inject_gp(vcpu, 0);
657         else
658                 return kvm_skip_emulated_instruction(vcpu);
659 
660         return 1;
661 }
662 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
663 
664 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
665 {
666         ++vcpu->stat.pf_guest;
667         vcpu->arch.exception.nested_apf =
668                 is_guest_mode(vcpu) && fault->async_page_fault;
669         if (vcpu->arch.exception.nested_apf) {
670                 vcpu->arch.apf.nested_apf_token = fault->address;
671                 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
672         } else {
673                 kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
674                                         fault->address);
675         }
676 }
677 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
678 
679 bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
680                                     struct x86_exception *fault)
681 {
682         struct kvm_mmu *fault_mmu;
683         WARN_ON_ONCE(fault->vector != PF_VECTOR);
684 
685         fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
686                                                vcpu->arch.walk_mmu;
687 
688         /*
689          * Invalidate the TLB entry for the faulting address, if it exists,
690          * else the access will fault indefinitely (and to emulate hardware).
691          */
692         if ((fault->error_code & PFERR_PRESENT_MASK) &&
693             !(fault->error_code & PFERR_RSVD_MASK))
694                 kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
695                                        fault_mmu->root_hpa);
696 
697         fault_mmu->inject_page_fault(vcpu, fault);
698         return fault->nested_page_fault;
699 }
700 EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
701 
702 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
703 {
704         atomic_inc(&vcpu->arch.nmi_queued);
705         kvm_make_request(KVM_REQ_NMI, vcpu);
706 }
707 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
708 
709 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
710 {
711         kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
712 }
713 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
714 
715 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
716 {
717         kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
718 }
719 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
720 
721 /*
722  * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
723  * a #GP and return false.
724  */
725 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
726 {
727         if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl)
728                 return true;
729         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
730         return false;
731 }
732 EXPORT_SYMBOL_GPL(kvm_require_cpl);
733 
734 bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
735 {
736         if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
737                 return true;
738 
739         kvm_queue_exception(vcpu, UD_VECTOR);
740         return false;
741 }
742 EXPORT_SYMBOL_GPL(kvm_require_dr);
743 
744 /*
745  * This function will be used to read from the physical memory of the currently
746  * running guest. The difference to kvm_vcpu_read_guest_page is that this function
747  * can read from guest physical or from the guest's guest physical memory.
748  */
749 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
750                             gfn_t ngfn, void *data, int offset, int len,
751                             u32 access)
752 {
753         struct x86_exception exception;
754         gfn_t real_gfn;
755         gpa_t ngpa;
756 
757         ngpa     = gfn_to_gpa(ngfn);
758         real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
759         if (real_gfn == UNMAPPED_GVA)
760                 return -EFAULT;
761 
762         real_gfn = gpa_to_gfn(real_gfn);
763 
764         return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
765 }
766 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
767 
768 static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
769                                void *data, int offset, int len, u32 access)
770 {
771         return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
772                                        data, offset, len, access);
773 }
774 
775 static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
776 {
777         return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
778 }
779 
780 /*
781  * Load the pae pdptrs.  Return 1 if they are all valid, 0 otherwise.
782  */
783 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
784 {
785         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
786         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
787         int i;
788         int ret;
789         u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
790 
791         ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
792                                       offset * sizeof(u64), sizeof(pdpte),
793                                       PFERR_USER_MASK|PFERR_WRITE_MASK);
794         if (ret < 0) {
795                 ret = 0;
796                 goto out;
797         }
798         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
799                 if ((pdpte[i] & PT_PRESENT_MASK) &&
800                     (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
801                         ret = 0;
802                         goto out;
803                 }
804         }
805         ret = 1;
806 
807         memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
808         kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
809 
810 out:
811 
812         return ret;
813 }
814 EXPORT_SYMBOL_GPL(load_pdptrs);
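/*
 * Illustrative example of the offset math above: in PAE mode, CR3[31:5]
 * holds the 32-byte-aligned address of the PDPT.  For cr3 = 0x12345a60,
 * ((cr3 & (PAGE_SIZE-1)) >> 5) << 2 = 0x14c, i.e. an index in u64 units;
 * the "offset * sizeof(u64)" in the read above turns that back into byte
 * offset 0xa60 within the page, which is exactly cr3 & 0xfe0.
 */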
815 
816 bool pdptrs_changed(struct kvm_vcpu *vcpu)
817 {
818         u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
819         int offset;
820         gfn_t gfn;
821         int r;
822 
823         if (!is_pae_paging(vcpu))
824                 return false;
825 
826         if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
827                 return true;
828 
829         gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
830         offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
831         r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
832                                        PFERR_USER_MASK | PFERR_WRITE_MASK);
833         if (r < 0)
834                 return true;
835 
836         return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
837 }
838 EXPORT_SYMBOL_GPL(pdptrs_changed);
839 
840 void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
841 {
842         unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
843 
844         if ((cr0 ^ old_cr0) & X86_CR0_PG) {
845                 kvm_clear_async_pf_completion_queue(vcpu);
846                 kvm_async_pf_hash_reset(vcpu);
847         }
848 
849         if ((cr0 ^ old_cr0) & update_bits)
850                 kvm_mmu_reset_context(vcpu);
851 
852         if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
853             kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
854             !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
855                 kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
856 }
857 EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
858 
859 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
860 {
861         unsigned long old_cr0 = kvm_read_cr0(vcpu);
862         unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
863 
864         cr0 |= X86_CR0_ET;
865 
866 #ifdef CONFIG_X86_64
867         if (cr0 & 0xffffffff00000000UL)
868                 return 1;
869 #endif
870 
871         cr0 &= ~CR0_RESERVED_BITS;
872 
873         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
874                 return 1;
875 
876         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
877                 return 1;
878 
879 #ifdef CONFIG_X86_64
880         if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
881             (cr0 & X86_CR0_PG)) {
882                 int cs_db, cs_l;
883 
884                 if (!is_pae(vcpu))
885                         return 1;
886                 static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
887                 if (cs_l)
888                         return 1;
889         }
890 #endif
891         if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
892             is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
893             !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
894                 return 1;
895 
896         if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
897                 return 1;
898 
899         static_call(kvm_x86_set_cr0)(vcpu, cr0);
900 
901         kvm_post_set_cr0(vcpu, old_cr0, cr0);
902 
903         return 0;
904 }
905 EXPORT_SYMBOL_GPL(kvm_set_cr0);
906 
907 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
908 {
909         (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
910 }
911 EXPORT_SYMBOL_GPL(kvm_lmsw);
912 
913 void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
914 {
915         if (vcpu->arch.guest_state_protected)
916                 return;
917 
918         if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
919 
920                 if (vcpu->arch.xcr0 != host_xcr0)
921                         xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
922 
923                 if (vcpu->arch.xsaves_enabled &&
924                     vcpu->arch.ia32_xss != host_xss)
925                         wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
926         }
927 
928         if (static_cpu_has(X86_FEATURE_PKU) &&
929             (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
930              (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
931             vcpu->arch.pkru != vcpu->arch.host_pkru)
932                 __write_pkru(vcpu->arch.pkru);
933 }
934 EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
935 
936 void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
937 {
938         if (vcpu->arch.guest_state_protected)
939                 return;
940 
941         if (static_cpu_has(X86_FEATURE_PKU) &&
942             (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
943              (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
944                 vcpu->arch.pkru = rdpkru();
945                 if (vcpu->arch.pkru != vcpu->arch.host_pkru)
946                         __write_pkru(vcpu->arch.host_pkru);
947         }
948 
949         if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
950 
951                 if (vcpu->arch.xcr0 != host_xcr0)
952                         xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
953 
954                 if (vcpu->arch.xsaves_enabled &&
955                     vcpu->arch.ia32_xss != host_xss)
956                         wrmsrl(MSR_IA32_XSS, host_xss);
957         }
958 
959 }
960 EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
961 
962 static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
963 {
964         u64 xcr0 = xcr;
965         u64 old_xcr0 = vcpu->arch.xcr0;
966         u64 valid_bits;
967 
968         /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
969         if (index != XCR_XFEATURE_ENABLED_MASK)
970                 return 1;
971         if (!(xcr0 & XFEATURE_MASK_FP))
972                 return 1;
973         if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
974                 return 1;
975 
976         /*
977          * Do not allow the guest to set bits that we do not support
978          * saving.  However, xcr0 bit 0 is always set, even if the
979          * emulated CPU does not support XSAVE (see fx_init).
980          */
981         valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
982         if (xcr0 & ~valid_bits)
983                 return 1;
984 
985         if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
986             (!(xcr0 & XFEATURE_MASK_BNDCSR)))
987                 return 1;
988 
989         if (xcr0 & XFEATURE_MASK_AVX512) {
990                 if (!(xcr0 & XFEATURE_MASK_YMM))
991                         return 1;
992                 if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
993                         return 1;
994         }
995         vcpu->arch.xcr0 = xcr0;
996 
997         if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
998                 kvm_update_cpuid_runtime(vcpu);
999         return 0;
1000 }
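/*
 * Illustrative XCR0 values against the checks above: 0x7 (FP|SSE|YMM) is
 * accepted if the guest's CPUID allows AVX; 0x6 is rejected because bit 0
 * (FP) must always be set; 0x5 (YMM without SSE) is rejected by the
 * YMM-requires-SSE check; 0xf (BNDREGS without BNDCSR) trips the MPX
 * pairing check; and setting only part of XFEATURE_MASK_AVX512 (e.g. just
 * the opmask bit) is rejected as well.
 */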
1001 
1002 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
1003 {
1004         if (static_call(kvm_x86_get_cpl)(vcpu) == 0)
1005                 return __kvm_set_xcr(vcpu, index, xcr);
1006 
1007         return 1;
1008 }
1009 EXPORT_SYMBOL_GPL(kvm_set_xcr);
1010 
1011 bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1012 {
1013         if (cr4 & cr4_reserved_bits)
1014                 return false;
1015 
1016         if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
1017                 return false;
1018 
1019         return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
1020 }
1021 EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
1022 
1023 void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
1024 {
1025         unsigned long mmu_role_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
1026                                       X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
1027 
1028         if (((cr4 ^ old_cr4) & mmu_role_bits) ||
1029             (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
1030                 kvm_mmu_reset_context(vcpu);
1031 }
1032 EXPORT_SYMBOL_GPL(kvm_post_set_cr4);
1033 
1034 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1035 {
1036         unsigned long old_cr4 = kvm_read_cr4(vcpu);
1037         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
1038                                    X86_CR4_SMEP;
1039 
1040         if (!kvm_is_valid_cr4(vcpu, cr4))
1041                 return 1;
1042 
1043         if (is_long_mode(vcpu)) {
1044                 if (!(cr4 & X86_CR4_PAE))
1045                         return 1;
1046                 if ((cr4 ^ old_cr4) & X86_CR4_LA57)
1047                         return 1;
1048         } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
1049                    && ((cr4 ^ old_cr4) & pdptr_bits)
1050                    && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
1051                                    kvm_read_cr3(vcpu)))
1052                 return 1;
1053 
1054         if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
1055                 if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
1056                         return 1;
1057 
1058                 /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
1059                 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
1060                         return 1;
1061         }
1062 
1063         static_call(kvm_x86_set_cr4)(vcpu, cr4);
1064 
1065         kvm_post_set_cr4(vcpu, old_cr4, cr4);
1066 
1067         return 0;
1068 }
1069 EXPORT_SYMBOL_GPL(kvm_set_cr4);
1070 
1071 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1072 {
1073         bool skip_tlb_flush = false;
1074 #ifdef CONFIG_X86_64
1075         bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
1076 
1077         if (pcid_enabled) {
1078                 skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
1079                 cr3 &= ~X86_CR3_PCID_NOFLUSH;
1080         }
1081 #endif
1082 
1083         if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
1084                 if (!skip_tlb_flush) {
1085                         kvm_mmu_sync_roots(vcpu);
1086                         kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
1087                 }
1088                 return 0;
1089         }
1090 
1091         /*
1092          * Do not condition the GPA check on long mode, this helper is used to
1093          * stuff CR3, e.g. for RSM emulation, and there is no guarantee that
1094          * the current vCPU mode is accurate.
1095          */
1096         if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
1097                 return 1;
1098 
1099         if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
1100                 return 1;
1101 
1102         kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
1103         vcpu->arch.cr3 = cr3;
1104         kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
1105 
1106         return 0;
1107 }
1108 EXPORT_SYMBOL_GPL(kvm_set_cr3);
1109 
1110 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
1111 {
1112         if (cr8 & CR8_RESERVED_BITS)
1113                 return 1;
1114         if (lapic_in_kernel(vcpu))
1115                 kvm_lapic_set_tpr(vcpu, cr8);
1116         else
1117                 vcpu->arch.cr8 = cr8;
1118         return 0;
1119 }
1120 EXPORT_SYMBOL_GPL(kvm_set_cr8);
1121 
1122 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
1123 {
1124         if (lapic_in_kernel(vcpu))
1125                 return kvm_lapic_get_cr8(vcpu);
1126         else
1127                 return vcpu->arch.cr8;
1128 }
1129 EXPORT_SYMBOL_GPL(kvm_get_cr8);
1130 
1131 static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
1132 {
1133         int i;
1134 
1135         if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1136                 for (i = 0; i < KVM_NR_DB_REGS; i++)
1137                         vcpu->arch.eff_db[i] = vcpu->arch.db[i];
1138                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
1139         }
1140 }
1141 
1142 void kvm_update_dr7(struct kvm_vcpu *vcpu)
1143 {
1144         unsigned long dr7;
1145 
1146         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1147                 dr7 = vcpu->arch.guest_debug_dr7;
1148         else
1149                 dr7 = vcpu->arch.dr7;
1150         static_call(kvm_x86_set_dr7)(vcpu, dr7);
1151         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
1152         if (dr7 & DR7_BP_EN_MASK)
1153                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
1154 }
1155 EXPORT_SYMBOL_GPL(kvm_update_dr7);
1156 
1157 static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
1158 {
1159         u64 fixed = DR6_FIXED_1;
1160 
1161         if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
1162                 fixed |= DR6_RTM;
1163         return fixed;
1164 }
1165 
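/*
 * DR4 and DR5 reach the accessors below only when CR4.DE is clear, in
 * which case they alias DR6 and DR7 (callers use kvm_require_dr() to raise
 * #UD for the CR4.DE=1 case); hence the shared "case 4/6" and "case 5/7"
 * handling.
 */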
1166 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
1167 {
1168         size_t size = ARRAY_SIZE(vcpu->arch.db);
1169 
1170         switch (dr) {
1171         case 0 ... 3:
1172                 vcpu->arch.db[array_index_nospec(dr, size)] = val;
1173                 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1174                         vcpu->arch.eff_db[dr] = val;
1175                 break;
1176         case 4:
1177         case 6:
1178                 if (!kvm_dr6_valid(val))
1179                         return 1; /* #GP */
1180                 vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
1181                 break;
1182         case 5:
1183         default: /* 7 */
1184                 if (!kvm_dr7_valid(val))
1185                         return 1; /* #GP */
1186                 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
1187                 kvm_update_dr7(vcpu);
1188                 break;
1189         }
1190 
1191         return 0;
1192 }
1193 EXPORT_SYMBOL_GPL(kvm_set_dr);
1194 
1195 void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
1196 {
1197         size_t size = ARRAY_SIZE(vcpu->arch.db);
1198 
1199         switch (dr) {
1200         case 0 ... 3:
1201                 *val = vcpu->arch.db[array_index_nospec(dr, size)];
1202                 break;
1203         case 4:
1204         case 6:
1205                 *val = vcpu->arch.dr6;
1206                 break;
1207         case 5:
1208         default: /* 7 */
1209                 *val = vcpu->arch.dr7;
1210                 break;
1211         }
1212 }
1213 EXPORT_SYMBOL_GPL(kvm_get_dr);
1214 
1215 bool kvm_rdpmc(struct kvm_vcpu *vcpu)
1216 {
1217         u32 ecx = kvm_rcx_read(vcpu);
1218         u64 data;
1219         int err;
1220 
1221         err = kvm_pmu_rdpmc(vcpu, ecx, &data);
1222         if (err)
1223                 return err;
1224         kvm_rax_write(vcpu, (u32)data);
1225         kvm_rdx_write(vcpu, data >> 32);
1226         return err;
1227 }
1228 EXPORT_SYMBOL_GPL(kvm_rdpmc);
1229 
1230 /*
1231  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
1232  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
1233  *
1234  * The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features)
1235  * extract the supported MSRs from the related const lists.
1236  * msrs_to_save is selected from the msrs_to_save_all to reflect the
1237  * capabilities of the host cpu. This capabilities test skips MSRs that are
1238  * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
1239  * may depend on host virtualization features rather than host cpu features.
1240  */
1241 
1242 static const u32 msrs_to_save_all[] = {
1243         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1244         MSR_STAR,
1245 #ifdef CONFIG_X86_64
1246         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1247 #endif
1248         MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
1249         MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1250         MSR_IA32_SPEC_CTRL,
1251         MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
1252         MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
1253         MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
1254         MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
1255         MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
1256         MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
1257         MSR_IA32_UMWAIT_CONTROL,
1258 
1259         MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
1260         MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
1261         MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
1262         MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
1263         MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
1264         MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
1265         MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
1266         MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
1267         MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
1268         MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
1269         MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
1270         MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
1271         MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
1272         MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
1273         MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
1274         MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
1275         MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
1276         MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
1277         MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
1278         MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
1279         MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
1280         MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
1281 };
1282 
1283 static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
1284 static unsigned num_msrs_to_save;
1285 
1286 static const u32 emulated_msrs_all[] = {
1287         MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
1288         MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
1289         HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
1290         HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
1291         HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
1292         HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
1293         HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
1294         HV_X64_MSR_RESET,
1295         HV_X64_MSR_VP_INDEX,
1296         HV_X64_MSR_VP_RUNTIME,
1297         HV_X64_MSR_SCONTROL,
1298         HV_X64_MSR_STIMER0_CONFIG,
1299         HV_X64_MSR_VP_ASSIST_PAGE,
1300         HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
1301         HV_X64_MSR_TSC_EMULATION_STATUS,
1302         HV_X64_MSR_SYNDBG_OPTIONS,
1303         HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
1304         HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
1305         HV_X64_MSR_SYNDBG_PENDING_BUFFER,
1306 
1307         MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
1308         MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
1309 
1310         MSR_IA32_TSC_ADJUST,
1311         MSR_IA32_TSCDEADLINE,
1312         MSR_IA32_ARCH_CAPABILITIES,
1313         MSR_IA32_PERF_CAPABILITIES,
1314         MSR_IA32_MISC_ENABLE,
1315         MSR_IA32_MCG_STATUS,
1316         MSR_IA32_MCG_CTL,
1317         MSR_IA32_MCG_EXT_CTL,
1318         MSR_IA32_SMBASE,
1319         MSR_SMI_COUNT,
1320         MSR_PLATFORM_INFO,
1321         MSR_MISC_FEATURES_ENABLES,
1322         MSR_AMD64_VIRT_SPEC_CTRL,
1323         MSR_IA32_POWER_CTL,
1324         MSR_IA32_UCODE_REV,
1325 
1326         /*
1327          * The following list leaves out MSRs whose values are determined
1328          * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
1329          * We always support the "true" VMX control MSRs, even if the host
1330          * processor does not, so I am putting these registers here rather
1331          * than in msrs_to_save_all.
1332          */
1333         MSR_IA32_VMX_BASIC,
1334         MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1335         MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1336         MSR_IA32_VMX_TRUE_EXIT_CTLS,
1337         MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1338         MSR_IA32_VMX_MISC,
1339         MSR_IA32_VMX_CR0_FIXED0,
1340         MSR_IA32_VMX_CR4_FIXED0,
1341         MSR_IA32_VMX_VMCS_ENUM,
1342         MSR_IA32_VMX_PROCBASED_CTLS2,
1343         MSR_IA32_VMX_EPT_VPID_CAP,
1344         MSR_IA32_VMX_VMFUNC,
1345 
1346         MSR_K7_HWCR,
1347         MSR_KVM_POLL_CONTROL,
1348 };
1349 
1350 static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
1351 static unsigned num_emulated_msrs;
1352 
1353 /*
1354  * List of msr numbers which are used to expose MSR-based features that
1355  * can be used by a hypervisor to validate requested CPU features.
1356  */
1357 static const u32 msr_based_features_all[] = {
1358         MSR_IA32_VMX_BASIC,
1359         MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1360         MSR_IA32_VMX_PINBASED_CTLS,
1361         MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1362         MSR_IA32_VMX_PROCBASED_CTLS,
1363         MSR_IA32_VMX_TRUE_EXIT_CTLS,
1364         MSR_IA32_VMX_EXIT_CTLS,
1365         MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1366         MSR_IA32_VMX_ENTRY_CTLS,
1367         MSR_IA32_VMX_MISC,
1368         MSR_IA32_VMX_CR0_FIXED0,
1369         MSR_IA32_VMX_CR0_FIXED1,
1370         MSR_IA32_VMX_CR4_FIXED0,
1371         MSR_IA32_VMX_CR4_FIXED1,
1372         MSR_IA32_VMX_VMCS_ENUM,
1373         MSR_IA32_VMX_PROCBASED_CTLS2,
1374         MSR_IA32_VMX_EPT_VPID_CAP,
1375         MSR_IA32_VMX_VMFUNC,
1376 
1377         MSR_F10H_DECFG,
1378         MSR_IA32_UCODE_REV,
1379         MSR_IA32_ARCH_CAPABILITIES,
1380         MSR_IA32_PERF_CAPABILITIES,
1381 };
1382 
1383 static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
1384 static unsigned int num_msr_based_features;
1385 
1386 static u64 kvm_get_arch_capabilities(void)
1387 {
1388         u64 data = 0;
1389 
1390         if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
1391                 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
1392 
1393         /*
1394          * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
1395          * the nested hypervisor runs with NX huge pages.  If it is not,
1396  * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
1397          * L1 guests, so it need not worry about its own (L2) guests.
1398          */
1399         data |= ARCH_CAP_PSCHANGE_MC_NO;
1400 
1401         /*
1402          * If we're doing cache flushes (either "always" or "cond")
1403          * we will do one whenever the guest does a vmlaunch/vmresume.
1404          * If an outer hypervisor is doing the cache flush for us
1405          * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
1406          * capability to the guest too, and if EPT is disabled we're not
1407          * vulnerable.  Overall, only VMENTER_L1D_FLUSH_NEVER will
1408          * require a nested hypervisor to do a flush of its own.
1409          */
1410         if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
1411                 data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
1412 
1413         if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
1414                 data |= ARCH_CAP_RDCL_NO;
1415         if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
1416                 data |= ARCH_CAP_SSB_NO;
1417         if (!boot_cpu_has_bug(X86_BUG_MDS))
1418                 data |= ARCH_CAP_MDS_NO;
1419 
1420         if (!boot_cpu_has(X86_FEATURE_RTM)) {
1421                 /*
1422                  * If RTM=0 because the kernel has disabled TSX, the host might
1423                  * have TAA_NO or TSX_CTRL.  Clear TAA_NO (the guest sees RTM=0
1424                  * and therefore knows that there cannot be TAA) but keep
1425                  * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
1426                  * and we want to allow migrating those guests to tsx=off hosts.
1427                  */
1428                 data &= ~ARCH_CAP_TAA_NO;
1429         } else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
1430                 data |= ARCH_CAP_TAA_NO;
1431         } else {
1432                 /*
1433                  * Nothing to do here; we emulate TSX_CTRL if present on the
1434                  * host so the guest can choose between disabling TSX or
1435                  * using VERW to clear CPU buffers.
1436                  */
1437         }
1438 
1439         return data;
1440 }
1441 
1442 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
1443 {
1444         switch (msr->index) {
1445         case MSR_IA32_ARCH_CAPABILITIES:
1446                 msr->data = kvm_get_arch_capabilities();
1447                 break;
1448         case MSR_IA32_UCODE_REV:
1449                 rdmsrl_safe(msr->index, &msr->data);
1450                 break;
1451         default:
1452                 return static_call(kvm_x86_get_msr_feature)(msr);
1453         }
1454         return 0;
1455 }
1456 
1457 static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1458 {
1459         struct kvm_msr_entry msr;
1460         int r;
1461 
1462         msr.index = index;
1463         r = kvm_get_msr_feature(&msr);
1464 
1465         if (r == KVM_MSR_RET_INVALID) {
1466                 /* Unconditionally clear the output for simplicity */
1467                 *data = 0;
1468                 if (kvm_msr_ignored_check(index, 0, false))
1469                         r = 0;
1470         }
1471 
1472         if (r)
1473                 return r;
1474 
1475         *data = msr.data;
1476 
1477         return 0;
1478 }
1479 
1480 static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1481 {
1482         if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
1483                 return false;
1484 
1485         if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1486                 return false;
1487 
1488         if (efer & (EFER_LME | EFER_LMA) &&
1489             !guest_cpuid_has(vcpu, X86_FEATURE_LM))
1490                 return false;
1491 
1492         if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
1493                 return false;
1494 
1495         return true;
1496 
1497 }
1498 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1499 {
1500         if (efer & efer_reserved_bits)
1501                 return false;
1502 
1503         return __kvm_valid_efer(vcpu, efer);
1504 }
1505 EXPORT_SYMBOL_GPL(kvm_valid_efer);
1506 
1507 static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1508 {
1509         u64 old_efer = vcpu->arch.efer;
1510         u64 efer = msr_info->data;
1511         int r;
1512 
1513         if (efer & efer_reserved_bits)
1514                 return 1;
1515 
1516         if (!msr_info->host_initiated) {
1517                 if (!__kvm_valid_efer(vcpu, efer))
1518                         return 1;
1519 
1520                 if (is_paging(vcpu) &&
1521                     (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1522                         return 1;
1523         }
1524 
1525         efer &= ~EFER_LMA;
1526         efer |= vcpu->arch.efer & EFER_LMA;
1527 
1528         r = static_call(kvm_x86_set_efer)(vcpu, efer);
1529         if (r) {
1530                 WARN_ON(r > 0);
1531                 return r;
1532         }
1533 
1534         /* Update reserved bits */
1535         if ((efer ^ old_efer) & EFER_NX)
1536                 kvm_mmu_reset_context(vcpu);
1537 
1538         return 0;
1539 }
1540 
1541 void kvm_enable_efer_bits(u64 mask)
1542 {
1543        efer_reserved_bits &= ~mask;
1544 }
1545 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
1546 
1547 bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
1548 {
1549         struct kvm_x86_msr_filter *msr_filter;
1550         struct msr_bitmap_range *ranges;
1551         struct kvm *kvm = vcpu->kvm;
1552         bool allowed;
1553         int idx;
1554         u32 i;
1555 
1556         /* x2APIC MSRs do not support filtering. */
1557         if (index >= 0x800 && index <= 0x8ff)
1558                 return true;
1559 
1560         idx = srcu_read_lock(&kvm->srcu);
1561 
1562         msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
1563         if (!msr_filter) {
1564                 allowed = true;
1565                 goto out;
1566         }
1567 
1568         allowed = msr_filter->default_allow;
1569         ranges = msr_filter->ranges;
1570 
1571         for (i = 0; i < msr_filter->count; i++) {
1572                 u32 start = ranges[i].base;
1573                 u32 end = start + ranges[i].nmsrs;
1574                 u32 flags = ranges[i].flags;
1575                 unsigned long *bitmap = ranges[i].bitmap;
1576 
1577                 if ((index >= start) && (index < end) && (flags & type)) {
1578                         allowed = !!test_bit(index - start, bitmap);
1579                         break;
1580                 }
1581         }
1582 
1583 out:
1584         srcu_read_unlock(&kvm->srcu, idx);
1585 
1586         return allowed;
1587 }
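/*
 * Illustrative example, not part of the original file: a filter with
 * default_allow == false and a single range
 *
 *	ranges[0] = { .base = 0xc0000080, .nmsrs = 1,
 *		      .flags = KVM_MSR_FILTER_WRITE, .bitmap = <bit 0 set> };
 *
 * lets only writes to MSR 0xc0000080 (EFER) hit the test_bit() path
 * above; every other non-x2APIC access falls through to default_allow
 * and is reported as filtered.
 */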
1588 EXPORT_SYMBOL_GPL(kvm_msr_allowed);
1589 
1590 /*
1591  * Write @data into the MSR specified by @index.  Select MSR specific fault
1592  * checks are bypassed if @host_initiated is %true.
1593  * Returns 0 on success, non-0 otherwise.
1594  * Assumes vcpu_load() was already called.
1595  */
1596 static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
1597                          bool host_initiated)
1598 {
1599         struct msr_data msr;
1600 
1601         if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
1602                 return KVM_MSR_RET_FILTERED;
1603 
1604         switch (index) {
1605         case MSR_FS_BASE:
1606         case MSR_GS_BASE:
1607         case MSR_KERNEL_GS_BASE:
1608         case MSR_CSTAR:
1609         case MSR_LSTAR:
1610                 if (is_noncanonical_address(data, vcpu))
1611                         return 1;
1612                 break;
1613         case MSR_IA32_SYSENTER_EIP:
1614         case MSR_IA32_SYSENTER_ESP:
1615                 /*
1616                  * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
1617                  * non-canonical address is written on Intel but not on
1618                  * AMD (which ignores the top 32-bits, because it does
1619                  * not implement 64-bit SYSENTER).
1620                  *
1621                  * 64-bit code should hence be able to write a non-canonical
1622                  * value on AMD.  Making the address canonical ensures that
1623                  * vmentry does not fail on Intel after writing a non-canonical
1624                  * value, and that something deterministic happens if the guest
1625                  * invokes 64-bit SYSENTER.
1626                  */
1627                 data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
1628         }
1629 
1630         msr.data = data;
1631         msr.index = index;
1632         msr.host_initiated = host_initiated;
1633 
1634         return static_call(kvm_x86_set_msr)(vcpu, &msr);
1635 }
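/*
 * Illustrative sketch, not part of the original file: with 48
 * implemented virtual-address bits, the canonicalization done by
 * get_canonical() above is just a sign extension of bit 47:
 *
 *	static inline u64 example_canonical_48(u64 la)
 *	{
 *		return (u64)(((s64)la << 16) >> 16);
 *	}
 *
 * e.g. 0x0000800000000000 becomes 0xffff800000000000, which Intel CPUs
 * will accept for IA32_SYSENTER_EIP/ESP.
 */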
1636 
1637 static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
1638                                      u32 index, u64 data, bool host_initiated)
1639 {
1640         int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
1641 
1642         if (ret == KVM_MSR_RET_INVALID)
1643                 if (kvm_msr_ignored_check(index, data, true))
1644                         ret = 0;
1645 
1646         return ret;
1647 }
1648 
1649 /*
1650  * Read the MSR specified by @index into @data.  Select MSR specific fault
1651  * checks are bypassed if @host_initiated is %true.
1652  * Returns 0 on success, non-0 otherwise.
1653  * Assumes vcpu_load() was already called.
1654  */
1655 int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
1656                   bool host_initiated)
1657 {
1658         struct msr_data msr;
1659         int ret;
1660 
1661         if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
1662                 return KVM_MSR_RET_FILTERED;
1663 
1664         msr.index = index;
1665         msr.host_initiated = host_initiated;
1666 
1667         ret = static_call(kvm_x86_get_msr)(vcpu, &msr);
1668         if (!ret)
1669                 *data = msr.data;
1670         return ret;
1671 }
1672 
1673 static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
1674                                      u32 index, u64 *data, bool host_initiated)
1675 {
1676         int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
1677 
1678         if (ret == KVM_MSR_RET_INVALID) {
1679                 /* Unconditionally clear *data for simplicity */
1680                 *data = 0;
1681                 if (kvm_msr_ignored_check(index, 0, false))
1682                         ret = 0;
1683         }
1684 
1685         return ret;
1686 }
1687 
1688 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
1689 {
1690         return kvm_get_msr_ignored_check(vcpu, index, data, false);
1691 }
1692 EXPORT_SYMBOL_GPL(kvm_get_msr);
1693 
1694 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
1695 {
1696         return kvm_set_msr_ignored_check(vcpu, index, data, false);
1697 }
1698 EXPORT_SYMBOL_GPL(kvm_set_msr);
1699 
1700 static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
1701 {
1702         int err = vcpu->run->msr.error;
1703         if (!err) {
1704                 kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
1705                 kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
1706         }
1707 
1708         return static_call(kvm_x86_complete_emulated_msr)(vcpu, err);
1709 }
1710 
1711 static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
1712 {
1713         return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
1714 }
1715 
1716 static u64 kvm_msr_reason(int r)
1717 {
1718         switch (r) {
1719         case KVM_MSR_RET_INVALID:
1720                 return KVM_MSR_EXIT_REASON_UNKNOWN;
1721         case KVM_MSR_RET_FILTERED:
1722                 return KVM_MSR_EXIT_REASON_FILTER;
1723         default:
1724                 return KVM_MSR_EXIT_REASON_INVAL;
1725         }
1726 }
1727 
1728 static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
1729                               u32 exit_reason, u64 data,
1730                               int (*completion)(struct kvm_vcpu *vcpu),
1731                               int r)
1732 {
1733         u64 msr_reason = kvm_msr_reason(r);
1734 
1735         /* Check if the user wanted to know about this MSR fault */
1736         if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
1737                 return 0;
1738 
1739         vcpu->run->exit_reason = exit_reason;
1740         vcpu->run->msr.error = 0;
1741         memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
1742         vcpu->run->msr.reason = msr_reason;
1743         vcpu->run->msr.index = index;
1744         vcpu->run->msr.data = data;
1745         vcpu->arch.complete_userspace_io = completion;
1746 
1747         return 1;
1748 }
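/*
 * Illustrative sketch, not part of the original file, of the matching
 * userspace handler (emulate_rdmsr() is a hypothetical helper):
 *
 *	ioctl(vcpu_fd, KVM_RUN, 0);
 *	if (run->exit_reason == KVM_EXIT_X86_RDMSR) {
 *		run->msr.data  = emulate_rdmsr(run->msr.index);
 *		run->msr.error = 0;	// non-zero asks KVM to inject #GP
 *	}
 *	// the next KVM_RUN resumes via complete_emulated_rdmsr() above
 */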
1749 
1750 static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
1751 {
1752         return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
1753                                    complete_emulated_rdmsr, r);
1754 }
1755 
1756 static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
1757 {
1758         return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
1759                                    complete_emulated_wrmsr, r);
1760 }
1761 
1762 int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
1763 {
1764         u32 ecx = kvm_rcx_read(vcpu);
1765         u64 data;
1766         int r;
1767 
1768         r = kvm_get_msr(vcpu, ecx, &data);
1769 
1770         /* MSR read failed? See if we should ask user space */
1771         if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
1772                 /* Bounce to user space */
1773                 return 0;
1774         }
1775 
1776         if (!r) {
1777                 trace_kvm_msr_read(ecx, data);
1778 
1779                 kvm_rax_write(vcpu, data & -1u);
1780                 kvm_rdx_write(vcpu, (data >> 32) & -1u);
1781         } else {
1782                 trace_kvm_msr_read_ex(ecx);
1783         }
1784 
1785         return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
1786 }
1787 EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
1788 
1789 int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
1790 {
1791         u32 ecx = kvm_rcx_read(vcpu);
1792         u64 data = kvm_read_edx_eax(vcpu);
1793         int r;
1794 
1795         r = kvm_set_msr(vcpu, ecx, data);
1796 
1797         /* MSR write failed? See if we should ask user space */
1798         if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
1799                 /* Bounce to user space */
1800                 return 0;
1801 
1802         /* Signal all other negative errors to userspace */
1803         if (r < 0)
1804                 return r;
1805 
1806         if (!r)
1807                 trace_kvm_msr_write(ecx, data);
1808         else
1809                 trace_kvm_msr_write_ex(ecx, data);
1810 
1811         return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
1812 }
1813 EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
1814 
1815 static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
1816 {
1817         xfer_to_guest_mode_prepare();
1818         return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
1819                 xfer_to_guest_mode_work_pending();
1820 }
1821 
1822 /*
1823  * The fast path for frequent and performance sensitive wrmsr emulation,
1824  * i.e. the sending of an IPI.  Sending the IPI early in the VM-Exit flow
1825  * reduces the latency of virtual IPIs by avoiding the expensive bits of
1826  * transitioning from guest to host, e.g. reacquiring KVM's SRCU lock, in
1827  * contrast to the other cases, which must run after host interrupts are enabled.
1828  */
1829 static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
1830 {
1831         if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
1832                 return 1;
1833 
1834         if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
1835                 ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
1836                 ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
1837                 ((u32)(data >> 32) != X2APIC_BROADCAST)) {
1838 
1839                 data &= ~(1 << 12);
1840                 kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
1841                 kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
1842                 kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
1843                 trace_kvm_apic_write(APIC_ICR, (u32)data);
1844                 return 0;
1845         }
1846 
1847         return 1;
1848 }
1849 
1850 static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
1851 {
1852         if (!kvm_can_use_hv_timer(vcpu))
1853                 return 1;
1854 
1855         kvm_set_lapic_tscdeadline_msr(vcpu, data);
1856         return 0;
1857 }
1858 
1859 fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
1860 {
1861         u32 msr = kvm_rcx_read(vcpu);
1862         u64 data;
1863         fastpath_t ret = EXIT_FASTPATH_NONE;
1864 
1865         switch (msr) {
1866         case APIC_BASE_MSR + (APIC_ICR >> 4):
1867                 data = kvm_read_edx_eax(vcpu);
1868                 if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
1869                         kvm_skip_emulated_instruction(vcpu);
1870                         ret = EXIT_FASTPATH_EXIT_HANDLED;
1871                 }
1872                 break;
1873         case MSR_IA32_TSCDEADLINE:
1874                 data = kvm_read_edx_eax(vcpu);
1875                 if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
1876                         kvm_skip_emulated_instruction(vcpu);
1877                         ret = EXIT_FASTPATH_REENTER_GUEST;
1878                 }
1879                 break;
1880         default:
1881                 break;
1882         }
1883 
1884         if (ret != EXIT_FASTPATH_NONE)
1885                 trace_kvm_msr_write(msr, data);
1886 
1887         return ret;
1888 }
1889 EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
1890 
1891 /*
1892  * Adapt set_msr() to msr_io()'s calling convention
1893  */
1894 static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1895 {
1896         return kvm_get_msr_ignored_check(vcpu, index, data, true);
1897 }
1898 
1899 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1900 {
1901         return kvm_set_msr_ignored_check(vcpu, index, *data, true);
1902 }
1903 
1904 #ifdef CONFIG_X86_64
1905 struct pvclock_clock {
1906         int vclock_mode;
1907         u64 cycle_last;
1908         u64 mask;
1909         u32 mult;
1910         u32 shift;
1911         u64 base_cycles;
1912         u64 offset;
1913 };
1914 
1915 struct pvclock_gtod_data {
1916         seqcount_t      seq;
1917 
1918         struct pvclock_clock clock; /* extract of a clocksource struct */
1919         struct pvclock_clock raw_clock; /* extract of a clocksource struct */
1920 
1921         ktime_t         offs_boot;
1922         u64             wall_time_sec;
1923 };
1924 
1925 static struct pvclock_gtod_data pvclock_gtod_data;
1926 
1927 static void update_pvclock_gtod(struct timekeeper *tk)
1928 {
1929         struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
1930 
1931         write_seqcount_begin(&vdata->seq);
1932 
1933         /* copy pvclock gtod data */
1934         vdata->clock.vclock_mode        = tk->tkr_mono.clock->vdso_clock_mode;
1935         vdata->clock.cycle_last         = tk->tkr_mono.cycle_last;
1936         vdata->clock.mask               = tk->tkr_mono.mask;
1937         vdata->clock.mult               = tk->tkr_mono.mult;
1938         vdata->clock.shift              = tk->tkr_mono.shift;
1939         vdata->clock.base_cycles        = tk->tkr_mono.xtime_nsec;
1940         vdata->clock.offset             = tk->tkr_mono.base;
1941 
1942         vdata->raw_clock.vclock_mode    = tk->tkr_raw.clock->vdso_clock_mode;
1943         vdata->raw_clock.cycle_last     = tk->tkr_raw.cycle_last;
1944         vdata->raw_clock.mask           = tk->tkr_raw.mask;
1945         vdata->raw_clock.mult           = tk->tkr_raw.mult;
1946         vdata->raw_clock.shift          = tk->tkr_raw.shift;
1947         vdata->raw_clock.base_cycles    = tk->tkr_raw.xtime_nsec;
1948         vdata->raw_clock.offset         = tk->tkr_raw.base;
1949 
1950         vdata->wall_time_sec            = tk->xtime_sec;
1951 
1952         vdata->offs_boot                = tk->offs_boot;
1953 
1954         write_seqcount_end(&vdata->seq);
1955 }
1956 
1957 static s64 get_kvmclock_base_ns(void)
1958 {
1959         /* Count up from boot time, but with the frequency of the raw clock.  */
1960         return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
1961 }
1962 #else
1963 static s64 get_kvmclock_base_ns(void)
1964 {
1965         /* Master clock not used, so we can just use CLOCK_BOOTTIME.  */
1966         return ktime_get_boottime_ns();
1967 }
1968 #endif
1969 
1970 void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
1971 {
1972         int version;
1973         int r;
1974         struct pvclock_wall_clock wc;
1975         u32 wc_sec_hi;
1976         u64 wall_nsec;
1977 
1978         if (!wall_clock)
1979                 return;
1980 
1981         r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
1982         if (r)
1983                 return;
1984 
1985         if (version & 1)
1986                 ++version;  /* first time write, random junk */
1987 
1988         ++version;
1989 
1990         if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
1991                 return;
1992 
1993         /*
1994          * The guest calculates current wall clock time by adding
1995          * system time (updated by kvm_guest_time_update below) to the
1996          * wall clock specified here.  We do the reverse here.
1997          */
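        /*
         * Illustrative identity, not part of the original file: the guest
         * computes  wall_time = wc + kvmclock_now , so storing
         * wc = CLOCK_REALTIME_now - kvmclock_now  (the subtraction below)
         * makes both sides agree at the moment of this update.
         */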
1998         wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
1999 
2000         wc.nsec = do_div(wall_nsec, 1000000000);
2001         wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
2002         wc.version = version;
2003 
2004         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
2005 
2006         if (sec_hi_ofs) {
2007                 wc_sec_hi = wall_nsec >> 32;
2008                 kvm_write_guest(kvm, wall_clock + sec_hi_ofs,
2009                                 &wc_sec_hi, sizeof(wc_sec_hi));
2010         }
2011 
2012         version++;
2013         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
2014 }
2015 
2016 static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
2017                                   bool old_msr, bool host_initiated)
2018 {
2019         struct kvm_arch *ka = &vcpu->kvm->arch;
2020 
2021         if (vcpu->vcpu_id == 0 && !host_initiated) {
2022                 if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
2023                         kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2024 
2025                 ka->boot_vcpu_runs_old_kvmclock = old_msr;
2026         }
2027 
2028         vcpu->arch.time = system_time;
2029         kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2030 
2031         /* we verify if the enable bit is set... */
2032         vcpu->arch.pv_time_enabled = false;
2033         if (!(system_time & 1))
2034                 return;
2035 
2036         if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
2037                                        &vcpu->arch.pv_time, system_time & ~1ULL,
2038                                        sizeof(struct pvclock_vcpu_time_info)))
2039                 vcpu->arch.pv_time_enabled = true;
2040 
2041         return;
2042 }
2043 
2044 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
2045 {
2046         do_shl32_div32(dividend, divisor);
2047         return dividend;
2048 }
2049 
2050 static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
2051                                s8 *pshift, u32 *pmultiplier)
2052 {
2053         uint64_t scaled64;
2054         int32_t  shift = 0;
2055         uint64_t tps64;
2056         uint32_t tps32;
2057 
2058         tps64 = base_hz;
2059         scaled64 = scaled_hz;
2060         while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
2061                 tps64 >>= 1;
2062                 shift--;
2063         }
2064 
2065         tps32 = (uint32_t)tps64;
2066         while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
2067                 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
2068                         scaled64 >>= 1;
2069                 else
2070                         tps32 <<= 1;
2071                 shift++;
2072         }
2073 
2074         *pshift = shift;
2075         *pmultiplier = div_frac(scaled64, tps32);
2076 }
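/*
 * Illustrative example, not part of the original file: converting a
 * 2 GHz TSC into nanoseconds,
 *
 *	kvm_get_time_scale(NSEC_PER_SEC, 2000000000ULL, &shift, &mult);
 *
 * yields shift = 0 and mult = 0x80000000, so the pvclock calculation
 * ns = (cycles * mult) >> 32 reduces to cycles / 2, as expected for a
 * counter that ticks twice per nanosecond.
 */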
2077 
2078 #ifdef CONFIG_X86_64
2079 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
2080 #endif
2081 
2082 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
2083 static unsigned long max_tsc_khz;
2084 
2085 static u32 adjust_tsc_khz(u32 khz, s32 ppm)
2086 {
2087         u64 v = (u64)khz * (1000000 + ppm);
2088         do_div(v, 1000000);
2089         return v;
2090 }
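/*
 * Illustrative example, not part of the original file: for a 1000000 kHz
 * (1 GHz) host TSC and a tolerance of 250 ppm,
 * adjust_tsc_khz(1000000, -250) = 999750 and
 * adjust_tsc_khz(1000000,  250) = 1000250; these become thresh_lo and
 * thresh_hi in kvm_set_tsc_khz() below.
 */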
2091 
2092 static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
2093 {
2094         u64 ratio;
2095 
2096         /* Guest TSC same frequency as host TSC? */
2097         if (!scale) {
2098                 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
2099                 return 0;
2100         }
2101 
2102         /* TSC scaling supported? */
2103         if (!kvm_has_tsc_control) {
2104                 if (user_tsc_khz > tsc_khz) {
2105                         vcpu->arch.tsc_catchup = 1;
2106                         vcpu->arch.tsc_always_catchup = 1;
2107                         return 0;
2108                 } else {
2109                         pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
2110                         return -1;
2111                 }
2112         }
2113 
2114         /* TSC scaling required - calculate ratio */
2115         ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
2116                                 user_tsc_khz, tsc_khz);
2117 
2118         if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
2119                 pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
2120                                     user_tsc_khz);
2121                 return -1;
2122         }
2123 
2124         vcpu->arch.tsc_scaling_ratio = ratio;
2125         return 0;
2126 }
2127 
2128 static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
2129 {
2130         u32 thresh_lo, thresh_hi;
2131         int use_scaling = 0;
2132 
2133         /* tsc_khz can be zero if TSC calibration fails */
2134         if (user_tsc_khz == 0) {
2135                 /* set tsc_scaling_ratio to a safe value */
2136                 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
2137                 return -1;
2138         }
2139 
2140         /* Compute a scale to convert nanoseconds in TSC cycles */
2141         kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
2142                            &vcpu->arch.virtual_tsc_shift,
2143                            &vcpu->arch.virtual_tsc_mult);
2144         vcpu->arch.virtual_tsc_khz = user_tsc_khz;
2145 
2146         /*
2147          * Compute the variation in TSC rate which is acceptable
2148          * within the range of tolerance and decide if the
2149          * rate being applied is within those bounds of the hardware
2150          * rate.  If so, no scaling or compensation need be done.
2151          */
2152         thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
2153         thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
2154         if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
2155                 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
2156                 use_scaling = 1;
2157         }
2158         return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
2159 }
2160 
2161 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
2162 {
2163         u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
2164                                       vcpu->arch.virtual_tsc_mult,
2165                                       vcpu->arch.virtual_tsc_shift);
2166         tsc += vcpu->arch.this_tsc_write;
2167         return tsc;
2168 }
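/*
 * Illustrative example, not part of the original file: for a vCPU with
 * virtual_tsc_khz = 2000000 (2 GHz), one millisecond of kernel_ns
 * beyond this_tsc_nsec scales to 2,000,000 cycles, so the function
 * returns this_tsc_write + 2000000.
 */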
2169 
2170 static inline int gtod_is_based_on_tsc(int mode)
2171 {
2172         return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
2173 }
2174 
2175 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
2176 {
2177 #ifdef CONFIG_X86_64
2178         bool vcpus_matched;
2179         struct kvm_arch *ka = &vcpu->kvm->arch;
2180         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2181 
2182         vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
2183                          atomic_read(&vcpu->kvm->online_vcpus));
2184 
2185         /*
2186          * Once the masterclock is enabled, always perform the request in
2187          * order to update it.
2188          *
2189          * In order to enable masterclock, the host clocksource must be TSC
2190          * and the vcpus need to have matched TSCs.  When that happens,
2191          * perform the request to enable the masterclock.
2192          */
2193         if (ka->use_master_clock ||
2194             (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
2195                 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2196 
2197         trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
2198                             atomic_read(&vcpu->kvm->online_vcpus),
2199                             ka->use_master_clock, gtod->clock.vclock_mode);
2200 #endif
2201 }
2202 
2203 /*
2204  * Multiply tsc by a fixed point number represented by ratio.
2205  *
2206  * The most significant 64-N bits (mult) of ratio represent the
2207  * integral part of the fixed point number; the remaining N bits
2208  * (frac) represent the fractional part, ie. ratio represents a fixed
2209  * point number (mult + frac * 2^(-N)).
2210  *
2211  * N equals to kvm_tsc_scaling_ratio_frac_bits.
2212  */
2213 static inline u64 __scale_tsc(u64 ratio, u64 tsc)
2214 {
2215         return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
2216 }
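/*
 * Illustrative example, not part of the original file, assuming 48
 * fractional bits (the VMX value of kvm_tsc_scaling_ratio_frac_bits):
 * a 1500 MHz guest on a 3000 MHz host gets
 *
 *	ratio = (1500000ULL << 48) / 3000000 = 1ULL << 47;
 *
 * so __scale_tsc(ratio, tsc) = (tsc * ratio) >> 48 = tsc / 2.
 */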
2217 
2218 u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
2219 {
2220         u64 _tsc = tsc;
2221         u64 ratio = vcpu->arch.tsc_scaling_ratio;
2222 
2223         if (ratio != kvm_default_tsc_scaling_ratio)
2224                 _tsc = __scale_tsc(ratio, tsc);
2225 
2226         return _tsc;
2227 }
2228 EXPORT_SYMBOL_GPL(kvm_scale_tsc);
2229 
2230 static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
2231 {
2232         u64 tsc;
2233 
2234         tsc = kvm_scale_tsc(vcpu, rdtsc());
2235 
2236         return target_tsc - tsc;
2237 }
2238 
2239 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
2240 {
2241         return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
2242 }
2243 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
2244 
2245 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2246 {
2247         vcpu->arch.l1_tsc_offset = offset;
2248         vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset);
2249 }
2250 
2251 static inline bool kvm_check_tsc_unstable(void)
2252 {
2253 #ifdef CONFIG_X86_64
2254         /*
2255          * TSC is marked unstable when we're running on Hyper-V, but the
2256          * 'TSC page' clocksource is still good.
2257          */
2258         if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
2259                 return false;
2260 #endif
2261         return check_tsc_unstable();
2262 }
2263 
2264 static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
2265 {
2266         struct kvm *kvm = vcpu->kvm;
2267         u64 offset, ns, elapsed;
2268         unsigned long flags;
2269         bool matched;
2270         bool already_matched;
2271         bool synchronizing = false;
2272 
2273         raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
2274         offset = kvm_compute_tsc_offset(vcpu, data);
2275         ns = get_kvmclock_base_ns();
2276         elapsed = ns - kvm->arch.last_tsc_nsec;
2277 
2278         if (vcpu->arch.virtual_tsc_khz) {
2279                 if (data == 0) {
2280                         /*
2281                          * detection of vcpu initialization -- need to sync
2282                          * with other vCPUs. This particularly helps to keep
2283                          * kvm_clock stable after CPU hotplug
2284                          */
2285                         synchronizing = true;
2286                 } else {
2287                         u64 tsc_exp = kvm->arch.last_tsc_write +
2288                                                 nsec_to_cycles(vcpu, elapsed);
2289                         u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
2290                         /*
2291                          * Special case: TSC write with a small delta (1 second)
2292                          * of virtual cycle time against real time is
2293                          * interpreted as an attempt to synchronize the CPU.
2294                          */
2295                         synchronizing = data < tsc_exp + tsc_hz &&
2296                                         data + tsc_hz > tsc_exp;
2297                 }
2298         }
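        /*
         * Illustrative numbers, not part of the original file: with
         * virtual_tsc_khz = 2000000 (2 GHz), tsc_hz is 2e9, so a guest TSC
         * write that lands within roughly one second's worth of cycles of
         * tsc_exp is treated above as an attempt to synchronize with the
         * other vCPUs rather than as an intentional jump.
         */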
2299 
2300         /*
2301          * For a reliable TSC, we can match TSC offsets, and for an unstable
2302          * TSC, we add elapsed time in this computation.  We could let the
2303          * compensation code attempt to catch up if we fall behind, but
2304          * it's better to try to match offsets from the beginning.
2305          */
2306         if (synchronizing &&
2307             vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
2308                 if (!kvm_check_tsc_unstable()) {
2309                         offset = kvm->arch.cur_tsc_offset;
2310                 } else {
2311                         u64 delta = nsec_to_cycles(vcpu, elapsed);
2312                         data += delta;
2313                         offset = kvm_compute_tsc_offset(vcpu, data);
2314                 }
2315                 matched = true;
2316                 already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
2317         } else {
2318                 /*
2319                  * We split periods of matched TSC writes into generations.
2320                  * For each generation, we track the original measured
2321                  * nanosecond time, offset, and write, so if TSCs are in
2322                  * sync, we can match exact offset, and if not, we can match
2323                  * exact software computation in compute_guest_tsc()
2324                  *
2325                  * These values are tracked in kvm->arch.cur_xxx variables.
2326                  */
2327                 kvm->arch.cur_tsc_generation++;
2328                 kvm->arch.cur_tsc_nsec = ns;
2329                 kvm->arch.cur_tsc_write = data;
2330                 kvm->arch.cur_tsc_offset = offset;
2331                 matched = false;
2332         }
2333 
2334         /*
2335          * We also track the most recent recorded KHZ, write and time to
2336          * allow the matching interval to be extended at each write.
2337          */
2338         kvm->arch.last_tsc_nsec = ns;
2339         kvm->arch.last_tsc_write = data;
2340         kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
2341 
2342         vcpu->arch.last_guest_tsc = data;
2343 
2344         /* Keep track of which generation this VCPU has synchronized to */
2345         vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
2346         vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
2347         vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
2348 
2349         kvm_vcpu_write_tsc_offset(vcpu, offset);
2350         raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
2351 
2352         spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
2353         if (!matched) {
2354                 kvm->arch.nr_vcpus_matched_tsc = 0;
2355         } else if (!already_matched) {
2356                 kvm->arch.nr_vcpus_matched_tsc++;
2357         }
2358 
2359         kvm_track_tsc_matching(vcpu);
2360         spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
2361 }
2362 
2363 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
2364                                            s64 adjustment)
2365 {
2366         u64 tsc_offset = vcpu->arch.l1_tsc_offset;
2367         kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
2368 }
2369 
2370 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
2371 {
2372         if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
2373                 WARN_ON(adjustment < 0);
2374         adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
2375         adjust_tsc_offset_guest(vcpu, adjustment);
2376 }
2377 
2378 #ifdef CONFIG_X86_64
2379 
2380 static u64 read_tsc(void)
2381 {
2382         u64 ret = (u64)rdtsc_ordered();
2383         u64 last = pvclock_gtod_data.clock.cycle_last;
2384 
2385         if (likely(ret >= last))
2386                 return ret;
2387 
2388         /*
2389          * GCC likes to generate cmov here, but this branch is extremely
2390          * predictable (it's just a function of time and the likely is
2391          * very likely) and there's a data dependence, so force GCC
2392          * to generate a branch instead.  I don't barrier() because
2393          * we don't actually need a barrier, and if this function
2394          * ever gets inlined it will generate worse code.
2395          */
2396         asm volatile ("");
2397         return last;
2398 }
2399 
2400 static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
2401                           int *mode)
2402 {
2403         long v;
2404         u64 tsc_pg_val;
2405 
2406         switch (clock->vclock_mode) {
2407         case VDSO_CLOCKMODE_HVCLOCK:
2408                 tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
2409                                                   tsc_timestamp);
2410                 if (tsc_pg_val != U64_MAX) {
2411                         /* TSC page valid */
2412                         *mode = VDSO_CLOCKMODE_HVCLOCK;
2413                         v = (tsc_pg_val - clock->cycle_last) &
2414                                 clock->mask;
2415                 } else {
2416                         /* TSC page invalid */
2417                         *mode = VDSO_CLOCKMODE_NONE;
2418                 }
2419                 break;
2420         case VDSO_CLOCKMODE_TSC:
2421                 *mode = VDSO_CLOCKMODE_TSC;
2422                 *tsc_timestamp = read_tsc();
2423                 v = (*tsc_timestamp - clock->cycle_last) &
2424                         clock->mask;
2425                 break;
2426         default:
2427                 *mode = VDSO_CLOCKMODE_NONE;
2428         }
2429 
2430         if (*mode == VDSO_CLOCKMODE_NONE)
2431                 *tsc_timestamp = v = 0;
2432 
2433         return v * clock->mult;
2434 }
2435 
2436 static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
2437 {
2438         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2439         unsigned long seq;
2440         int mode;
2441         u64 ns;
2442 
2443         do {
2444                 seq = read_seqcount_begin(&gtod->seq);
2445                 ns = gtod->raw_clock.base_cycles;
2446                 ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
2447                 ns >>= gtod->raw_clock.shift;
2448                 ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
2449         } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2450         *t = ns;
2451 
2452         return mode;
2453 }
2454 
2455 static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
2456 {
2457         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2458         unsigned long seq;
2459         int mode;
2460         u64 ns;
2461 
2462         do {
2463                 seq = read_seqcount_begin(&gtod->seq);
2464                 ts->tv_sec = gtod->wall_time_sec;
2465                 ns = gtod->clock.base_cycles;
2466                 ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
2467                 ns >>= gtod->clock.shift;
2468         } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2469 
2470         ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
2471         ts->tv_nsec = ns;
2472 
2473         return mode;
2474 }
2475 
2476 /* returns true if host is using TSC based clocksource */
2477 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
2478 {
2479         /* checked again under seqlock below */
2480         if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
2481                 return false;
2482 
2483         return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
2484                                                       tsc_timestamp));
2485 }
2486 
2487 /* returns true if host is using TSC based clocksource */
2488 static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
2489                                            u64 *tsc_timestamp)
2490 {
2491         /* checked again under seqlock below */
2492         if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
2493                 return false;
2494 
2495         return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
2496 }
2497 #endif
2498 
2499 /*
2500  *
2501  * Assuming a stable TSC across physical CPUs, and a stable TSC
2502  * across virtual CPUs, the following condition is possible.
2503  * Each numbered line represents an event visible to both
2504  * CPUs at the next numbered event.
2505  *
2506  * "timespecX" represents host monotonic time. "tscX" represents
2507  * RDTSC value.
2508  *
2509  *              VCPU0 on CPU0           |       VCPU1 on CPU1
2510  *
2511  * 1.  read timespec0,tsc0
2512  * 2.                                   | timespec1 = timespec0 + N
2513  *                                      | tsc1 = tsc0 + M
2514  * 3. transition to guest               | transition to guest
2515  * 4. ret0 = timespec0 + (rdtsc - tsc0) |
2516  * 5.                                   | ret1 = timespec1 + (rdtsc - tsc1)
2517  *                                      | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
2518  *
2519  * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
2520  *
2521  *      - ret0 < ret1
2522  *      - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
2523  *              ...
2524  *      - 0 < N - M => M < N
2525  *
2526  * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
2527  * always the case (the difference between two distinct xtime instances
2528  * might be smaller than the difference between corresponding TSC reads,
2529  * when updating guest vcpus pvclock areas).
2530  *
2531  * To avoid that problem, do not allow visibility of distinct
2532  * system_timestamp/tsc_timestamp values simultaneously: use a master
2533  * copy of host monotonic time values. Update that master copy
2534  * in lockstep.
2535  *
2536  * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
2537  *
2538  */
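/*
 * Illustrative numbers, not part of the original file, for the scenario
 * above: timespec0 = 100, tsc0 = 1000, N = 5, M = 10.  If VCPU0 reads
 * rdtsc = 1010 at event 4 and VCPU1 reads rdtsc = 1012 at event 5, then
 * ret0 = 100 + (1010 - 1000) = 110 while
 * ret1 = 105 + (1012 - 1010) = 107 < ret0, i.e. time appears to go
 * backwards because M > N.
 */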
2539 
2540 static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
2541 {
2542 #ifdef CONFIG_X86_64
2543         struct kvm_arch *ka = &kvm->arch;
2544         int vclock_mode;
2545         bool host_tsc_clocksource, vcpus_matched;
2546 
2547         vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
2548                         atomic_read(&kvm->online_vcpus));
2549 
2550         /*
2551          * If the host uses TSC clock, then passthrough TSC as stable
2552          * to the guest.
2553          */
2554         host_tsc_clocksource = kvm_get_time_and_clockread(
2555                                         &ka->master_kernel_ns,
2556                                         &ka->master_cycle_now);
2557 
2558         ka->use_master_clock = host_tsc_clocksource && vcpus_matched
2559                                 && !ka->backwards_tsc_observed
2560                                 && !ka->boot_vcpu_runs_old_kvmclock;
2561 
2562         if (ka->use_master_clock)
2563                 atomic_set(&kvm_guest_has_master_clock, 1);
2564 
2565         vclock_mode = pvclock_gtod_data.clock.vclock_mode;
2566         trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
2567                                         vcpus_matched);
2568 #endif
2569 }
2570 
2571 void kvm_make_mclock_inprogress_request(struct kvm *kvm)
2572 {
2573         kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
2574 }
2575 
2576 static void kvm_gen_update_masterclock(struct kvm *kvm)
2577 {
2578 #ifdef CONFIG_X86_64
2579         int i;
2580         struct kvm_vcpu *vcpu;
2581         struct kvm_arch *ka = &kvm->arch;
2582         unsigned long flags;
2583 
2584         kvm_hv_invalidate_tsc_page(kvm);
2585 
2586         kvm_make_mclock_inprogress_request(kvm);
2587 
2588         /* no guest entries from this point */
2589         spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
2590         pvclock_update_vm_gtod_copy(kvm);
2591         spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
2592 
2593         kvm_for_each_vcpu(i, vcpu, kvm)
2594                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2595 
2596         /* guest entries allowed */
2597         kvm_for_each_vcpu(i, vcpu, kvm)
2598                 kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
2599 #endif
2600 }
2601 
2602 u64 get_kvmclock_ns(struct kvm *kvm)
2603 {
2604         struct kvm_arch *ka = &kvm->arch;
2605         struct pvclock_vcpu_time_info hv_clock;
2606         unsigned long flags;
2607         u64 ret;
2608 
2609         spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
2610         if (!ka->use_master_clock) {
2611                 spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
2612                 return get_kvmclock_base_ns() + ka->kvmclock_offset;
2613         }
2614 
2615         hv_clock.tsc_timestamp = ka->master_cycle_now;
2616         hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
2617         spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
2618 
2619         /* both __this_cpu_read() and rdtsc() should be on the same cpu */
2620         get_cpu();
2621 
2622         if (__this_cpu_read(cpu_tsc_khz)) {
2623                 kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
2624                                    &hv_clock.tsc_shift,
2625                                    &hv_clock.tsc_to_system_mul);
2626                 ret = __pvclock_read_cycles(&hv_clock, rdtsc());
2627         } else
2628                 ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
2629 
2630         put_cpu();
2631 
2632         return ret;
2633 }
2634 
2635 static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
2636                                    struct gfn_to_hva_cache *cache,
2637                                    unsigned int offset)
2638 {
2639         struct kvm_vcpu_arch *vcpu = &v->arch;
2640         struct pvclock_vcpu_time_info guest_hv_clock;
2641 
2642         if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache,
2643                 &guest_hv_clock, offset, sizeof(guest_hv_clock))))
2644                 return;
2645 
2646         /* This VCPU is paused, but it's legal for a guest to read another
2647          * VCPU's kvmclock, so we really have to follow the specification where
2648          * it says that version is odd if data is being modified, and even after
2649          * it is consistent.
2650          *
2651          * Version field updates must be kept separate.  This is because
2652          * kvm_write_guest_cached might use a "rep movs" instruction, and
2653          * writes within a string instruction are weakly ordered.  So there
2654          * are three writes overall.
2655          *
2656          * As a small optimization, only write the version field in the first
2657          * and third write.  The vcpu->pv_time cache is still valid, because the
2658          * version field is the first in the struct.
2659          */
2660         BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
2661 
2662         if (guest_hv_clock.version & 1)
2663                 ++guest_hv_clock.version;  /* first time write, random junk */
2664 
2665         vcpu->hv_clock.version = guest_hv_clock.version + 1;
2666         kvm_write_guest_offset_cached(v->kvm, cache,
2667                                       &vcpu->hv_clock, offset,
2668                                       sizeof(vcpu->hv_clock.version));
2669 
2670         smp_wmb();
2671 
2672         /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
2673         vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
2674 
2675         if (vcpu->pvclock_set_guest_stopped_request) {
2676                 vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
2677                 vcpu->pvclock_set_guest_stopped_request = false;
2678         }
2679 
2680         trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
2681 
2682         kvm_write_guest_offset_cached(v->kvm, cache,
2683                                       &vcpu->hv_clock, offset,
2684                                       sizeof(vcpu->hv_clock));
2685 
2686         smp_wmb();
2687 
2688         vcpu->hv_clock.version++;
2689         kvm_write_guest_offset_cached(v->kvm, cache,
2690                                      &vcpu->hv_clock, offset,
2691                                      sizeof(vcpu->hv_clock.version));
2692 }
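/*
 * Illustrative sketch, not part of the original file, of the guest-side
 * reader that the ordered version writes above protect (the standard
 * pvclock retry loop, with ti pointing at the shared time info):
 *
 *	do {
 *		version = READ_ONCE(ti->version);
 *		rmb();
 *		// snapshot tsc_timestamp, system_time, mult, shift ...
 *		rmb();
 *	} while ((version & 1) || version != READ_ONCE(ti->version));
 *
 * An odd version means an update is in flight, so the guest retries.
 */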
2693 
2694 static int kvm_guest_time_update(struct kvm_vcpu *v)
2695 {
2696         unsigned long flags, tgt_tsc_khz;
2697         struct kvm_vcpu_arch *vcpu = &v->arch;
2698         struct kvm_arch *ka = &v->kvm->arch;
2699         s64 kernel_ns;
2700         u64 tsc_timestamp, host_tsc;
2701         u8 pvclock_flags;
2702         bool use_master_clock;
2703 
2704         kernel_ns = 0;
2705         host_tsc = 0;
2706 
2707         /*
2708          * If the host uses TSC clock, then passthrough TSC as stable
2709          * to the guest.
2710          */
2711         spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
2712         use_master_clock = ka->use_master_clock;
2713         if (use_master_clock) {
2714                 host_tsc = ka->master_cycle_now;
2715                 kernel_ns = ka->master_kernel_ns;
2716         }
2717         spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
2718 
2719         /* Keep irq disabled to prevent changes to the clock */
2720         local_irq_save(flags);
2721         tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
2722         if (unlikely(tgt_tsc_khz == 0)) {
2723                 local_irq_restore(flags);
2724                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2725                 return 1;
2726         }
2727         if (!use_master_clock) {
2728                 host_tsc = rdtsc();
2729                 kernel_ns = get_kvmclock_base_ns();
2730         }
2731 
2732         tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
2733 
2734         /*
2735          * We may have to catch up the TSC to match elapsed wall clock
2736          * time for two reasons, even if kvmclock is used.
2737          *   1) CPU could have been running below the maximum TSC rate
2738          *   2) Broken TSC compensation resets the base at each VCPU
2739          *      entry to avoid unknown leaps of TSC even when running
2740          *      again on the same CPU.  This may cause apparent elapsed
2741          *      time to disappear, and the guest to stand still or run
2742          *      very slowly.
2743          */
2744         if (vcpu->tsc_catchup) {
2745                 u64 tsc = compute_guest_tsc(v, kernel_ns);
2746                 if (tsc > tsc_timestamp) {
2747                         adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
2748                         tsc_timestamp = tsc;
2749                 }
2750         }
2751 
2752         local_irq_restore(flags);
2753 
2754         /* With all the info we got, fill in the values */
2755 
2756         if (kvm_has_tsc_control)
2757                 tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
2758 
2759         if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
2760                 kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
2761                                    &vcpu->hv_clock.tsc_shift,
2762                                    &vcpu->hv_clock.tsc_to_system_mul);
2763                 vcpu->hw_tsc_khz = tgt_tsc_khz;
2764         }
2765 
2766         vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
2767         vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
2768         vcpu->last_guest_tsc = tsc_timestamp;
2769 
2770         /* If the host uses TSC clocksource, then it is stable */
2771         pvclock_flags = 0;
2772         if (use_master_clock)
2773                 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
2774 
2775         vcpu->hv_clock.flags = pvclock_flags;
2776 
2777         if (vcpu->pv_time_enabled)
2778                 kvm_setup_pvclock_page(v, &vcpu->pv_time, 0);
2779         if (vcpu->xen.vcpu_info_set)
2780                 kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache,
2781                                        offsetof(struct compat_vcpu_info, time));
2782         if (vcpu->xen.vcpu_time_info_set)
2783                 kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0);
2784         if (v == kvm_get_vcpu(v->kvm, 0))
2785                 kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
2786         return 0;
2787 }
2788 
2789 /*
2790  * kvmclock updates which are isolated to a given vcpu, such as
2791  * vcpu->cpu migration, should not allow system_timestamp from
2792  * the rest of the vcpus to remain static. Otherwise ntp frequency
2793  * correction applies to one vcpu's system_timestamp but not
2794  * the others.
2795  *
2796  * So in those cases, request a kvmclock update for all vcpus.
2797  * We need to rate-limit these requests though, as they can
2798  * considerably slow guests that have a large number of vcpus.
2799  * The time for a remote vcpu to update its kvmclock is bound
2800  * by the delay we use to rate-limit the updates.
2801  */
2802 
2803 #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
2804 
2805 static void kvmclock_update_fn(struct work_struct *work)
2806 {
2807         int i;
2808         struct delayed_work *dwork = to_delayed_work(work);
2809         struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2810                                            kvmclock_update_work);
2811         struct kvm *kvm = container_of(ka, struct kvm, arch);
2812         struct kvm_vcpu *vcpu;
2813 
2814         kvm_for_each_vcpu(i, vcpu, kvm) {
2815                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2816                 kvm_vcpu_kick(vcpu);
2817         }
2818 }
2819 
2820 static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
2821 {
2822         struct kvm *kvm = v->kvm;
2823 
2824         kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2825         schedule_delayed_work(&kvm->arch.kvmclock_update_work,
2826                                         KVMCLOCK_UPDATE_DELAY);
2827 }
2828 
2829 #define KVMCLOCK_SYNC_PERIOD (300 * HZ)
2830 
2831 static void kvmclock_sync_fn(struct work_struct *work)
2832 {
2833         struct delayed_work *dwork = to_delayed_work(work);
2834         struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2835                                            kvmclock_sync_work);
2836         struct kvm *kvm = container_of(ka, struct kvm, arch);
2837 
2838         if (!kvmclock_periodic_sync)
2839                 return;
2840 
2841         schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
2842         schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
2843                                         KVMCLOCK_SYNC_PERIOD);
2844 }
2845 
2846 /*
2847  * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
2848  */
2849 static bool can_set_mci_status(struct kvm_vcpu *vcpu)
2850 {
2851         /* McStatusWrEn enabled? */
2852         if (guest_cpuid_is_amd_or_hygon(vcpu))
2853                 return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
2854 
2855         return false;
2856 }
2857 
2858 static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2859 {
2860         u64 mcg_cap = vcpu->arch.mcg_cap;
2861         unsigned bank_num = mcg_cap & 0xff;
2862         u32 msr = msr_info->index;
2863         u64 data = msr_info->data;
2864 
2865         switch (msr) {
2866         case MSR_IA32_MCG_STATUS:
2867                 vcpu->arch.mcg_status = data;
2868                 break;
2869         case MSR_IA32_MCG_CTL:
2870                 if (!(mcg_cap & MCG_CTL_P) &&
2871                     (data || !msr_info->host_initiated))
2872                         return 1;
2873                 if (data != 0 && data != ~(u64)0)
2874                         return 1;
2875                 vcpu->arch.mcg_ctl = data;
2876                 break;
2877         default:
2878                 if (msr >= MSR_IA32_MC0_CTL &&
2879                     msr < MSR_IA32_MCx_CTL(bank_num)) {
2880                         u32 offset = array_index_nospec(
2881                                 msr - MSR_IA32_MC0_CTL,
2882                                 MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
2883 
2884                         /* Only 0 or all 1s can be written to IA32_MCi_CTL.
2885                          * Some Linux kernels though clear bit 10 in bank 4 to
2886                          * work around a BIOS/GART TBL issue on AMD K8s; ignore
2887                          * this to avoid an uncaught #GP in the guest.
2888                          */
2889                         if ((offset & 0x3) == 0 &&
2890                             data != 0 && (data | (1 << 10)) != ~(u64)0)
2891                                 return -1;
2892 
2893                         /* MCi_STATUS */
2894                         if (!msr_info->host_initiated &&
2895                             (offset & 0x3) == 1 && data != 0) {
2896                                 if (!can_set_mci_status(vcpu))
2897                                         return -1;
2898                         }
2899 
2900                         vcpu->arch.mce_banks[offset] = data;
2901                         break;
2902                 }
2903                 return 1;
2904         }
2905         return 0;
2906 }
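
/*
 * Background note on the bank indexing above (architectural layout, not new
 * logic): each MCA bank exposes four consecutive MSRs starting at
 * MSR_IA32_MC0_CTL (0x400), in the order CTL, STATUS, ADDR, MISC.  The flat
 * offset computed from (msr - MSR_IA32_MC0_CTL) therefore encodes both
 * pieces of information:
 *
 *	bank        = offset >> 2;	// which bank
 *	reg_in_bank = offset & 0x3;	// 0=CTL, 1=STATUS, 2=ADDR, 3=MISC
 *
 * which is why vcpu->arch.mce_banks[] is laid out as four u64s per bank and
 * why (offset & 0x3) == 0 and == 1 identify MCi_CTL and MCi_STATUS writes.
 */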
2907 
2908 static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
2909 {
2910         u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
2911 
2912         return (vcpu->arch.apf.msr_en_val & mask) == mask;
2913 }
2914 
2915 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
2916 {
2917         gpa_t gpa = data & ~0x3f;
2918 
2919         /* Bits 4:5 are reserved and should be zero */
2920         if (data & 0x30)
2921                 return 1;
2922 
2923         if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
2924             (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
2925                 return 1;
2926 
2927         if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
2928             (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
2929                 return 1;
2930 
2931         if (!lapic_in_kernel(vcpu))
2932                 return data ? 1 : 0;
2933 
2934         vcpu->arch.apf.msr_en_val = data;
2935 
2936         if (!kvm_pv_async_pf_enabled(vcpu)) {
2937                 kvm_clear_async_pf_completion_queue(vcpu);
2938                 kvm_async_pf_hash_reset(vcpu);
2939                 return 0;
2940         }
2941 
2942         if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
2943                                         sizeof(u64)))
2944                 return 1;
2945 
2946         vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
2947         vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
2948 
2949         kvm_async_pf_wakeup_all(vcpu);
2950 
2951         return 0;
2952 }
2953 
2954 static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
2955 {
2956         /* Bits 8-63 are reserved */
2957         if (data >> 8)
2958                 return 1;
2959 
2960         if (!lapic_in_kernel(vcpu))
2961                 return 1;
2962 
2963         vcpu->arch.apf.msr_int_val = data;
2964 
2965         vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
2966 
2967         return 0;
2968 }
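
/*
 * For context, a minimal sketch of how a guest is expected to program the
 * two MSRs validated above, roughly mirroring the Linux guest code in
 * arch/x86/kernel/kvm.c (apf_reason, slow_virt_to_phys() and
 * HYPERVISOR_CALLBACK_VECTOR are guest-side names, shown for illustration
 * only):
 *
 *	wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
 *	wrmsrl(MSR_KVM_ASYNC_PF_EN, slow_virt_to_phys(&apf_reason) |
 *	       KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT);
 *
 * Bits 0-3 of MSR_KVM_ASYNC_PF_EN are the flag bits used above, bits 4-5 are
 * reserved, and the remaining bits carry the 64-byte aligned GPA of the
 * per-vCPU data area (hence "data & ~0x3f" in kvm_pv_enable_async_pf()).
 */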
2969 
2970 static void kvmclock_reset(struct kvm_vcpu *vcpu)
2971 {
2972         vcpu->arch.pv_time_enabled = false;
2973         vcpu->arch.time = 0;
2974 }
2975 
2976 static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
2977 {
2978         ++vcpu->stat.tlb_flush;
2979         static_call(kvm_x86_tlb_flush_all)(vcpu);
2980 }
2981 
2982 static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
2983 {
2984         ++vcpu->stat.tlb_flush;
2985 
2986         if (!tdp_enabled) {
2987                 /*
2988                  * A TLB flush on behalf of the guest is equivalent to
2989                  * INVPCID(all), toggling CR4.PGE, etc., which requires
2990                  * a forced sync of the shadow page tables.  Unload the
2991                  * entire MMU here and the subsequent load will sync the
2992                  * shadow page tables, and also flush the TLB.
2993                  */
2994                 kvm_mmu_unload(vcpu);
2995                 return;
2996         }
2997 
2998         static_call(kvm_x86_tlb_flush_guest)(vcpu);
2999 }
3000 
3001 static void record_steal_time(struct kvm_vcpu *vcpu)
3002 {
3003         struct kvm_host_map map;
3004         struct kvm_steal_time *st;
3005 
3006         if (kvm_xen_msr_enabled(vcpu->kvm)) {
3007                 kvm_xen_runstate_set_running(vcpu);
3008                 return;
3009         }
3010 
3011         if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
3012                 return;
3013 
3014         /* -EAGAIN is returned in atomic context so we can just return. */
3015         if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
3016                         &map, &vcpu->arch.st.cache, false))
3017                 return;
3018 
3019         st = map.hva +
3020                 offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
3021 
3022         /*
3023          * Doing a TLB flush here, on the guest's behalf, can avoid
3024          * expensive IPIs.
3025          */
3026         if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
3027                 trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
3028                                        st->preempted & KVM_VCPU_FLUSH_TLB);
3029                 if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
3030                         kvm_vcpu_flush_tlb_guest(vcpu);
3031         } else {
3032                 st->preempted = 0;
3033         }
3034 
3035         vcpu->arch.st.preempted = 0;
3036 
3037         if (st->version & 1)
3038                 st->version += 1;  /* first time write, random junk */
3039 
3040         st->version += 1;
3041 
3042         smp_wmb();
3043 
3044         st->steal += current->sched_info.run_delay -
3045                 vcpu->arch.st.last_steal;
3046         vcpu->arch.st.last_steal = current->sched_info.run_delay;
3047 
3048         smp_wmb();
3049 
3050         st->version += 1;
3051 
3052         kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
3053 }
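
/*
 * The version bumps around the update above form a simple seqcount: the
 * value is odd while KVM is writing and even otherwise.  A guest reader is
 * expected to retry until it observes a stable, even version, along the
 * lines of the guest-side counterpart (cf. kvm_steal_clock() in
 * arch/x86/kernel/kvm.c; sketch only):
 *
 *	do {
 *		version = st->version;
 *		virt_rmb();
 *		steal = st->steal;
 *		virt_rmb();
 *	} while ((version & 1) || version != st->version);
 */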
3054 
3055 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3056 {
3057         bool pr = false;
3058         u32 msr = msr_info->index;
3059         u64 data = msr_info->data;
3060 
3061         if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr)
3062                 return kvm_xen_write_hypercall_page(vcpu, data);
3063 
3064         switch (msr) {
3065         case MSR_AMD64_NB_CFG:
3066         case MSR_IA32_UCODE_WRITE:
3067         case MSR_VM_HSAVE_PA:
3068         case MSR_AMD64_PATCH_LOADER:
3069         case MSR_AMD64_BU_CFG2:
3070         case MSR_AMD64_DC_CFG:
3071         case MSR_F15H_EX_CFG:
3072                 break;
3073 
3074         case MSR_IA32_UCODE_REV:
3075                 if (msr_info->host_initiated)
3076                         vcpu->arch.microcode_version = data;
3077                 break;
3078         case MSR_IA32_ARCH_CAPABILITIES:
3079                 if (!msr_info->host_initiated)
3080                         return 1;
3081                 vcpu->arch.arch_capabilities = data;
3082                 break;
3083         case MSR_IA32_PERF_CAPABILITIES: {
3084                 struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
3085 
3086                 if (!msr_info->host_initiated)
3087                         return 1;
3088                 if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent))
3089                         return 1;
3090                 if (data & ~msr_ent.data)
3091                         return 1;
3092 
3093                 vcpu->arch.perf_capabilities = data;
3094 
3095                 return 0;
3096                 }
3097         case MSR_EFER:
3098                 return set_efer(vcpu, msr_info);
3099         case MSR_K7_HWCR:
3100                 data &= ~(u64)0x40;     /* ignore flush filter disable */
3101                 data &= ~(u64)0x100;    /* ignore ignne emulation enable */
3102                 data &= ~(u64)0x8;      /* ignore TLB cache disable */
3103 
3104                 /* Handle McStatusWrEn */
3105                 if (data == BIT_ULL(18)) {
3106                         vcpu->arch.msr_hwcr = data;
3107                 } else if (data != 0) {
3108                         vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
3109                                     data);
3110                         return 1;
3111                 }
3112                 break;
3113         case MSR_FAM10H_MMIO_CONF_BASE:
3114                 if (data != 0) {
3115                         vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
3116                                     "0x%llx\n", data);
3117                         return 1;
3118                 }
3119                 break;
3120         case 0x200 ... 0x2ff:
3121                 return kvm_mtrr_set_msr(vcpu, msr, data);
3122         case MSR_IA32_APICBASE:
3123                 return kvm_set_apic_base(vcpu, msr_info);
3124         case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
3125                 return kvm_x2apic_msr_write(vcpu, msr, data);
3126         case MSR_IA32_TSCDEADLINE:
3127                 kvm_set_lapic_tscdeadline_msr(vcpu, data);
3128                 break;
3129         case MSR_IA32_TSC_ADJUST:
3130                 if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
3131                         if (!msr_info->host_initiated) {
3132                                 s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
3133                                 adjust_tsc_offset_guest(vcpu, adj);
3134                         }
3135                         vcpu->arch.ia32_tsc_adjust_msr = data;
3136                 }
3137                 break;
3138         case MSR_IA32_MISC_ENABLE:
3139                 if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
3140                     ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
3141                         if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
3142                                 return 1;
3143                         vcpu->arch.ia32_misc_enable_msr = data;
3144                         kvm_update_cpuid_runtime(vcpu);
3145                 } else {
3146                         vcpu->arch.ia32_misc_enable_msr = data;
3147                 }
3148                 break;
3149         case MSR_IA32_SMBASE:
3150                 if (!msr_info->host_initiated)
3151                         return 1;
3152                 vcpu->arch.smbase = data;
3153                 break;
3154         case MSR_IA32_POWER_CTL:
3155                 vcpu->arch.msr_ia32_power_ctl = data;
3156                 break;
3157         case MSR_IA32_TSC:
3158                 if (msr_info->host_initiated) {
3159                         kvm_synchronize_tsc(vcpu, data);
3160                 } else {
3161                         u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
3162                         adjust_tsc_offset_guest(vcpu, adj);
3163                         vcpu->arch.ia32_tsc_adjust_msr += adj;
3164                 }
3165                 break;
3166         case MSR_IA32_XSS:
3167                 if (!msr_info->host_initiated &&
3168                     !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3169                         return 1;
3170                 /*
3171                  * KVM supports exposing PT to the guest, but does not support
3172                  * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
3173                  * XSAVES/XRSTORS to save/restore PT MSRs.
3174                  */
3175                 if (data & ~supported_xss)
3176                         return 1;
3177                 vcpu->arch.ia32_xss = data;
3178                 break;
3179         case MSR_SMI_COUNT:
3180                 if (!msr_info->host_initiated)
3181                         return 1;
3182                 vcpu->arch.smi_count = data;
3183                 break;
3184         case MSR_KVM_WALL_CLOCK_NEW:
3185                 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3186                         return 1;
3187 
3188                 vcpu->kvm->arch.wall_clock = data;
3189                 kvm_write_wall_clock(vcpu->kvm, data, 0);
3190                 break;
3191         case MSR_KVM_WALL_CLOCK:
3192                 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3193                         return 1;
3194 
3195                 vcpu->kvm->arch.wall_clock = data;
3196                 kvm_write_wall_clock(vcpu->kvm, data, 0);
3197                 break;
3198         case MSR_KVM_SYSTEM_TIME_NEW:
3199                 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3200                         return 1;
3201 
3202                 kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
3203                 break;
3204         case MSR_KVM_SYSTEM_TIME:
3205                 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3206                         return 1;
3207 
3208                 kvm_write_system_time(vcpu, data, true,  msr_info->host_initiated);
3209                 break;
3210         case MSR_KVM_ASYNC_PF_EN:
3211                 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3212                         return 1;
3213 
3214                 if (kvm_pv_enable_async_pf(vcpu, data))
3215                         return 1;
3216                 break;
3217         case MSR_KVM_ASYNC_PF_INT:
3218                 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3219                         return 1;
3220 
3221                 if (kvm_pv_enable_async_pf_int(vcpu, data))
3222                         return 1;
3223                 break;
3224         case MSR_KVM_ASYNC_PF_ACK:
3225                 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3226                         return 1;
3227                 if (data & 0x1) {
3228                         vcpu->arch.apf.pageready_pending = false;
3229                         kvm_check_async_pf_completion(vcpu);
3230                 }
3231                 break;
3232         case MSR_KVM_STEAL_TIME:
3233                 if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3234                         return 1;
3235 
3236                 if (unlikely(!sched_info_on()))
3237                         return 1;
3238 
3239                 if (data & KVM_STEAL_RESERVED_MASK)
3240                         return 1;
3241 
3242                 vcpu->arch.st.msr_val = data;
3243 
3244                 if (!(data & KVM_MSR_ENABLED))
3245                         break;
3246 
3247                 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
3248 
3249                 break;
3250         case MSR_KVM_PV_EOI_EN:
3251                 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3252                         return 1;
3253 
3254                 if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
3255                         return 1;
3256                 break;
3257 
3258         case MSR_KVM_POLL_CONTROL:
3259                 if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3260                         return 1;
3261 
3262                 /* only enable bit supported */
3263                 if (data & (-1ULL << 1))
3264                         return 1;
3265 
3266                 vcpu->arch.msr_kvm_poll_control = data;
3267                 break;
3268 
3269         case MSR_IA32_MCG_CTL:
3270         case MSR_IA32_MCG_STATUS:
3271         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3272                 return set_msr_mce(vcpu, msr_info);
3273 
3274         case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
3275         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
3276                 pr = true;
3277                 fallthrough;
3278         case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
3279         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
3280                 if (kvm_pmu_is_valid_msr(vcpu, msr))
3281                         return kvm_pmu_set_msr(vcpu, msr_info);
3282 
3283                 if (pr || data != 0)
3284                         vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
3285                                     "0x%x data 0x%llx\n", msr, data);
3286                 break;
3287         case MSR_K7_CLK_CTL:
3288                 /*
3289                  * Ignore all writes to this no-longer-documented MSR.
3290                  * Writes are only relevant for old K7 processors, all
3291                  * pre-dating SVM, but are a recommended workaround from
3292                  * AMD for those chips. It is possible to specify the
3293                  * affected processor models on the command line, hence
3294                  * the need to ignore the workaround.
3295                  */
3296                 break;
3297         case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3298         case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3299         case HV_X64_MSR_SYNDBG_OPTIONS:
3300         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
3301         case HV_X64_MSR_CRASH_CTL:
3302         case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
3303         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
3304         case HV_X64_MSR_TSC_EMULATION_CONTROL:
3305         case HV_X64_MSR_TSC_EMULATION_STATUS:
3306                 return kvm_hv_set_msr_common(vcpu, msr, data,
3307                                              msr_info->host_initiated);
3308         case MSR_IA32_BBL_CR_CTL3:
3309                 /* Drop writes to this legacy MSR -- see rdmsr
3310                  * counterpart for further detail.
3311                  */
3312                 if (report_ignored_msrs)
3313                         vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
3314                                 msr, data);
3315                 break;
3316         case MSR_AMD64_OSVW_ID_LENGTH:
3317                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3318                         return 1;
3319                 vcpu->arch.osvw.length = data;
3320                 break;
3321         case MSR_AMD64_OSVW_STATUS:
3322                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3323                         return 1;
3324                 vcpu->arch.osvw.status = data;
3325                 break;
3326         case MSR_PLATFORM_INFO:
3327                 if (!msr_info->host_initiated ||
3328                     (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
3329                      cpuid_fault_enabled(vcpu)))
3330                         return 1;
3331                 vcpu->arch.msr_platform_info = data;
3332                 break;
3333         case MSR_MISC_FEATURES_ENABLES:
3334                 if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
3335                     (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
3336                      !supports_cpuid_fault(vcpu)))
3337                         return 1;
3338                 vcpu->arch.msr_misc_features_enables = data;
3339                 break;
3340         default:
3341                 if (kvm_pmu_is_valid_msr(vcpu, msr))
3342                         return kvm_pmu_set_msr(vcpu, msr_info);
3343                 return KVM_MSR_RET_INVALID;
3344         }
3345         return 0;
3346 }
3347 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
3348 
3349 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
3350 {
3351         u64 data;
3352         u64 mcg_cap = vcpu->arch.mcg_cap;
3353         unsigned bank_num = mcg_cap & 0xff;
3354 
3355         switch (msr) {
3356         case MSR_IA32_P5_MC_ADDR:
3357         case MSR_IA32_P5_MC_TYPE:
3358                 data = 0;
3359                 break;
3360         case MSR_IA32_MCG_CAP:
3361                 data = vcpu->arch.mcg_cap;
3362                 break;
3363         case MSR_IA32_MCG_CTL:
3364                 if (!(mcg_cap & MCG_CTL_P) && !host)
3365                         return 1;
3366                 data = vcpu->arch.mcg_ctl;
3367                 break;
3368         case MSR_IA32_MCG_STATUS:
3369                 data = vcpu->arch.mcg_status;
3370                 break;
3371         default:
3372                 if (msr >= MSR_IA32_MC0_CTL &&
3373                     msr < MSR_IA32_MCx_CTL(bank_num)) {
3374                         u32 offset = array_index_nospec(
3375                                 msr - MSR_IA32_MC0_CTL,
3376                                 MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
3377 
3378                         data = vcpu->arch.mce_banks[offset];
3379                         break;
3380                 }
3381                 return 1;
3382         }
3383         *pdata = data;
3384         return 0;
3385 }
3386 
3387 int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3388 {
3389         switch (msr_info->index) {
3390         case MSR_IA32_PLATFORM_ID:
3391         case MSR_IA32_EBL_CR_POWERON:
3392         case MSR_IA32_LASTBRANCHFROMIP:
3393         case MSR_IA32_LASTBRANCHTOIP:
3394         case MSR_IA32_LASTINTFROMIP:
3395         case MSR_IA32_LASTINTTOIP:
3396         case MSR_K8_SYSCFG:
3397         case MSR_K8_TSEG_ADDR:
3398         case MSR_K8_TSEG_MASK:
3399         case MSR_VM_HSAVE_PA:
3400         case MSR_K8_INT_PENDING_MSG:
3401         case MSR_AMD64_NB_CFG:
3402         case MSR_FAM10H_MMIO_CONF_BASE:
3403         case MSR_AMD64_BU_CFG2:
3404         case MSR_IA32_PERF_CTL:
3405         case MSR_AMD64_DC_CFG:
3406         case MSR_F15H_EX_CFG:
3407         /*
3408          * Intel Sandy Bridge CPUs must support the RAPL (running average power
3409          * limit) MSRs. Just return 0, as we do not want to expose the host
3410          * data here. Do not conditionalize this on CPUID, as KVM does not do
3411          * so for existing CPU-specific MSRs.
3412          */
3413         case MSR_RAPL_POWER_UNIT:
3414         case MSR_PP0_ENERGY_STATUS:     /* Power plane 0 (core) */
3415         case MSR_PP1_ENERGY_STATUS:     /* Power plane 1 (graphics uncore) */
3416         case MSR_PKG_ENERGY_STATUS:     /* Total package */
3417         case MSR_DRAM_ENERGY_STATUS:    /* DRAM controller */
3418                 msr_info->data = 0;
3419                 break;
3420         case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
3421         case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
3422         case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
3423         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
3424         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
3425                 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
3426                         return kvm_pmu_get_msr(vcpu, msr_info);
3427                 msr_info->data = 0;
3428                 break;
3429         case MSR_IA32_UCODE_REV:
3430                 msr_info->data = vcpu->arch.microcode_version;
3431                 break;
3432         case MSR_IA32_ARCH_CAPABILITIES:
3433                 if (!msr_info->host_initiated &&
3434                     !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
3435                         return 1;
3436                 msr_info->data = vcpu->arch.arch_capabilities;
3437                 break;
3438         case MSR_IA32_PERF_CAPABILITIES:
3439                 if (!msr_info->host_initiated &&
3440                     !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
3441                         return 1;
3442                 msr_info->data = vcpu->arch.perf_capabilities;
3443                 break;
3444         case MSR_IA32_POWER_CTL:
3445                 msr_info->data = vcpu->arch.msr_ia32_power_ctl;
3446                 break;
3447         case MSR_IA32_TSC: {
3448                 /*
3449                  * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
3450                  * even when not intercepted. AMD manual doesn't explicitly
3451                  * state this but appears to behave the same.
3452                  *
3453                  * On userspace reads and writes, however, we unconditionally
3454                  * return L1's TSC value to ensure backwards-compatible
3455                  * behavior for migration.
3456                  */
3457                 u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
3458                                                             vcpu->arch.tsc_offset;
3459 
3460                 msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
3461                 break;
3462         }
3463         case MSR_MTRRcap:
3464         case 0x200 ... 0x2ff:
3465                 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
3466         case 0xcd: /* fsb frequency */
3467                 msr_info->data = 3;
3468                 break;
3469                 /*
3470                  * MSR_EBC_FREQUENCY_ID
3471                  * Conservative value valid for even the basic CPU models.
3472                  * Models 0,1: 000 in bits 23:21 indicating a bus speed of
3473                  * 100MHz, model 2 has 000 in bits 18:16 indicating 100MHz,
3474                  * and 266MHz for models 3 or 4. Set the "Core Clock
3475                  * Frequency to System Bus Frequency Ratio" field (bits
3476                  * 31:24) to 1, even though it is only valid for CPU
3477                  * models > 2; otherwise guests may end up dividing or
3478                  * multiplying by zero.
3479                  */
3480         case MSR_EBC_FREQUENCY_ID:
3481                 msr_info->data = 1 << 24;
3482                 break;
3483         case MSR_IA32_APICBASE:
3484                 msr_info->data = kvm_get_apic_base(vcpu);
3485                 break;
3486         case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
3487                 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
3488         case MSR_IA32_TSCDEADLINE:
3489                 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
3490                 break;
3491         case MSR_IA32_TSC_ADJUST:
3492                 msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
3493                 break;
3494         case MSR_IA32_MISC_ENABLE:
3495                 msr_info->data = vcpu->arch.ia32_misc_enable_msr;
3496                 break;
3497         case MSR_IA32_SMBASE:
3498                 if (!msr_info->host_initiated)
3499                         return 1;
3500                 msr_info->data = vcpu->arch.smbase;
3501                 break;
3502         case MSR_SMI_COUNT:
3503                 msr_info->data = vcpu->arch.smi_count;
3504                 break;
3505         case MSR_IA32_PERF_STATUS:
3506                 /* TSC increment by tick */
3507                 msr_info->data = 1000ULL;
3508                 /* CPU multiplier */
3509                 msr_info->data |= (((uint64_t)4ULL) << 40);
3510                 break;
3511         case MSR_EFER:
3512                 msr_info->data = vcpu->arch.efer;
3513                 break;
3514         case MSR_KVM_WALL_CLOCK:
3515                 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3516                         return 1;
3517 
3518                 msr_info->data = vcpu->kvm->arch.wall_clock;
3519                 break;
3520         case MSR_KVM_WALL_CLOCK_NEW:
3521                 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3522                         return 1;
3523 
3524                 msr_info->data = vcpu->kvm->arch.wall_clock;
3525                 break;
3526         case MSR_KVM_SYSTEM_TIME:
3527                 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3528                         return 1;
3529 
3530                 msr_info->data = vcpu->arch.time;
3531                 break;
3532         case MSR_KVM_SYSTEM_TIME_NEW:
3533                 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3534                         return 1;
3535 
3536                 msr_info->data = vcpu->arch.time;
3537                 break;
3538         case MSR_KVM_ASYNC_PF_EN:
3539                 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3540                         return 1;
3541 
3542                 msr_info->data = vcpu->arch.apf.msr_en_val;
3543                 break;
3544         case MSR_KVM_ASYNC_PF_INT:
3545                 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3546                         return 1;
3547 
3548                 msr_info->data = vcpu->arch.apf.msr_int_val;
3549                 break;
3550         case MSR_KVM_ASYNC_PF_ACK:
3551                 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3552                         return 1;
3553 
3554                 msr_info->data = 0;
3555                 break;
3556         case MSR_KVM_STEAL_TIME:
3557                 if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3558                         return 1;
3559 
3560                 msr_info->data = vcpu->arch.st.msr_val;
3561                 break;
3562         case MSR_KVM_PV_EOI_EN:
3563                 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3564                         return 1;
3565 
3566                 msr_info->data = vcpu->arch.pv_eoi.msr_val;
3567                 break;
3568         case MSR_KVM_POLL_CONTROL:
3569                 if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3570                         return 1;
3571 
3572                 msr_info->data = vcpu->arch.msr_kvm_poll_control;
3573                 break;
3574         case MSR_IA32_P5_MC_ADDR:
3575         case MSR_IA32_P5_MC_TYPE:
3576         case MSR_IA32_MCG_CAP:
3577         case MSR_IA32_MCG_CTL:
3578         case MSR_IA32_MCG_STATUS:
3579         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3580                 return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
3581                                    msr_info->host_initiated);
3582         case MSR_IA32_XSS:
3583                 if (!msr_info->host_initiated &&
3584                     !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3585                         return 1;
3586                 msr_info->data = vcpu->arch.ia32_xss;
3587                 break;
3588         case MSR_K7_CLK_CTL:
3589                 /*
3590                  * Provide the expected ramp-up count for K7. All other
3591                  * fields are set to zero, indicating minimum divisors
3592                  * for every field.
3593                  *
3594                  * This prevents guest kernels on an AMD host with CPU
3595                  * type 6, model 8 and higher from exploding due to
3596                  * the rdmsr failing.
3597                  */
3598                 msr_info->data = 0x20000000;
3599                 break;
3600         case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3601         case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3602         case HV_X64_MSR_SYNDBG_OPTIONS:
3603         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
3604         case HV_X64_MSR_CRASH_CTL:
3605         case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
3606         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
3607         case HV_X64_MSR_TSC_EMULATION_CONTROL:
3608         case HV_X64_MSR_TSC_EMULATION_STATUS:
3609                 return kvm_hv_get_msr_common(vcpu,
3610                                              msr_info->index, &msr_info->data,
3611                                              msr_info->host_initiated);
3612         case MSR_IA32_BBL_CR_CTL3:
3613                 /* This legacy MSR exists but isn't fully documented in current
3614                  * silicon.  It is, however, accessed by Windows XP in very narrow
3615                  * scenarios where it sets bit #19, itself documented as
3616                  * a "reserved" bit.  Best-effort attempt to source coherent
3617                  * read data here should the balance of the register be
3618                  * interpreted by the guest:
3619                  *
3620                  * L2 cache control register 3: 64GB range, 256KB size,
3621                  * enabled, latency 0x1, configured
3622                  */
3623                 msr_info->data = 0xbe702111;
3624                 break;
3625         case MSR_AMD64_OSVW_ID_LENGTH:
3626                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3627                         return 1;
3628                 msr_info->data = vcpu->arch.osvw.length;
3629                 break;
3630         case MSR_AMD64_OSVW_STATUS:
3631                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3632                         return 1;
3633                 msr_info->data = vcpu->arch.osvw.status;
3634                 break;
3635         case MSR_PLATFORM_INFO:
3636                 if (!msr_info->host_initiated &&
3637                     !vcpu->kvm->arch.guest_can_read_msr_platform_info)
3638                         return 1;
3639                 msr_info->data = vcpu->arch.msr_platform_info;
3640                 break;
3641         case MSR_MISC_FEATURES_ENABLES:
3642                 msr_info->data = vcpu->arch.msr_misc_features_enables;
3643                 break;
3644         case MSR_K7_HWCR:
3645                 msr_info->data = vcpu->arch.msr_hwcr;
3646                 break;
3647         default:
3648                 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
3649                         return kvm_pmu_get_msr(vcpu, msr_info);
3650                 return KVM_MSR_RET_INVALID;
3651         }
3652         return 0;
3653 }
3654 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
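
/*
 * Broadly, the return convention shared by kvm_set_msr_common() and
 * kvm_get_msr_common() is: 0 means the access was handled, a non-zero return
 * makes the access fail (a #GP for guest-initiated accesses, an error for
 * host-initiated ones), and KVM_MSR_RET_INVALID specifically marks an MSR
 * that KVM does not recognize at all, which the callers may downgrade to
 * "ignored" when the ignore_msrs module parameter is set.
 */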
3655 
3656 /*
3657  * Read or write a bunch of msrs. All parameters are kernel addresses.
3658  *
3659  * @return number of msrs processed successfully.
3660  */
3661 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
3662                     struct kvm_msr_entry *entries,
3663                     int (*do_msr)(struct kvm_vcpu *vcpu,
3664                                   unsigned index, u64 *data))
3665 {
3666         int i;
3667 
3668         for (i = 0; i < msrs->nmsrs; ++i)
3669                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
3670                         break;
3671 
3672         return i;
3673 }
3674 
3675 /*
3676  * Read or write a bunch of msrs. Parameters are user addresses.
3677  *
3678  * @return number of msrs processed successfully.
3679  */
3680 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
3681                   int (*do_msr)(struct kvm_vcpu *vcpu,
3682                                 unsigned index, u64 *data),
3683                   int writeback)
3684 {
3685         struct kvm_msrs msrs;
3686         struct kvm_msr_entry *entries;
3687         int r, n;
3688         unsigned size;
3689 
3690         r = -EFAULT;
3691         if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
3692                 goto out;
3693 
3694         r = -E2BIG;
3695         if (msrs.nmsrs >= MAX_IO_MSRS)
3696                 goto out;
3697 
3698         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
3699         entries = memdup_user(user_msrs->entries, size);
3700         if (IS_ERR(entries)) {
3701                 r = PTR_ERR(entries);
3702                 goto out;
3703         }
3704 
3705         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
3706         if (r < 0)
3707                 goto out_free;
3708 
3709         r = -EFAULT;
3710         if (writeback && copy_to_user(user_msrs->entries, entries, size))
3711                 goto out_free;
3712 
3713         r = n;
3714 
3715 out_free:
3716         kfree(entries);
3717 out:
3718         return r;
3719 }
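
/*
 * msr_io() backs the KVM_GET_MSRS/KVM_SET_MSRS ioctls.  A minimal user-space
 * sketch of the read side, assuming "vcpu_fd" is an open vCPU file
 * descriptor (error handling omitted):
 *
 *	struct {
 *		struct kvm_msrs hdr;
 *		struct kvm_msr_entry entries[1];
 *	} msrs = {
 *		.hdr.nmsrs = 1,
 *		.entries[0] = { .index = 0x00000010 },	// MSR_IA32_TSC
 *	};
 *	int nread = ioctl(vcpu_fd, KVM_GET_MSRS, &msrs);
 *
 * The return value is the number of entries processed, matching the comments
 * above.
 */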
3720 
3721 static inline bool kvm_can_mwait_in_guest(void)
3722 {
3723         return boot_cpu_has(X86_FEATURE_MWAIT) &&
3724                 !boot_cpu_has_bug(X86_BUG_MONITOR) &&
3725                 boot_cpu_has(X86_FEATURE_ARAT);
3726 }
3727 
3728 static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
3729                                             struct kvm_cpuid2 __user *cpuid_arg)
3730 {
3731         struct kvm_cpuid2 cpuid;
3732         int r;
3733 
3734         r = -EFAULT;
3735         if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
3736                 return r;
3737 
3738         r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries);
3739         if (r)
3740                 return r;
3741 
3742         r = -EFAULT;
3743         if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
3744                 return r;
3745 
3746         return 0;
3747 }
3748 
3749 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
3750 {
3751         int r = 0;
3752 
3753         switch (ext) {
3754         case KVM_CAP_IRQCHIP:
3755         case KVM_CAP_HLT:
3756         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
3757         case KVM_CAP_SET_TSS_ADDR:
3758         case KVM_CAP_EXT_CPUID:
3759         case KVM_CAP_EXT_EMUL_CPUID:
3760         case KVM_CAP_CLOCKSOURCE:
3761         case KVM_CAP_PIT:
3762         case KVM_CAP_NOP_IO_DELAY:
3763         case KVM_CAP_MP_STATE:
3764         case KVM_CAP_SYNC_MMU:
3765         case KVM_CAP_USER_NMI:
3766         case KVM_CAP_REINJECT_CONTROL:
3767         case KVM_CAP_IRQ_INJECT_STATUS:
3768         case KVM_CAP_IOEVENTFD:
3769         case KVM_CAP_IOEVENTFD_NO_LENGTH:
3770         case KVM_CAP_PIT2:
3771         case KVM_CAP_PIT_STATE2:
3772         case KVM_CAP_SET_IDENTITY_MAP_ADDR:
3773         case KVM_CAP_VCPU_EVENTS:
3774         case KVM_CAP_HYPERV:
3775         case KVM_CAP_HYPERV_VAPIC:
3776         case KVM_CAP_HYPERV_SPIN:
3777         case KVM_CAP_HYPERV_SYNIC:
3778         case KVM_CAP_HYPERV_SYNIC2:
3779         case KVM_CAP_HYPERV_VP_INDEX:
3780         case KVM_CAP_HYPERV_EVENTFD:
3781         case KVM_CAP_HYPERV_TLBFLUSH:
3782         case KVM_CAP_HYPERV_SEND_IPI:
3783         case KVM_CAP_HYPERV_CPUID:
3784         case KVM_CAP_SYS_HYPERV_CPUID:
3785         case KVM_CAP_PCI_SEGMENT:
3786         case KVM_CAP_DEBUGREGS:
3787         case KVM_CAP_X86_ROBUST_SINGLESTEP:
3788         case KVM_CAP_XSAVE:
3789         case KVM_CAP_ASYNC_PF:
3790         case KVM_CAP_ASYNC_PF_INT:
3791         case KVM_CAP_GET_TSC_KHZ:
3792         case KVM_CAP_KVMCLOCK_CTRL:
3793         case KVM_CAP_READONLY_MEM:
3794         case KVM_CAP_HYPERV_TIME:
3795         case KVM_CAP_IOAPIC_POLARITY_IGNORED:
3796         case KVM_CAP_TSC_DEADLINE_TIMER:
3797         case KVM_CAP_DISABLE_QUIRKS:
3798         case KVM_CAP_SET_BOOT_CPU_ID:
3799         case KVM_CAP_SPLIT_IRQCHIP:
3800         case KVM_CAP_IMMEDIATE_EXIT:
3801         case KVM_CAP_PMU_EVENT_FILTER:
3802         case KVM_CAP_GET_MSR_FEATURES:
3803         case KVM_CAP_MSR_PLATFORM_INFO:
3804         case KVM_CAP_EXCEPTION_PAYLOAD:
3805         case KVM_CAP_SET_GUEST_DEBUG:
3806         case KVM_CAP_LAST_CPU:
3807         case KVM_CAP_X86_USER_SPACE_MSR:
3808         case KVM_CAP_X86_MSR_FILTER:
3809         case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
3810                 r = 1;
3811                 break;
3812 #ifdef CONFIG_KVM_XEN
3813         case KVM_CAP_XEN_HVM:
3814                 r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
3815                     KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
3816                     KVM_XEN_HVM_CONFIG_SHARED_INFO;
3817                 if (sched_info_on())
3818                         r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
3819                 break;
3820 #endif
3821         case KVM_CAP_SYNC_REGS:
3822                 r = KVM_SYNC_X86_VALID_FIELDS;
3823                 break;
3824         case KVM_CAP_ADJUST_CLOCK:
3825                 r = KVM_CLOCK_TSC_STABLE;
3826                 break;
3827         case KVM_CAP_X86_DISABLE_EXITS:
3828                 r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
3829                      KVM_X86_DISABLE_EXITS_CSTATE;
3830                 if (kvm_can_mwait_in_guest())
3831                         r |= KVM_X86_DISABLE_EXITS_MWAIT;
3832                 break;
3833         case KVM_CAP_X86_SMM:
3834                 /* SMBASE is usually relocated above 1M on modern chipsets,
3835                  * and SMM handlers might indeed rely on 4G segment limits,
3836                  * so do not report SMM to be available if real mode is
3837                  * emulated via vm86 mode.  Still, do not go to great lengths
3838                  * to avoid userspace's usage of the feature, because it is a
3839                  * fringe case that is not enabled except via specific settings
3840                  * of the module parameters.
3841                  */
3842                 r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
3843                 break;
3844         case KVM_CAP_VAPIC:
3845                 r = !static_call(kvm_x86_cpu_has_accelerated_tpr)();
3846                 break;
3847         case KVM_CAP_NR_VCPUS:
3848                 r = KVM_SOFT_MAX_VCPUS;
3849                 break;
3850         case KVM_CAP_MAX_VCPUS:
3851                 r = KVM_MAX_VCPUS;
3852                 break;
3853         case KVM_CAP_MAX_VCPU_ID:
3854                 r = KVM_MAX_VCPU_ID;
3855                 break;
3856         case KVM_CAP_PV_MMU:    /* obsolete */
3857                 r = 0;
3858                 break;
3859         case KVM_CAP_MCE:
3860                 r = KVM_MAX_MCE_BANKS;
3861                 break;
3862         case KVM_CAP_XCRS:
3863                 r = boot_cpu_has(X86_FEATURE_XSAVE);
3864                 break;
3865         case KVM_CAP_TSC_CONTROL:
3866                 r = kvm_has_tsc_control;
3867                 break;
3868         case KVM_CAP_X2APIC_API:
3869                 r = KVM_X2APIC_API_VALID_FLAGS;
3870                 break;
3871         case KVM_CAP_NESTED_STATE:
3872                 r = kvm_x86_ops.nested_ops->get_state ?
3873                         kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
3874                 break;
3875         case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
3876                 r = kvm_x86_ops.enable_direct_tlbflush != NULL;
3877                 break;
3878         case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
3879                 r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
3880                 break;
3881         case KVM_CAP_SMALLER_MAXPHYADDR:
3882                 r = (int) allow_smaller_maxphyaddr;
3883                 break;
3884         case KVM_CAP_STEAL_TIME:
3885                 r = sched_info_on();
3886                 break;
3887         case KVM_CAP_X86_BUS_LOCK_EXIT:
3888                 if (kvm_has_bus_lock_exit)
3889                         r = KVM_BUS_LOCK_DETECTION_OFF |
3890                             KVM_BUS_LOCK_DETECTION_EXIT;
3891                 else
3892                         r = 0;
3893                 break;
3894         default:
3895                 break;
3896         }
3897         return r;
3898 
3899 }
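
/*
 * kvm_vm_ioctl_check_extension() is reached via KVM_CHECK_EXTENSION.  A
 * user-space caller would typically probe a capability like this (sketch;
 * "vm_fd" is an open VM file descriptor, and the same ioctl also works on
 * the /dev/kvm file descriptor; enable_smm_emulation() is a hypothetical
 * caller-side helper):
 *
 *	int r = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_SMM);
 *	if (r > 0)
 *		enable_smm_emulation();
 *
 * A zero return means the capability is absent; as the cases above show,
 * some capabilities report a bitmask or a count rather than a plain boolean.
 */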
3900 
3901 long kvm_arch_dev_ioctl(struct file *filp,
3902                         unsigned int ioctl, unsigned long arg)
3903 {
3904         void __user *argp = (void __user *)arg;
3905         long r;
3906 
3907         switch (ioctl) {
3908         case KVM_GET_MSR_INDEX_LIST: {
3909                 struct kvm_msr_list __user *user_msr_list = argp;
3910                 struct kvm_msr_list msr_list;
3911                 unsigned n;
3912 
3913                 r = -EFAULT;
3914                 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
3915                         goto out;
3916                 n = msr_list.nmsrs;
3917                 msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
3918                 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
3919                         goto out;
3920                 r = -E2BIG;
3921                 if (n < msr_list.nmsrs)
3922                         goto out;
3923                 r = -EFAULT;
3924                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
3925                                  num_msrs_to_save * sizeof(u32)))
3926                         goto out;
3927                 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
3928                                  &emulated_msrs,
3929                                  num_emulated_msrs * sizeof(u32)))
3930                         goto out;
3931                 r = 0;
3932                 break;
3933         }
3934         case KVM_GET_SUPPORTED_CPUID:
3935         case KVM_GET_EMULATED_CPUID: {
3936                 struct kvm_cpuid2 __user *cpuid_arg = argp;
3937                 struct kvm_cpuid2 cpuid;
3938 
3939                 r = -EFAULT;
3940                 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
3941                         goto out;
3942 
3943                 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
3944                                             ioctl);
3945                 if (r)
3946                         goto out;
3947 
3948                 r = -EFAULT;
3949                 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
3950                         goto out;
3951                 r = 0;
3952                 break;
3953         }
3954         case KVM_X86_GET_MCE_CAP_SUPPORTED:
3955                 r = -EFAULT;
3956                 if (copy_to_user(argp, &kvm_mce_cap_supported,
3957                                  sizeof(kvm_mce_cap_supported)))
3958                         goto out;
3959                 r = 0;
3960                 break;
3961         case KVM_GET_MSR_FEATURE_INDEX_LIST: {
3962                 struct kvm_msr_list __user *user_msr_list = argp;
3963                 struct kvm_msr_list msr_list;
3964                 unsigned int n;
3965 
3966                 r = -EFAULT;
3967                 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
3968                         goto out;
3969                 n = msr_list.nmsrs;
3970                 msr_list.nmsrs = num_msr_based_features;
3971                 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
3972                         goto out;
3973                 r = -E2BIG;
3974                 if (n < msr_list.nmsrs)
3975                         goto out;
3976                 r = -EFAULT;
3977                 if (copy_to_user(user_msr_list->indices, &msr_based_features,
3978                                  num_msr_based_features * sizeof(u32)))
3979                         goto out;
3980                 r = 0;
3981                 break;
3982         }
3983         case KVM_GET_MSRS:
3984                 r = msr_io(NULL, argp, do_get_msr_feature, 1);
3985                 break;
3986         case KVM_GET_SUPPORTED_HV_CPUID:
3987                 r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
3988                 break;
3989         default:
3990                 r = -EINVAL;
3991                 break;
3992         }
3993 out:
3994         return r;
3995 }
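
/*
 * The -E2BIG handling in KVM_GET_MSR_INDEX_LIST above supports the usual
 * two-call pattern from user space (sketch; "kvm_fd" is an open /dev/kvm
 * file descriptor, error handling omitted):
 *
 *	struct kvm_msr_list probe = { .nmsrs = 0 };
 *	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);	// fails with E2BIG
 *							// but fills in nmsrs
 *	struct kvm_msr_list *list = malloc(sizeof(*list) +
 *					   probe.nmsrs * sizeof(__u32));
 *	list->nmsrs = probe.nmsrs;
 *	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);	// now succeeds
 */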
3996 
3997 static void wbinvd_ipi(void *garbage)
3998 {
3999         wbinvd();
4000 }
4001 
4002 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
4003 {
4004         return kvm_arch_has_noncoherent_dma(vcpu->kvm);
4005 }
4006 
4007 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
4008 {
4009         /* Address the case where WBINVD may be executed by the guest */
4010         if (need_emulate_wbinvd(vcpu)) {
4011                 if (static_call(kvm_x86_has_wbinvd_exit)())
4012                         cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
4013                 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
4014                         smp_call_function_single(vcpu->cpu,
4015                                         wbinvd_ipi, NULL, 1);
4016         }
4017 
4018         static_call(kvm_x86_vcpu_load)(vcpu, cpu);
4019 
4020         /* Save host pkru register if supported */
4021         vcpu->arch.host_pkru = read_pkru();
4022 
4023         /* Apply any externally detected TSC adjustments (due to suspend) */
4024         if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
4025                 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
4026                 vcpu->arch.tsc_offset_adjustment = 0;
4027                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
4028         }
4029 
4030         if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
4031                 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
4032                                 rdtsc() - vcpu->arch.last_host_tsc;
4033                 if (tsc_delta < 0)
4034                         mark_tsc_unstable("KVM discovered backwards TSC");
4035 
4036                 if (kvm_check_tsc_unstable()) {
4037                         u64 offset = kvm_compute_tsc_offset(vcpu,
4038                                                 vcpu->arch.last_guest_tsc);
4039                         kvm_vcpu_write_tsc_offset(vcpu, offset);
4040                         vcpu->arch.tsc_catchup = 1;
4041                 }
4042 
4043                 if (kvm_lapic_hv_timer_in_use(vcpu))
4044                         kvm_lapic_restart_hv_timer(vcpu);
4045 
4046                 /*
4047                  * On a host with synchronized TSC, there is no need to update
4048                  * kvmclock on vcpu->cpu migration
4049                  */
4050                 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
4051                         kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
4052                 if (vcpu->cpu != cpu)
4053                         kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
4054                 vcpu->cpu = cpu;
4055         }
4056 
4057         kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
4058 }
4059 
4060 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
4061 {
4062         struct kvm_host_map map;
4063         struct kvm_steal_time *st;
4064 
4065         if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
4066                 return;
4067 
4068         if (vcpu->arch.st.preempted)
4069                 return;
4070 
4071         if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
4072                         &vcpu->arch.st.cache, true))
4073                 return;
4074 
4075         st = map.hva +
4076                 offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
4077 
4078         st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
4079 
4080         kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
4081 }
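
/*
 * The KVM_VCPU_PREEMPTED flag written above is what the guest's
 * vcpu_is_preempted() / paravirtual spinlock code keys off of: a guest that
 * sees the bit set in the steal-time area knows the owning vCPU is not
 * currently running and can avoid spinning on it.  record_steal_time()
 * clears the field again once the vCPU is scheduled back in.
 */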
4082 
4083 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
4084 {
4085         int idx;
4086 
4087         if (vcpu->preempted && !vcpu->arch.guest_state_protected)
4088                 vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
4089 
4090         /*
4091          * Take the srcu lock as memslots will be accessed to check the gfn
4092          * cache generation against the memslots generation.
4093          */
4094         idx = srcu_read_lock(&vcpu->kvm->srcu);
4095         if (kvm_xen_msr_enabled(vcpu->kvm))
4096                 kvm_xen_runstate_set_preempted(vcpu);
4097         else
4098                 kvm_steal_time_set_preempted(vcpu);
4099         srcu_read_unlock(&vcpu->kvm->srcu, idx);
4100 
4101         static_call(kvm_x86_vcpu_put)(vcpu);
4102         vcpu->arch.last_host_tsc = rdtsc();
4103         /*
4104          * If userspace has set any breakpoints or watchpoints, dr6 is restored
4105          * on every vmexit, but if not, we might have a stale dr6 from the
4106          * guest. do_debug expects dr6 to be cleared after it runs, do the same.
4107          * guest. do_debug expects dr6 to be cleared after it runs; do the same.
4108         set_debugreg(0, 6);
4109 }
4110 
4111 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
4112                                     struct kvm_lapic_state *s)
4113 {
4114         if (vcpu->arch.apicv_active)
4115                 static_call(kvm_x86_sync_pir_to_irr)(vcpu);
4116 
4117         return kvm_apic_get_state(vcpu, s);
4118 }
4119 
4120 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
4121                                     struct kvm_lapic_state *s)
4122 {
4123         int r;
4124 
4125         r = kvm_apic_set_state(vcpu, s);
4126         if (r)
4127                 return r;
4128         update_cr8_intercept(vcpu);
4129 
4130         return 0;
4131 }
4132 
4133 static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
4134 {
4135         /*
4136          * We can accept userspace's request for interrupt injection
4137          * as long as we have a place to store the interrupt number.
4138          * The actual injection will happen when the CPU is able to
4139          * deliver the interrupt.
4140          */
4141         if (kvm_cpu_has_extint(vcpu))
4142                 return false;
4143 
4144         /* Acknowledging ExtINT does not happen if LINT0 is masked.  */
4145         return (!lapic_in_kernel(vcpu) ||
4146                 kvm_apic_accept_pic_intr(vcpu));
4147 }
4148 
4149 static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
4150 {
4151         return kvm_arch_interrupt_allowed(vcpu) &&
4152                 kvm_cpu_accept_dm_intr(vcpu);
4153 }
4154 
4155 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
4156                                     struct kvm_interrupt *irq)
4157 {
4158         if (irq->irq >= KVM_NR_INTERRUPTS)
4159                 return -EINVAL;
4160 
4161         if (!irqchip_in_kernel(vcpu->kvm)) {
4162                 kvm_queue_interrupt(vcpu, irq->irq, false);
4163                 kvm_make_request(KVM_REQ_EVENT, vcpu);
4164                 return 0;
4165         }
4166 
4167         /*
4168          * With in-kernel LAPIC, we only use this to inject EXTINT, so
4169          * fail for in-kernel 8259.
4170          */
4171         if (pic_in_kernel(vcpu->kvm))
4172                 return -ENXIO;
4173 
4174         if (vcpu->arch.pending_external_vector != -1)
4175                 return -EEXIST;
4176 
4177         vcpu->arch.pending_external_vector = irq->irq;
4178         kvm_make_request(KVM_REQ_EVENT, vcpu);
4179         return 0;
4180 }
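
/*
 * This is the handler for the KVM_INTERRUPT vcpu ioctl.  With a fully
 * user-space interrupt controller it is how external interrupts are fed in
 * (sketch; "vcpu_fd" is an open vCPU file descriptor and "vector" the
 * interrupt number chosen by the user-space PIC model):
 *
 *	struct kvm_interrupt irq = { .irq = vector };
 *	ioctl(vcpu_fd, KVM_INTERRUPT, &irq);
 *
 * As the checks above show, the ioctl is rejected when the fully in-kernel
 * PIC is in use, and with a split irqchip it only queues an ExtINT vector.
 */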
4181 
4182 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
4183 {
4184         kvm_inject_nmi(vcpu);
4185 
4186         return 0;
4187 }
4188 
4189 static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
4190 {
4191         kvm_make_request(KVM_REQ_SMI, vcpu);
4192 
4193         return 0;
4194 }
4195 
4196 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
4197                                            struct kvm_tpr_access_ctl *tac)
4198 {
4199         if (tac->flags)
4200                 return -EINVAL;
4201         vcpu->arch.tpr_access_reporting = !!tac->enabled;
4202         return 0;
4203 }
4204 
4205 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
4206                                         u64 mcg_cap)
4207 {
4208         int r;
4209         unsigned bank_num = mcg_cap & 0xff, bank;
4210 
4211         r = -EINVAL;
4212         if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
4213                 goto out;
4214         if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
4215                 goto out;
4216         r = 0;
4217         vcpu->arch.mcg_cap = mcg_cap;
4218         /* Init IA32_MCG_CTL to all 1s */
4219         if (mcg_cap & MCG_CTL_P)
4220                 vcpu->arch.mcg_ctl = ~(u64)0;
4221         /* Init IA32_MCi_CTL to all 1s */
4222         for (bank = 0; bank < bank_num; bank++)
4223                 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
4224 
4225         static_call(kvm_x86_setup_mce)(vcpu);
4226 out:
4227         return r;
4228 }
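
/*
 * kvm_vcpu_ioctl_x86_setup_mce() is reached via the KVM_X86_SETUP_MCE vcpu
 * ioctl.  The low byte of mcg_cap is the bank count; everything else must be
 * a subset of kvm_mce_cap_supported plus the count/extension fields allowed
 * by the mask above.  Typical user-space usage (sketch; "kvm_fd" and
 * "vcpu_fd" are assumed to be open file descriptors):
 *
 *	__u64 mcg_cap;
 *	ioctl(kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED, &mcg_cap);
 *	mcg_cap = (mcg_cap & ~0xffULL) | 10;	// e.g. expose 10 banks
 *	ioctl(vcpu_fd, KVM_X86_SETUP_MCE, &mcg_cap);
 */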
4229 
4230 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
4231                                       struct kvm_x86_mce *mce)
4232 {
4233         u64 mcg_cap = vcpu->arch.mcg_cap;
4234         unsigned bank_num = mcg_cap & 0xff;
4235         u64 *banks = vcpu->arch.mce_banks;
4236 
4237         if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
4238                 return -EINVAL;
4239         /*
4240          * If IA32_MCG_CTL is not all 1s, uncorrected error
4241          * reporting is disabled.
4242          */
4243         if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
4244             vcpu->arch.mcg_ctl != ~(u64)0)
4245                 return 0;
4246         banks += 4 * mce->bank;
4247         /*
4248          * If IA32_MCi_CTL is not all 1s, uncorrected error
4249          * reporting is disabled for the bank.
4250          */
4251         if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
4252                 return 0;
4253         if (mce->status & MCI_STATUS_UC) {
4254                 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
4255                     !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
4256                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4257                         return 0;
4258                 }
4259                 if (banks[1] & MCI_STATUS_VAL)
4260                         mce->status |= MCI_STATUS_OVER;
4261                 banks[2] = mce->addr;
4262                 banks[3] = mce->misc;
4263                 vcpu->arch.mcg_status = mce->mcg_status;
4264                 banks[1] = mce->status;
4265                 kvm_queue_exception(vcpu, MC_VECTOR);
4266         } else if (!(banks[1] & MCI_STATUS_VAL)
4267                    || !(banks[1] & MCI_STATUS_UC)) {
4268                 if (banks[1] & MCI_STATUS_VAL)
4269                         mce->status |= MCI_STATUS_OVER;
4270                 banks[2] = mce->addr;
4271                 banks[3] = mce->misc;
4272                 banks[1] = mce->status;
4273         } else
4274                 banks[1] |= MCI_STATUS_OVER;
4275         return 0;
4276 }
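
/*
 * Userspace reaches the handler above through the KVM_X86_SET_MCE vcpu ioctl.
 * A sketch that injects an uncorrected, valid error into bank 0; vcpu_fd, the
 * bank and the address are illustrative, and the MCI_STATUS_* bits are
 * open-coded from their architectural positions since userspace does not see
 * the kernel's <asm/mce.h> definitions.
 */
#if 0   /* illustrative userspace sketch, not built with this file */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int inject_uc_mce(int vcpu_fd)
{
        struct kvm_x86_mce mce;

        memset(&mce, 0, sizeof(mce));
        mce.bank = 0;
        /* MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN | MCI_STATUS_ADDRV */
        mce.status = (1ULL << 63) | (1ULL << 61) | (1ULL << 60) | (1ULL << 58);
        mce.addr = 0x1234000;   /* reported physical address */
        mce.mcg_status = 0;     /* MCIP clear: no #MC already in progress */

        return ioctl(vcpu_fd, KVM_X86_SET_MCE, &mce);
}
#endif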
4277 
4278 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
4279                                                struct kvm_vcpu_events *events)
4280 {
4281         process_nmi(vcpu);
4282 
4283         if (kvm_check_request(KVM_REQ_SMI, vcpu))
4284                 process_smi(vcpu);
4285 
4286         /*
4287          * In guest mode, payload delivery should be deferred,
4288          * so that the L1 hypervisor can intercept #PF before
4289          * CR2 is modified (or intercept #DB before DR6 is
4290          * modified under nVMX). Unless the per-VM capability,
4291          * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
4292          * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we
4293          * opportunistically defer the exception payload, deliver it if the
4294          * capability hasn't been requested before processing a
4295          * KVM_GET_VCPU_EVENTS.
4296          */
4297         if (!vcpu->kvm->arch.exception_payload_enabled &&
4298             vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
4299                 kvm_deliver_exception_payload(vcpu);
4300 
4301         /*
4302          * The API doesn't provide the instruction length for software
4303          * exceptions, so don't report them. As long as the guest RIP
4304          * isn't advanced, we should expect to encounter the exception
4305          * again.
4306          */
4307         if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
4308                 events->exception.injected = 0;
4309                 events->exception.pending = 0;
4310         } else {
4311                 events->exception.injected = vcpu->arch.exception.injected;
4312                 events->exception.pending = vcpu->arch.exception.pending;
4313                 /*
4314                  * For ABI compatibility, deliberately conflate
4315                  * pending and injected exceptions when
4316                  * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
4317                  */
4318                 if (!vcpu->kvm->arch.exception_payload_enabled)
4319                         events->exception.injected |=
4320                                 vcpu->arch.exception.pending;
4321         }
4322         events->exception.nr = vcpu->arch.exception.nr;
4323         events->exception.has_error_code = vcpu->arch.exception.has_error_code;
4324         events->exception.error_code = vcpu->arch.exception.error_code;
4325         events->exception_has_payload = vcpu->arch.exception.has_payload;
4326         events->exception_payload = vcpu->arch.exception.payload;
4327 
4328         events->interrupt.injected =
4329                 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
4330         events->interrupt.nr = vcpu->arch.interrupt.nr;
4331         events->interrupt.soft = 0;
4332         events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
4333 
4334         events->nmi.injected = vcpu->arch.nmi_injected;
4335         events->nmi.pending = vcpu->arch.nmi_pending != 0;
4336         events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
4337         events->nmi.pad = 0;
4338 
4339         events->sipi_vector = 0; /* never valid when reporting to user space */
4340 
4341         events->smi.smm = is_smm(vcpu);
4342         events->smi.pending = vcpu->arch.smi_pending;
4343         events->smi.smm_inside_nmi =
4344                 !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
4345         events->smi.latched_init = kvm_lapic_latched_init(vcpu);
4346 
4347         events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
4348                          | KVM_VCPUEVENT_VALID_SHADOW
4349                          | KVM_VCPUEVENT_VALID_SMM);
4350         if (vcpu->kvm->arch.exception_payload_enabled)
4351                 events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
4352 
4353         memset(&events->reserved, 0, sizeof(events->reserved));
4354 }
4355 
4356 static void kvm_smm_changed(struct kvm_vcpu *vcpu);
4357 
4358 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
4359                                               struct kvm_vcpu_events *events)
4360 {
4361         if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
4362                               | KVM_VCPUEVENT_VALID_SIPI_VECTOR
4363                               | KVM_VCPUEVENT_VALID_SHADOW
4364                               | KVM_VCPUEVENT_VALID_SMM
4365                               | KVM_VCPUEVENT_VALID_PAYLOAD))
4366                 return -EINVAL;
4367 
4368         if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
4369                 if (!vcpu->kvm->arch.exception_payload_enabled)
4370                         return -EINVAL;
4371                 if (events->exception.pending)
4372                         events->exception.injected = 0;
4373                 else
4374                         events->exception_has_payload = 0;
4375         } else {
4376                 events->exception.pending = 0;
4377                 events->exception_has_payload = 0;
4378         }
4379 
4380         if ((events->exception.injected || events->exception.pending) &&
4381             (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
4382                 return -EINVAL;
4383 
4384         /* INITs are latched while in SMM */
4385         if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
4386             (events->smi.smm || events->smi.pending) &&
4387             vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
4388                 return -EINVAL;
4389 
4390         process_nmi(vcpu);
4391         vcpu->arch.exception.injected = events->exception.injected;
4392         vcpu->arch.exception.pending = events->exception.pending;
4393         vcpu->arch.exception.nr = events->exception.nr;
4394         vcpu->arch.exception.has_error_code = events->exception.has_error_code;
4395         vcpu->arch.exception.error_code = events->exception.error_code;
4396         vcpu->arch.exception.has_payload = events->exception_has_payload;
4397         vcpu->arch.exception.payload = events->exception_payload;
4398 
4399         vcpu->arch.interrupt.injected = events->interrupt.injected;
4400         vcpu->arch.interrupt.nr = events->interrupt.nr;
4401         vcpu->arch.interrupt.soft = events->interrupt.soft;
4402         if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
4403                 static_call(kvm_x86_set_interrupt_shadow)(vcpu,
4404                                                 events->interrupt.shadow);
4405 
4406         vcpu->arch.nmi_injected = events->nmi.injected;
4407         if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
4408                 vcpu->arch.nmi_pending = events->nmi.pending;
4409         static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
4410 
4411         if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
4412             lapic_in_kernel(vcpu))
4413                 vcpu->arch.apic->sipi_vector = events->sipi_vector;
4414 
4415         if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
4416                 if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
4417                         if (events->smi.smm)
4418                                 vcpu->arch.hflags |= HF_SMM_MASK;
4419                         else
4420                                 vcpu->arch.hflags &= ~HF_SMM_MASK;
4421                         kvm_smm_changed(vcpu);
4422                 }
4423 
4424                 vcpu->arch.smi_pending = events->smi.pending;
4425 
4426                 if (events->smi.smm) {
4427                         if (events->smi.smm_inside_nmi)
4428                                 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
4429                         else
4430                                 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
4431                 }
4432 
4433                 if (lapic_in_kernel(vcpu)) {
4434                         if (events->smi.latched_init)
4435                                 set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
4436                         else
4437                                 clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
4438                 }
4439         }
4440 
4441         kvm_make_request(KVM_REQ_EVENT, vcpu);
4442 
4443         return 0;
4444 }
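
/*
 * The get/set pair above serves KVM_GET_VCPU_EVENTS and KVM_SET_VCPU_EVENTS,
 * which are normally used together when saving and restoring a vcpu.  A
 * sketch of that round trip; the fds are placeholders, and a real VMM would
 * also carry KVM_VCPUEVENT_VALID_PAYLOAD once KVM_CAP_EXCEPTION_PAYLOAD is
 * enabled on the destination VM.
 */
#if 0   /* illustrative userspace sketch, not built with this file */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int copy_vcpu_events(int src_vcpu_fd, int dst_vcpu_fd)
{
        struct kvm_vcpu_events events;

        if (ioctl(src_vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0)
                return -1;

        /* Keep only flag bits the setter above accepts unconditionally. */
        events.flags &= KVM_VCPUEVENT_VALID_NMI_PENDING |
                        KVM_VCPUEVENT_VALID_SHADOW |
                        KVM_VCPUEVENT_VALID_SMM;

        return ioctl(dst_vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
}
#endif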
4445 
4446 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
4447                                              struct kvm_debugregs *dbgregs)
4448 {
4449         unsigned long val;
4450 
4451         memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
4452         kvm_get_dr(vcpu, 6, &val);
4453         dbgregs->dr6 = val;
4454         dbgregs->dr7 = vcpu->arch.dr7;
4455         dbgregs->flags = 0;
4456         memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
4457 }
4458 
4459 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
4460                                             struct kvm_debugregs *dbgregs)
4461 {
4462         if (dbgregs->flags)
4463                 return -EINVAL;
4464 
4465         if (!kvm_dr6_valid(dbgregs->dr6))
4466                 return -EINVAL;
4467         if (!kvm_dr7_valid(dbgregs->dr7))
4468                 return -EINVAL;
4469 
4470         memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
4471         kvm_update_dr0123(vcpu);
4472         vcpu->arch.dr6 = dbgregs->dr6;
4473         vcpu->arch.dr7 = dbgregs->dr7;
4474         kvm_update_dr7(vcpu);
4475 
4476         return 0;
4477 }
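
/*
 * The two helpers above sit behind KVM_GET_DEBUGREGS and KVM_SET_DEBUGREGS.
 * A sketch that reads the state and disarms all hardware breakpoints by
 * writing DR7's architectural reset value; vcpu_fd is a placeholder.
 */
#if 0   /* illustrative userspace sketch, not built with this file */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int disarm_hw_breakpoints(int vcpu_fd)
{
        struct kvm_debugregs dbg;

        if (ioctl(vcpu_fd, KVM_GET_DEBUGREGS, &dbg) < 0)
                return -1;

        dbg.dr7 = 0x400;        /* reset value: all breakpoints disabled */
        dbg.flags = 0;          /* the setter above rejects non-zero flags */

        return ioctl(vcpu_fd, KVM_SET_DEBUGREGS, &dbg);
}
#endif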
4478 
4479 #define XSTATE_COMPACTION_ENABLED (1ULL << 63)
4480 
4481 static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
4482 {
4483         struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
4484         u64 xstate_bv = xsave->header.xfeatures;
4485         u64 valid;
4486 
4487         /*
4488          * Copy legacy XSAVE area, to avoid complications with CPUID
4489          * leaves 0 and 1 in the loop below.
4490          */
4491         memcpy(dest, xsave, XSAVE_HDR_OFFSET);
4492 
4493         /* Set XSTATE_BV */
4494         xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
4495         *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
4496 
4497         /*
4498          * Copy each region from the possibly compacted offset to the
4499          * non-compacted offset.
4500          */
4501         valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
4502         while (valid) {
4503                 u64 xfeature_mask = valid & -valid;
4504                 int xfeature_nr = fls64(xfeature_mask) - 1;
4505                 void *src = get_xsave_addr(xsave, xfeature_nr);
4506 
4507                 if (src) {
4508                         u32 size, offset, ecx, edx;
4509                         cpuid_count(XSTATE_CPUID, xfeature_nr,
4510                                     &size, &offset, &ecx, &edx);
4511                         if (xfeature_nr == XFEATURE_PKRU)
4512                                 memcpy(dest + offset, &vcpu->arch.pkru,
4513                                        sizeof(vcpu->arch.pkru));
4514                         else
4515                                 memcpy(dest + offset, src, size);
4516 
4517                 }
4518 
4519                 valid -= xfeature_mask;
4520         }
4521 }
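
/*
 * The loop above (and its mirror in load_xsave() below) visits one xfeature
 * at a time: "valid & -valid" isolates the lowest set bit and fls64() - 1
 * turns that bit back into a feature number.  A standalone sketch of the same
 * idiom with a made-up mask; fls64() is open-coded via __builtin_clzll()
 * because the kernel helper is not available to userspace.
 */
#if 0   /* illustrative userspace sketch, not built with this file */
#include <stdio.h>
#include <stdint.h>

static int fls64_demo(uint64_t x)
{
        return x ? 64 - __builtin_clzll(x) : 0;
}

int main(void)
{
        /* e.g. bit 2 = YMM, bit 5 = opmask, bit 9 = PKRU */
        uint64_t valid = (1ULL << 2) | (1ULL << 5) | (1ULL << 9);

        while (valid) {
                uint64_t xfeature_mask = valid & -valid;        /* lowest set bit */
                int xfeature_nr = fls64_demo(xfeature_mask) - 1;

                printf("xfeature %d\n", xfeature_nr);           /* prints 2, 5, 9 */
                valid -= xfeature_mask;                         /* clear it, continue */
        }
        return 0;
}
#endif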
4522 
4523 static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
4524 {
4525         struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
4526         u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
4527         u64 valid;
4528 
4529         /*
4530          * Copy legacy XSAVE area, to avoid complications with CPUID
4531          * leaves 0 and 1 in the loop below.
4532          */
4533         memcpy(xsave, src, XSAVE_HDR_OFFSET);
4534 
4535         /* Set XSTATE_BV and possibly XCOMP_BV.  */
4536         xsave->header.xfeatures = xstate_bv;
4537         if (boot_cpu_has(X86_FEATURE_XSAVES))
4538                 xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
4539 
4540         /*
4541          * Copy each region from the non-compacted offset to the
4542          * possibly compacted offset.
4543          */
4544         valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
4545         while (valid) {
4546                 u64 xfeature_mask = valid & -valid;
4547                 int xfeature_nr = fls64(xfeature_mask) - 1;
4548                 void *dest = get_xsave_addr(xsave, xfeature_nr);
4549 
4550                 if (dest) {
4551                         u32 size, offset, ecx, edx;
4552                         cpuid_count(XSTATE_CPUID, xfeature_nr,
4553                                     &size, &offset, &ecx, &edx);
4554                         if (xfeature_nr == XFEATURE_PKRU)
4555                                 memcpy(&vcpu->arch.pkru, src + offset,
4556                                        sizeof(vcpu->arch.pkru));
4557                         else
4558                                 memcpy(dest, src + offset, size);
4559                 }
4560 
4561                 valid -= xfeature_mask;
4562         }
4563 }
4564 
4565 static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
4566                                          struct kvm_xsave *guest_xsave)
4567 {
4568         if (!vcpu->arch.guest_fpu)
4569                 return;
4570 
4571         if (boot_cpu_has(X86_FEATURE_XSAVE)) {
4572                 memset(guest_xsave, 0, sizeof(struct kvm_xsave));
4573                 fill_xsave((u8 *) guest_xsave->region, vcpu);
4574         } else {
4575                 memcpy(guest_xsave->region,
4576                         &vcpu->arch.guest_fpu->state.fxsave,
4577                         sizeof(struct fxregs_state));
4578                 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
4579                         XFEATURE_MASK_FPSSE;
4580         }
4581 }
4582 
4583 #define XSAVE_MXCSR_OFFSET 24
4584 
4585 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
4586                                         struct kvm_xsave *guest_xsave)
4587 {
4588         u64 xstate_bv;
4589         u32 mxcsr;
4590 
4591         if (!vcpu->arch.guest_fpu)
4592                 return 0;
4593 
4594         xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
4595         mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
4596 
4597         if (boot_cpu_has(X86_FEATURE_XSAVE)) {
4598                 /*
4599                  * Here we allow setting states that are not present in
4600                  * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
4601                  * with old userspace.
4602                  */
4603                 if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
4604                         return -EINVAL;
4605                 load_xsave(vcpu, (u8 *)guest_xsave->region);
4606         } else {
4607                 if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
4608                         mxcsr & ~mxcsr_feature_mask)
4609                         return -EINVAL;
4610                 memcpy(&vcpu->arch.guest_fpu->state.fxsave,
4611                         guest_xsave->region, sizeof(struct fxregs_state));
4612         }
4613         return 0;
4614 }
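
/*
 * Together, the get/set pair above implements KVM_GET_XSAVE and
 * KVM_SET_XSAVE.  A sketch of copying the FPU/XSAVE state from one vcpu to
 * another, as a save/restore path might; both fds are placeholders.
 */
#if 0   /* illustrative userspace sketch, not built with this file */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int copy_xsave(int src_vcpu_fd, int dst_vcpu_fd)
{
        struct kvm_xsave xsave; /* 4 KiB, non-compacted layout */

        if (ioctl(src_vcpu_fd, KVM_GET_XSAVE, &xsave) < 0)
                return -1;

        /* Fails with -EINVAL if xstate_bv names features KVM doesn't support. */
        return ioctl(dst_vcpu_fd, KVM_SET_XSAVE, &xsave);
}
#endif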
4615 
4616 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
4617                                         struct kvm_xcrs *guest_xcrs)
4618 {
4619         if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
4620                 guest_xcrs->nr_xcrs = 0;
4621                 return;
4622         }
4623 
4624         guest_xcrs->nr_xcrs = 1;
4625         guest_xcrs->flags = 0;
4626         guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
4627         guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
4628 }
4629 
4630 static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
4631                                        struct kvm_xcrs *guest_xcrs)
4632 {
4633         int i, r = 0;
4634 
4635         if (!boot_cpu_has(X86_FEATURE_XSAVE))
4636                 return -EINVAL;
4637 
4638         if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
4639                 return -EINVAL;
4640 
4641         for (i = 0; i < guest_xcrs->nr_xcrs; i++)
4642                 /* Only support XCR0 currently */
4643                 if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
4644                         r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
4645                                 guest_xcrs->xcrs[i].value);
4646                         break;
4647                 }
4648         if (r)
4649                 r = -EINVAL;
4650         return r;
4651 }
4652 
4653 /*
4654  * kvm_set_guest_paused() indicates to the guest kernel that it has been
4655  * stopped by the hypervisor.  This function will be called from the host only.
4656  * EINVAL is returned when the host attempts to set the flag for a guest that
4657  * does not support pv clocks.
4658  */
4659 static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
4660 {
4661         if (!vcpu->arch.pv_time_enabled)
4662                 return -EINVAL;
4663         vcpu->arch.pvclock_set_guest_stopped_request = true;
4664         kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
4665         return 0;
4666 }
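
/*
 * kvm_set_guest_paused() is reached through the KVM_KVMCLOCK_CTRL vcpu ioctl
 * (see the dispatcher below).  A VMM calls it after keeping the vcpu stopped
 * for a long time, e.g. across a debugger pause, so the guest's soft-lockup
 * watchdog knows the gap was host-induced.  Sketch; vcpu_fd is a placeholder.
 */
#if 0   /* illustrative userspace sketch, not built with this file */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int mark_guest_paused(int vcpu_fd)
{
        /* No payload; returns -EINVAL if the guest has no pvclock area. */
        return ioctl(vcpu_fd, KVM_KVMCLOCK_CTRL, 0);
}
#endif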
4667 
4668 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
4669                                      struct kvm_enable_cap *cap)
4670 {
4671         int r;
4672         uint16_t vmcs_version;
4673         void __user *user_ptr;
4674 
4675         if (cap->flags)
4676                 return -EINVAL;
4677 
4678         switch (cap->cap) {
4679         case KVM_CAP_HYPERV_SYNIC2:
4680                 if (cap->args[0])
4681                         return -EINVAL;
4682                 fallthrough;
4683 
4684         case KVM_CAP_HYPERV_SYNIC:
4685                 if (!irqchip_in_kernel(vcpu->kvm))
4686                         return -EINVAL;
4687                 return kvm_hv_activate_synic(vcpu, cap->cap ==
4688                                              KVM_CAP_HYPERV_SYNIC2);
4689         case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
4690                 if (!kvm_x86_ops.nested_ops->enable_evmcs)
4691                         return -ENOTTY;
4692                 r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
4693                 if (!r) {
4694                         user_ptr = (void __user *)(uintptr_t)cap->args[0];
4695                         if (copy_to_user(user_ptr, &vmcs_version,
4696                                          sizeof(vmcs_version)))
4697                                 r = -EFAULT;
4698                 }
4699                 return r;
4700         case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
4701                 if (!kvm_x86_ops.enable_direct_tlbflush)
4702                         return -ENOTTY;
4703 
4704                 return static_call(kvm_x86_enable_direct_tlbflush)(vcpu);
4705 
4706         case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
4707                 vcpu->arch.pv_cpuid.enforce = cap->args[0];
4708                 if (vcpu->arch.pv_cpuid.enforce)
4709                         kvm_update_pv_runtime(vcpu);
4710 
4711                 return 0;
4712 
4713         default:
4714                 return -EINVAL;
4715         }
4716 }
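
/*
 * Every per-vcpu capability above is toggled through KVM_ENABLE_CAP on the
 * vcpu fd.  A sketch for the simplest case handled above,
 * KVM_CAP_ENFORCE_PV_FEATURE_CPUID; vcpu_fd is a placeholder.
 */
#if 0   /* illustrative userspace sketch, not built with this file */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enforce_pv_cpuid(int vcpu_fd)
{
        struct kvm_enable_cap cap;

        memset(&cap, 0, sizeof(cap));
        cap.cap = KVM_CAP_ENFORCE_PV_FEATURE_CPUID;
        cap.args[0] = 1;        /* non-zero enables enforcement */

        /* cap.flags must stay zero; the handler above rejects anything else. */
        return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
}
#endif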
4717 
4718 long kvm_arch_vcpu_ioctl(struct file *filp,
4719                          unsigned int ioctl, unsigned long arg)
4720 {
4721         struct kvm_vcpu *vcpu = filp->private_data;
4722         void __user *argp = (void __user *)arg;
4723         int r;
4724         union {
4725                 struct kvm_lapic_state *lapic;
4726                 struct kvm_xsave *xsave;
4727                 struct kvm_xcrs *xcrs;
4728                 void *buffer;
4729         } u;
4730 
4731         vcpu_load(vcpu);
4732 
4733         u.buffer = NULL;
4734         switch (ioctl) {
4735         case KVM_GET_LAPIC: {
4736                 r = -EINVAL;
4737                 if (!lapic_in_kernel(vcpu))
4738                         goto out;
4739                 u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
4740                                 GFP_KERNEL_ACCOUNT);
4741 
4742                 r = -ENOMEM;
4743                 if (!u.lapic)
4744                         goto out;
4745                 r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
4746                 if (r)
4747                         goto out;
4748                 r = -EFAULT;
4749                 if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
4750                         goto out;
4751                 r = 0;
4752                 break;
4753         }
4754         case KVM_SET_LAPIC: {
4755                 r = -EINVAL;
4756                 if (!lapic_in_kernel(vcpu))
4757                         goto out;
4758                 u.lapic = memdup_user(argp, sizeof(*u.lapic));
4759                 if (IS_ERR(u.lapic)) {
4760                         r = PTR_ERR(u.lapic);
4761                         goto out_nofree;
4762                 }
4763 
4764                 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
4765                 break;
4766         }
4767         case KVM_INTERRUPT: {
4768                 struct kvm_interrupt irq;
4769 
4770                 r = -EFAULT;
4771                 if (copy_from_user(&irq, argp, sizeof(irq)))
4772                         goto out;
4773                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
4774                 break;
4775         }
4776         case KVM_NMI: {
4777                 r = kvm_vcpu_ioctl_nmi(vcpu);
4778                 break;
4779         }
4780         case KVM_SMI: {
4781                 r = kvm_vcpu_ioctl_smi(vcpu);
4782                 break;
4783         }
4784         case KVM_SET_CPUID: {
4785                 struct kvm_cpuid __user *cpuid_arg = argp;
4786                 struct kvm_cpuid cpuid;
4787 
4788                 r = -EFAULT;
4789                 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4790                         goto out;
4791                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
4792                 break;
4793         }
4794         case KVM_SET_CPUID2: {
4795                 struct kvm_cpuid2 __user *cpuid_arg = argp;
4796                 struct kvm_cpuid2 cpuid;
4797 
4798                 r = -EFAULT;
4799                 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4800                         goto out;
4801                 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
4802                                               cpuid_arg->entries);
4803                 break;
4804         }
4805         case KVM_GET_CPUID2: {
4806                 struct kvm_cpuid2 __user *cpuid_arg = argp;
4807                 struct kvm_cpuid2 cpuid;
4808 
4809                 r = -EFAULT;
4810                 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4811                         goto out;
4812                 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
4813                                               cpuid_arg->entries);
4814                 if (r)
4815                         goto out;
4816                 r = -EFAULT;
4817                 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
4818                         goto out;
4819                 r = 0;
4820                 break;
4821         }
4822         case KVM_GET_MSRS: {
4823                 int idx = srcu_read_lock(&vcpu->kvm->srcu);
4824                 r = msr_io(vcpu, argp, do_get_msr, 1);
4825                 srcu_read_unlock(&vcpu->kvm->srcu, idx);
4826                 break;
4827         }
4828         case KVM_SET_MSRS: {
4829                 int idx = srcu_read_lock(&vcpu->kvm->srcu);
4830                 r = msr_io(vcpu, argp, do_set_msr, 0);
4831                 srcu_read_unlock(&vcpu->kvm->srcu, idx);
4832                 break;
4833         }
4834         case KVM_TPR_ACCESS_REPORTING: {
4835                 struct kvm_tpr_access_ctl tac;
4836 
4837                 r = -EFAULT;
4838                 if (copy_from_user(&tac, argp, sizeof(tac)))
4839                         goto out;
4840                 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
4841                 if (r)
4842                         goto out;
4843                 r = -EFAULT;
4844                 if (copy_to_user(argp, &tac, sizeof(tac)))
4845                         goto out;
4846                 r = 0;
4847                 break;
4848         }
4849         case KVM_SET_VAPIC_ADDR: {
4850                 struct kvm_vapic_addr va;
4851                 int idx;
4852 
4853                 r = -EINVAL;
4854                 if (!lapic_in_kernel(vcpu))
4855                         goto out;
4856                 r = -EFAULT;
4857                 if (copy_from_user(&va, argp, sizeof(va)))
4858                         goto out;
4859                 idx = srcu_read_lock(&vcpu->kvm->srcu);
4860                 r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
4861                 srcu_read_unlock(&vcpu->kvm->srcu, idx);
4862                 break;
4863         }
4864         case KVM_X86_SETUP_MCE: {
4865                 u64 mcg_cap;
4866 
4867                 r = -EFAULT;
4868                 if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
4869                         goto out;
4870                 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
4871                 break;
4872         }
4873         case KVM_X86_SET_MCE: {
4874                 struct kvm_x86_mce mce;
4875 
4876                 r = -EFAULT;
4877                 if (copy_from_user(&mce, argp, sizeof(mce)))
4878                         goto out;
4879                 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
4880                 break;
4881         }
4882         case KVM_GET_VCPU_EVENTS: {
4883                 struct kvm_vcpu_events events;
4884 
4885                 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
4886 
4887                 r = -EFAULT;
4888                 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
4889                         break;
4890                 r = 0;
4891                 break;
4892         }
4893         case KVM_SET_VCPU_EVENTS: {
4894                 struct kvm_vcpu_events events;
4895 
4896                 r = -EFAULT;
4897                 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
4898                         break;
4899 
4900                 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
4901                 break;
4902         }
4903         case KVM_GET_DEBUGREGS: {
4904                 struct kvm_debugregs dbgregs;
4905 
4906                 kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
4907 
4908                 r = -EFAULT;
4909                 if (copy_to_user(argp, &dbgregs,
4910                                  sizeof(struct kvm_debugregs)))
4911                         break;
4912                 r = 0;
4913                 break;
4914         }
4915         case KVM_SET_DEBUGREGS: {
4916                 struct kvm_debugregs dbgregs;
4917 
4918                 r = -EFAULT;
4919                 if (copy_from_user(&dbgregs, argp,
4920                                    sizeof(struct kvm_debugregs)))
4921                         break;
4922 
4923                 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
4924                 break;
4925         }
4926         case KVM_GET_XSAVE: {
4927                 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
4928                 r = -ENOMEM;
4929                 if (!u.xsave)
4930                         break;
4931 
4932                 kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
4933 
4934                 r = -EFAULT;
4935                 if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
4936                         break;
4937                 r = 0;
4938                 break;
4939         }
4940         case KVM_SET_XSAVE: {
4941                 u.xsave = memdup_user(argp, sizeof(*u.xsave));
4942                 if (IS_ERR(u.xsave)) {
4943                         r = PTR_ERR(u.xsave);
4944                         goto out_nofree;
4945                 }
4946 
4947                 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
4948                 break;
4949         }
4950         case KVM_GET_XCRS: {
4951                 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
4952                 r = -ENOMEM;
4953                 if (!u.xcrs)
4954                         break;
4955 
4956                 kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
4957 
4958                 r = -EFAULT;
4959                 if (copy_to_user(argp, u.xcrs,
4960                                  sizeof(struct kvm_xcrs)))
4961                         break;
4962                 r = 0;
4963                 break;
4964         }
4965         case KVM_SET_XCRS: {
4966                 u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
4967                 if (IS_ERR(u.xcrs)) {
4968                         r = PTR_ERR(u.xcrs);
4969                         goto out_nofree;
4970                 }
4971 
4972                 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
4973                 break;
4974         }
4975         case KVM_SET_TSC_KHZ: {
4976                 u32 user_tsc_khz;
4977 
4978                 r = -EINVAL;
4979                 user_tsc_khz = (u32)arg;
4980 
4981                 if (kvm_has_tsc_control &&
4982                     user_tsc_khz >= kvm_max_guest_tsc_khz)
4983                         goto out;
4984 
4985                 if (user_tsc_khz == 0)
4986                         user_tsc_khz = tsc_khz;
4987 
4988                 if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
4989                         r = 0;
4990 
4991                 goto out;
4992         }
4993         case KVM_GET_TSC_KHZ: {
4994                 r = vcpu->arch.virtual_tsc_khz;
4995                 goto out;
4996         }
4997         case KVM_KVMCLOCK_CTRL: {
4998                 r = kvm_set_guest_paused(vcpu);
4999                 goto out;
5000         }
5001         case KVM_ENABLE_CAP: {
5002                 struct kvm_enable_cap cap;
5003 
5004                 r = -EFAULT;
5005                 if (copy_from_user(&cap, argp, sizeof(cap)))
5006                         goto out;
5007                 r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
5008                 break;
5009         }
5010         case KVM_GET_NESTED_STATE: {
5011                 struct kvm_nested_state __user *user_kvm_nested_state = argp;
5012                 u32 user_data_size;
5013 
5014                 r = -EINVAL;
5015                 if (!kvm_x86_ops.nested_ops->get_state)
5016                         break;
5017 
5018                 BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
5019                 r = -EFAULT;
5020                 if (get_user(user_data_size, &user_kvm_nested_state->size))
5021                         break;
5022 
5023                 r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
5024                                                      user_data_size);
5025                 if (r < 0)
5026                         break;
5027 
5028                 if (r > user_data_size) {
5029                         if (put_user(r, &user_kvm_nested_state->size))
5030                                 r = -EFAULT;
5031                         else
5032                                 r = -E2BIG;
5033                         break;
5034                 }
5035 
5036                 r = 0;
5037                 break;
5038         }
5039         case KVM_SET_NESTED_STATE: {
5040                 struct kvm_nested_state __user *user_kvm_nested_state = argp;
5041                 struct kvm_nested_state kvm_state;
5042                 int idx;
5043 
5044                 r = -EINVAL;
5045                 if (!kvm_x86_ops.nested_ops->set_state)
5046                         break;
5047 
5048                 r = -EFAULT;
5049                 if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
5050                         break;
5051 
5052                 r = -EINVAL;
5053                 if (kvm_state.size < sizeof(kvm_state))
5054                         break;
5055 
5056                 if (kvm_state.flags &
5057                     ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
5058                       | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING
5059                       | KVM_STATE_NESTED_GIF_SET))
5060                         break;
5061 
5062                 /* nested_run_pending implies guest_mode.  */
5063                 if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
5064                     && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
5065                         break;
5066 
5067                 idx = srcu_read_lock(&vcpu->kvm->srcu);
5068                 r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
5069                 srcu_read_unlock(&vcpu->kvm->srcu, idx);
5070                 break;
5071         }
5072         case KVM_GET_SUPPORTED_HV_CPUID:
5073                 r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
5074                 break;
5075 #ifdef CONFIG_KVM_XEN
5076         case KVM_XEN_VCPU_GET_ATTR: {
5077                 struct kvm_xen_vcpu_attr xva;
5078 
5079                 r = -EFAULT;
5080                 if (copy_from_user(&xva, argp, sizeof(xva)))
5081                         goto out;
5082                 r = kvm_xen_vcpu_get_attr(vcpu, &xva);
5083                 if (!r && copy_to_user(argp, &xva, sizeof(xva)))
5084                         r = -EFAULT;
5085                 break;
5086         }
5087         case KVM_XEN_VCPU_SET_ATTR: {
5088                 struct kvm_xen_vcpu_attr xva;
5089 
5090                 r = -EFAULT;
5091                 if (copy_from_user(&xva, argp, sizeof(xva)))
5092                         goto out;
5093                 r = kvm_xen_vcpu_set_attr(vcpu, &xva);
5094                 break;
5095         }
5096 #endif
5097         default:
5098                 r = -EINVAL;
5099         }
5100 out:
5101         kfree(u.buffer);
5102 out_nofree:
5103         vcpu_put(vcpu);
5104         return r;
5105 }
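
/*
 * One quirk of the dispatcher above: KVM_SET_TSC_KHZ and KVM_GET_TSC_KHZ
 * carry the frequency in the ioctl argument and return value themselves,
 * not through a user pointer.  Sketch; vcpu_fd is a placeholder.
 */
#if 0   /* illustrative userspace sketch, not built with this file */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_and_read_tsc_khz(int vcpu_fd, unsigned int khz)
{
        /* The value goes directly in the argument slot. */
        if (ioctl(vcpu_fd, KVM_SET_TSC_KHZ, (unsigned long)khz) < 0)
                return -1;

        /* The effective frequency comes back as the ioctl return value. */
        return ioctl(vcpu_fd, KVM_GET_TSC_KHZ, 0);
}
#endif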
5106 
5107 vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
5108 {
5109         return VM_FAULT_SIGBUS;
5110 }
5111 
5112 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
5113 {
5114         int ret;
5115 
5116         if (addr > (unsigned int)(-3 * PAGE_SIZE))
5117                 return -EINVAL;
5118         ret = static_call(kvm_x86_set_tss_addr)(kvm, addr);
5119         return ret;
5120 }
5121 
5122 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
5123                                               u64 ident_addr)
5124 {
5125         return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr);
5126 }
5127 
5128 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
5129                                          unsigned long kvm_nr_mmu_pages)
5130 {
5131         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
5132                 return -EINVAL;
5133 
5134         mutex_lock(&kvm->slots_lock);
5135 
5136         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
5137         kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
5138 
5139         mutex_unlock(&kvm->slots_lock);
5140         return 0;
5141 }
5142 
5143 static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
5144 {
5145         return kvm->arch.n_max_mmu_pages;
5146 }
5147 
5148 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
5149 {
5150         struct kvm_pic *pic = kvm->arch.vpic;
5151         int r;
5152 
5153         r = 0;
5154         switch (chip->chip_id) {
5155         case KVM_IRQCHIP_PIC_MASTER:
5156                 memcpy(&chip->chip.pic, &pic->pics[0],
5157                         sizeof(struct kvm_pic_state));
5158                 break;
5159         case KVM_IRQCHIP_PIC_SLAVE:
5160                 memcpy(&chip->chip.pic, &pic->pics[1],
5161                         sizeof(struct kvm_pic_state));
5162                 break;
5163         case KVM_IRQCHIP_IOAPIC:
5164                 kvm_get_ioapic(kvm, &chip->chip.ioapic);
5165                 break;
5166         default:
5167                 r = -EINVAL;
5168                 break;
5169         }
5170         return r;
5171 }
5172 
5173 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
5174 {
5175         struct kvm_pic *pic = kvm->arch.vpic;
5176         int r;
5177 
5178         r = 0;
5179         switch (chip->chip_id) {
5180         case KVM_IRQCHIP_PIC_MASTER:
5181                 spin_lock(&pic->lock);
5182                 memcpy(&pic->pics[0], &chip->chip.pic,
5183                         sizeof(struct kvm_pic_state));
5184                 spin_unlock(&pic->lock);
5185                 break;
5186         case KVM_IRQCHIP_PIC_SLAVE:
5187                 spin_lock(&pic->lock);
5188                 memcpy(&pic->pics[1], &chip->chip.pic,
5189                         sizeof(struct kvm_pic_state));
5190                 spin_unlock(&pic->lock);
5191                 break;
5192         case KVM_IRQCHIP_IOAPIC:
5193                 kvm_set_ioapic(kvm, &chip->chip.ioapic);
5194                 break;
5195         default:
5196                 r = -EINVAL;
5197                 break;
5198         }
5199         kvm_pic_update_irq(pic);
5200         return r;
5201 }
5202 
5203 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
5204 {
5205         struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
5206 
5207         BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
5208 
5209         mutex_lock(&kps->lock);
5210         memcpy(ps, &kps->channels, sizeof(*ps));
5211         mutex_unlock(&kps->lock);
5212         return 0;
5213 }
5214 
5215 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
5216 {
5217         int i;
5218         struct kvm_pit *pit = kvm->arch.vpit;
5219 
5220         mutex_lock(&pit->pit_state.lock);
5221         memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
5222         for (i = 0; i < 3; i++)
5223                 kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
5224         mutex_unlock(&pit->pit_state.lock);
5225         return 0;
5226 }
5227 
5228 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
5229 {
5230         mutex_lock(&kvm->arch.vpit->pit_state.lock);
5231         memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
5232                 sizeof(ps->channels));
5233         ps->flags = kvm->arch.vpit->pit_state.flags;
5234         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
5235         memset(&ps->reserved, 0, sizeof(ps->reserved));
5236         return 0;
5237 }
5238 
5239 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
5240 {
5241         int start = 0;
5242         int i;
5243         u32 prev_legacy, cur_legacy;
5244         struct kvm_pit *pit = kvm->arch.vpit;
5245 
5246         mutex_lock(&pit->pit_state.lock);
5247         prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
5248         cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
5249         if (!prev_legacy && cur_legacy)
5250                 start = 1;
5251         memcpy(&pit->pit_state.channels, &ps->channels,
5252                sizeof(pit->pit_state.channels));
5253         pit->pit_state.flags = ps->flags;
5254         for (i = 0; i < 3; i++)
5255                 kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
5256                                    start && i == 0);
5257         mutex_unlock(&pit->pit_state.lock);
5258         return 0;
5259 }
5260 
5261 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
5262                                  struct kvm_reinject_control *control)
5263 {
5264         struct kvm_pit *pit = kvm->arch.vpit;
5265 
5266         /* pit->pit_state.lock was overloaded to prevent userspace from getting
5267          * an inconsistent state after running multiple KVM_REINJECT_CONTROL
5268          * ioctls in parallel.  Use a separate lock if that ioctl isn't rare.
5269          */
5270         mutex_lock(&pit->pit_state.lock);
5271         kvm_pit_set_reinject(pit, control->pit_reinject);
5272         mutex_unlock(&pit->pit_state.lock);
5273 
5274         return 0;
5275 }
5276 
5277 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
5278 {
5279 
5280         /*
5281          * Flush all CPUs' dirty log buffers to the dirty_bitmap.  Called
5282          * before reporting dirty_bitmap to userspace.  KVM flushes the buffers
5283          * on all VM-Exits, thus we only need to kick running vCPUs to force a
5284          * VM-Exit.
5285          */
5286         struct kvm_vcpu *vcpu;
5287         int i;
5288 
5289         kvm_for_each_vcpu(i, vcpu, kvm)
5290                 kvm_vcpu_kick(vcpu);
5291 }
5292 
5293 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
5294                         bool line_status)
5295 {
5296         if (!irqchip_in_kernel(kvm))
5297                 return -ENXIO;
5298 
5299         irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
5300                                         irq_event->irq, irq_event->level,
5301                                         line_status);
5302         return 0;
5303 }
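
/*
 * kvm_vm_ioctl_irq_line() backs KVM_IRQ_LINE (and KVM_IRQ_LINE_STATUS) on the
 * VM fd and only works with an in-kernel irqchip.  A sketch that pulses a
 * GSI as an edge-triggered source would; vm_fd and the GSI are placeholders.
 */
#if 0   /* illustrative userspace sketch, not built with this file */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int pulse_gsi(int vm_fd, unsigned int gsi)
{
        struct kvm_irq_level irq = { .irq = gsi, .level = 1 };

        /* Assert... */
        if (ioctl(vm_fd, KVM_IRQ_LINE, &irq) < 0)
                return -1;

        /* ...then deassert. */
        irq.level = 0;
        return ioctl(vm_fd, KVM_IRQ_LINE, &irq);
}
#endif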
5304 
5305 int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
5306                             struct kvm_enable_cap *cap)
5307 {
5308         int r;
5309 
5310         if (cap->flags)
5311                 return -EINVAL;
5312 
5313         switch (cap->cap) {
5314         case KVM_CAP_DISABLE_QUIRKS:
5315                 kvm->arch.disabled_quirks = cap->args[0];
5316                 r = 0;
5317                 break;
5318         case KVM_CAP_SPLIT_IRQCHIP: {
5319                 mutex_lock(&kvm->lock);
5320                 r = -EINVAL;
5321                 if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
5322                         goto split_irqchip_unlock;
5323                 r = -EEXIST;
5324                 if (irqchip_in_kernel(kvm))
5325                         goto split_irqchip_unlock;
5326                 if (kvm->created_vcpus)
5327                         goto split_irqchip_unlock;
5328                 r = kvm_setup_empty_irq_routing(kvm);
5329                 if (r)
5330                         goto split_irqchip_unlock;
5331                 /* Pairs with irqchip_in_kernel. */
5332                 smp_wmb();
5333                 kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
5334                 kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
5335                 r = 0;
5336 split_irqchip_unlock:
5337                 mutex_unlock(&kvm->lock);
5338                 break;
5339         }
5340         case KVM_CAP_X2APIC_API:
5341                 r = -EINVAL;
5342                 if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
5343                         break;
5344 
5345                 if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
5346                         kvm->arch.x2apic_format = true;
5347                 if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
5348                         kvm->arch.x2apic_broadcast_quirk_disabled = true;
5349 
5350                 r = 0;
5351                 break;
5352         case KVM_CAP_X86_DISABLE_EXITS:
5353                 r = -EINVAL;
5354                 if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
5355                         break;
5356 
5357                 if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
5358                         kvm_can_mwait_in_guest())
5359                         kvm->arch.mwait_in_guest = true;
5360                 if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
5361                         kvm->arch.hlt_in_guest = true;
5362                 if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
5363                         kvm->arch.pause_in_guest = true;
5364                 if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
5365                         kvm->arch.cstate_in_guest = true;
5366                 r = 0;
5367                 break;
5368         case KVM_CAP_MSR_PLATFORM_INFO:
5369                 kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
5370                 r = 0;
5371                 break;
5372         case KVM_CAP_EXCEPTION_PAYLOAD:
5373                 kvm->arch.exception_payload_enabled = cap->args[0];
5374                 r = 0;
5375                 break;
5376         case KVM_CAP_X86_USER_SPACE_MSR:
5377                 kvm->arch.user_space_msr_mask = cap->args[0];
5378                 r = 0;
5379                 break;
5380         case KVM_CAP_X86_BUS_LOCK_EXIT:
5381                 r = -EINVAL;
5382                 if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE)
5383                         break;
5384 
5385                 if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) &&
5386                     (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
5387                         break;
5388 
5389                 if (kvm_has_bus_lock_exit &&
5390                     cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
5391                         kvm->arch.bus_lock_detection_enabled = true;
5392                 r = 0;
5393                 break;
5394         default:
5395                 r = -EINVAL;
5396                 break;
5397         }
5398         return r;
5399 }
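
/*
 * The VM-wide capabilities above are enabled the same way as the per-vcpu
 * ones, only on the VM fd.  A sketch turning off HLT intercepts through
 * KVM_CAP_X86_DISABLE_EXITS; vm_fd is a placeholder, and this is normally
 * done before vcpus are created so their intercept setup observes it.
 */
#if 0   /* illustrative userspace sketch, not built with this file */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int allow_native_hlt(int vm_fd)
{
        struct kvm_enable_cap cap;

        memset(&cap, 0, sizeof(cap));
        cap.cap = KVM_CAP_X86_DISABLE_EXITS;
        cap.args[0] = KVM_X86_DISABLE_EXITS_HLT;        /* let the guest halt natively */

        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
#endif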
5400 
5401 static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
5402 {
5403         struct kvm_x86_msr_filter *msr_filter;
5404 
5405         msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
5406         if (!msr_filter)
5407                 return NULL;
5408 
5409         msr_filter->default_allow = default_allow;
5410         return msr_filter;
5411 }
5412 
5413 static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
5414 {
5415         u32 i;
5416 
5417         if (!msr_filter)
5418                 return;
5419 
5420         for (i = 0; i < msr_filter->count; i++)
5421                 kfree(msr_filter->ranges[i].bitmap);
5422 
5423         kfree(msr_filter);
5424 }
5425 
5426 static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
5427                               struct kvm_msr_filter_range *user_range)
5428 {
5429         struct msr_bitmap_range range;
5430         unsigned long *bitmap = NULL;
5431         size_t bitmap_size;
5432         int r;
5433 
5434         if (!user_range->nmsrs)
5435                 return 0;
5436 
5437         bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
5438         if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
5439                 return -EINVAL;
5440 
5441         bitmap = memdup_user((u8 __user *)user_range->bitmap, bitmap_size);
5442         if (IS_ERR(bitmap))
5443                 return PTR_ERR(bitmap);
5444 
5445         range = (struct msr_bitmap_range) {
5446                 .flags = user_range->flags,
5447                 .base = user_range->base,
5448                 .nmsrs = user_range->nmsrs,
5449                 .bitmap = bitmap,
5450         };
5451 
5452         if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
5453                 r = -EINVAL;
5454                 goto err;
5455         }
5456 
5457         if (!range.flags) {
5458                 r = -EINVAL;
5459                 goto err;
5460         }
5461 
5462         /* Everything ok, add this range identifier. */
5463         msr_filter->ranges[msr_filter->count] = range;
5464         msr_filter->count++;
5465 
5466         return 0;
5467 err:
5468         kfree(bitmap);
5469         return r;
5470 }
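
/*
 * kvm_add_msr_filter() consumes one struct kvm_msr_filter_range handed in by
 * the KVM_X86_SET_MSR_FILTER VM ioctl; the bitmap is a userspace pointer that
 * memdup_user() above copies in.  A sketch building a single one-MSR range;
 * vm_fd and the MSR index are placeholders, and the bitmap polarity follows
 * Documentation/virt/kvm/api.rst (a set bit lets KVM handle the MSR).
 */
#if 0   /* illustrative userspace sketch, not built with this file */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int filter_one_msr(int vm_fd, __u32 msr_index)
{
        __u8 bitmap = 0x1;      /* one MSR in the range, its bit set */
        struct kvm_msr_filter filter;

        memset(&filter, 0, sizeof(filter));
        filter.flags = KVM_MSR_FILTER_DEFAULT_ALLOW;    /* MSRs outside all ranges stay allowed */
        filter.ranges[0].flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE;
        filter.ranges[0].base = msr_index;      /* range starts at this MSR index */
        filter.ranges[0].nmsrs = 1;             /* and covers exactly one MSR */
        filter.ranges[0].bitmap = &bitmap;      /* copied by the kernel during the ioctl */

        return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
}
#endif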
5471 
5472 static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
5473 {
5474         struct kvm_msr_filter __user *user_msr_filter = argp;
5475         struct kvm_x86_msr_filter *new_filter, *old_filter;
5476         struct kvm_msr_filter filter;
5477         bool default_allow;
5478         bool empty = true;
5479         int r = 0;
5480         u32 i;
5481 
5482         if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
5483                 return -EFAULT;
5484 
5485         for (i = 0; i < ARRAY_SIZE(filter.ranges); i++)
5486                 empty &= !filter.ranges[i].nmsrs;
5487 
5488         default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY);
5489         if (empty && !default_allow)
5490                 return -EINVAL;
5491 
5492         new_filter = kvm_alloc_msr_filter(default_allow);
5493         if (!new_filter)
5494                 return -ENOMEM;
5495 
5496         for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
5497                 r = kvm_add_msr_filter(new_filter, &filter.ranges[i]);
5498                 if (r) {
5499                         kvm_free